From 117885128073c9c2b32f4b33c6c79df3895b7071 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 24 Sep 2018 09:58:48 -0700 Subject: [PATCH 01/51] Get rid of most usages of Type.tensor. (#12002) Summary: 1) Most usages are replaced by at::empty. 2) native_tensor has its namespace function removed 3) Type.tensor(sizes, strides) becomes at::empty_strided(sizes, strides). Pull Request resolved: https://github.com/pytorch/pytorch/pull/12002 Differential Revision: D10007201 Pulled By: gchanan fbshipit-source-id: 5e5647c050ed2ecb87a33e0b5ce4928fa3186c34 --- aten/src/ATen/SparseTensorImpl.h | 4 +-- aten/src/ATen/TensorGeometry.cpp | 4 --- aten/src/ATen/TensorGeometry.h | 3 -- aten/src/ATen/TensorOperators.h | 6 ++-- aten/src/ATen/core/Type.h | 2 ++ aten/src/ATen/native/Activation.cpp | 4 +-- aten/src/ATen/native/BinaryOps.cpp | 8 ++--- aten/src/ATen/native/Convolution.cpp | 2 +- aten/src/ATen/native/ConvolutionTBC.cpp | 4 +-- aten/src/ATen/native/Distance.cpp | 2 +- aten/src/ATen/native/Distributions.cpp | 2 +- aten/src/ATen/native/Embedding.cpp | 6 ++-- aten/src/ATen/native/Indexing.cpp | 2 +- aten/src/ATen/native/Linear.cpp | 2 +- aten/src/ATen/native/LinearAlgebra.cpp | 8 ++--- aten/src/ATen/native/ReduceOps.cpp | 16 ++++----- aten/src/ATen/native/RoiPooling.cpp | 4 +-- aten/src/ATen/native/TensorCompare.cpp | 22 ++++++------ aten/src/ATen/native/TensorFactories.cpp | 6 ++++ aten/src/ATen/native/TensorIterator.cpp | 2 +- aten/src/ATen/native/TensorShape.cpp | 4 +-- aten/src/ATen/native/UnaryOps.cpp | 10 +++--- aten/src/ATen/native/cuda/DistanceKernel.cu | 2 +- aten/src/ATen/native/cuda/Distributions.cu | 6 ++-- aten/src/ATen/native/cuda/Dropout.cu | 2 +- aten/src/ATen/native/cuda/Embedding.cu | 2 +- aten/src/ATen/native/cuda/RoiPooling.cu | 6 ++-- aten/src/ATen/native/cuda/SpectralOps.cu | 2 +- aten/src/ATen/native/cuda/TensorCompare.cu | 2 +- aten/src/ATen/native/cuda/TensorFactories.cu | 4 +-- aten/src/ATen/native/cuda/WeightNorm.cu | 2 +- .../ATen/native/cudnn/AffineGridGenerator.cpp | 4 +-- aten/src/ATen/native/cudnn/BatchNorm.cpp | 12 +++---- aten/src/ATen/native/cudnn/Conv.cpp | 13 +++---- aten/src/ATen/native/cudnn/GridSampler.cpp | 6 ++-- aten/src/ATen/native/cudnn/LossCTC.cpp | 2 +- aten/src/ATen/native/cudnn/RNN.cpp | 36 +++++++++---------- aten/src/ATen/native/mkl/SpectralOps.cpp | 2 +- aten/src/ATen/native/mkldnn/Conv.cpp | 10 +++--- aten/src/ATen/native/native_functions.yaml | 5 ++- aten/src/ATen/native/sparse/SparseTensor.cpp | 8 ++--- .../ATen/native/sparse/SparseTensorMath.cpp | 16 ++++----- aten/src/ATen/native/sparse/SparseUtils.h | 2 +- .../native/sparse/cuda/SparseCUDATensor.cpp | 4 +-- .../sparse/cuda/SparseCUDATensorMath.cu | 2 +- aten/src/ATen/templates/NativeFunctions.h | 1 - torch/csrc/autograd/functions/tensor.cpp | 2 +- .../csrc/autograd/python_legacy_variable.cpp | 2 +- torch/csrc/cuda/comm.cpp | 2 +- torch/csrc/jit/batched/BatchTensor.cpp | 8 ++--- torch/csrc/jit/fusers/common/fused_kernel.cpp | 2 +- torch/csrc/jit/test_jit.cpp | 2 +- torch/csrc/utils/tensor_new.cpp | 6 ++-- torch/lib/c10d/ProcessGroupGloo.cpp | 2 +- torch/lib/c10d/Utils.hpp | 4 +-- 55 files changed, 154 insertions(+), 150 deletions(-) diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 42b670bea08541..9f9569ac06bbf3 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -157,11 +157,11 @@ struct AT_API SparseTensorImpl : public TensorImpl { sparseDims_ = sparseDims; denseDims_ = denseDims; - auto empty_indices = 
indices().type().tensor({sparseDims, 0}); + auto empty_indices = at::empty({sparseDims, 0}, indices().options()); std::vector values_size = {0}; auto dense_size = sizes().slice(sparseDims); values_size.insert(values_size.end(), dense_size.begin(), dense_size.end()); - auto empty_values = values().type().tensor(values_size); + auto empty_values = at::empty(values_size, values().options()); set_indices_and_values_unsafe(empty_indices, empty_values); refresh_numel(); } diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index b11c7bb159900b..20ab6bb6690c5d 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -12,8 +12,4 @@ bool TensorGeometry::is_contiguous() const { return at::geometry_is_contiguous(sizes_, strides_); } -Tensor TensorGeometry::zeros_with_stride(const Type& type) const { - return type.tensor(sizes_, strides_).zero_(); -} - } // namespace at diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 5f441ed8fa71cf..9e7c6f6b440e57 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -30,9 +30,6 @@ struct AT_API TensorGeometry { // true if the tensor is contiguous bool is_contiguous() const; - // creates a new tensor with the sizes and strides of the source - Tensor zeros_with_stride(const Type& type) const; - int64_t dim() const { return sizes_.size(); } int64_t size(int64_t dim) const { dim = maybe_wrap_dim(dim, this->dim()); diff --git a/aten/src/ATen/TensorOperators.h b/aten/src/ATen/TensorOperators.h index 57a986b5d46f70..f4bdab0bf35d7d 100644 --- a/aten/src/ATen/TensorOperators.h +++ b/aten/src/ATen/TensorOperators.h @@ -68,9 +68,9 @@ inline Tensor Tensor::operator[](int64_t index) const { #define AT_FORALL_BINARY_OPS(_) \ _(+,x.add(y), y.add(x)) \ _(*,x.mul(y), y.mul(x)) \ -_(-,x.sub(y), y.type().tensor().resize_(y.sizes()).fill_(x).sub_(y)) \ -_(/,x.div(y), y.type().tensor().resize_(y.sizes()).fill_(x).div_(y)) \ -_(%,x.remainder(y), y.type().tensor().resize_(y.sizes()).fill_(x).remainder_(y)) \ +_(-,x.sub(y), ::at::empty(y.sizes(), y.options()).fill_(x).sub_(y)) \ +_(/,x.div(y), ::at::empty(y.sizes(), y.options()).fill_(x).div_(y)) \ +_(%,x.remainder(y), ::at::empty(y.sizes(), y.options()).fill_(x).remainder_(y)) \ _(<,x.lt(y), y.gt(x)) \ _(<=,x.le(y), y.ge(x)) \ _(>,x.gt(y),y.lt(x)) \ diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index fdec4c6408d4c2..2d19e0de588416 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -579,6 +579,8 @@ struct AT_API Type { virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor native_tensor() const = 0; + virtual Tensor native_tensor(IntList size) const = 0; virtual Tensor tensor() const = 0; virtual Tensor tensor(IntList size) const = 0; virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 30ccc616c6e7c0..84f83946094c8a 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -37,11 +37,11 @@ Tensor & celu_(Tensor & self, Scalar alpha) { } Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { - return 
at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); + return at::rrelu_with_noise(self, at::empty({0}, self.options()), lower, upper, training, generator); } Tensor & rrelu_(Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { - return at::rrelu_with_noise_(self, self.type().tensor(), lower, upper, training, generator); + return at::rrelu_with_noise_(self, at::empty({0}, self.options()), lower, upper, training, generator); } // ----------------------------------- diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 517d00164e37f1..5ddb36bb5b4ddf 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -16,7 +16,7 @@ DEFINE_DISPATCH(div_stub); Tensor& add_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { if (other.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } if (self.is_sparse()) { at::_sparse_add_out(result, self, other, alpha); @@ -44,7 +44,7 @@ Tensor& add_(Tensor& self, const Tensor& other, Scalar alpha) { Tensor& div_out(Tensor& result, const Tensor& self, const Tensor& other) { if (self.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } if (other.dim() != 0) { AT_ERROR("div(): sparse division only supports division by a scalar ", @@ -69,7 +69,7 @@ Tensor& div_(Tensor& self, const Tensor& other) { Tensor& mul_out(Tensor& result, const Tensor& self, const Tensor& other) { if (self.is_sparse() || other.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } return at::_sparse_mul_out(result, self, other); } @@ -90,7 +90,7 @@ Tensor& mul_(Tensor& self, const Tensor& other) { Tensor& sub_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { if (other.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } if (!self.sizes().equals(other.sizes())) { AT_ERROR("sizes do not match"); diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 96ddb5ae3928b1..171f892aef2144 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -316,7 +316,7 @@ at::Tensor _convolution( weight = view4d(weight); } - auto output = input.type().tensor(); + auto output = at::empty({0}, input.options()); if (params.is_depthwise(input, weight)) { /* output.resize_(output_size(input, weight)); */ diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index 0c2ac96dce8066..8b9779313bf89e 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -33,11 +33,11 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in "the weight tensor (output channels)."); // input * weights + bias -> output_features - Tensor output = self.type().tensor({ + Tensor output = at::empty({ olen, input_size[1], weight_size[2], - }); + }, self.options()); output.copy_(bias.expand(output.sizes())); for (int k = 0; k < kw; k++) { int iShift = std::max(0, static_cast(k - real_pad)); diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 08f306869d89f2..f075269291d642 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -26,7 +26,7 @@ Tensor 
_pdist_forward(const Tensor& self, const double p) { AT_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); auto device = self.type().device_type(); AT_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); if (self.size(0) <= 1) { result.resize_({0}); } else { diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 3a2d1da5bd9a5a..9810c9128980e9 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -173,7 +173,7 @@ Tensor& bernoulli_scalar_cpu_(Tensor& self, double p, Generator* gen) { Tensor _standard_gamma_grad_cpu(const Tensor& self, const Tensor& output) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_FLOATING_TYPES(self.type(), "_standard_gamma_grad", [&] { CPU_tensor_apply3(ret, self, output, [](scalar_t& ret_val, const scalar_t& self_val, const scalar_t &output_val) { diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 99fa4c701d4bbf..761d16b3d1ba03 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -66,12 +66,12 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; - auto& dense_type = grad.type(); + auto dense_options = grad.options(); // check if all our grad come from padding_idx if (grad.numel() == 0) { - return at::_sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), - dense_type.tensor({0, num_features}), + return at::_sparse_coo_tensor_unsafe(at::empty({1, 0}, indices_.options()), + at::empty({0, num_features}, dense_options), weight_size); } diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 5566fd397320aa..90fdf7b80a6c9a 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -73,7 +73,7 @@ static std::vector expandByteTensors(const Tensor & self, TensorList ind if (special_empty) { // We can't call select on an empty tensor so we just create an empty // tensor. - result.emplace_back(nonzero.type().tensor()); + result.emplace_back(at::empty({0}, nonzero.options())); } else { result.emplace_back(nonzero.select(1, j)); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index bdf9602fe9ae00..7b0d89d4d5675d 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -404,7 +404,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, int64_t slicemul2 = (expand2[unroll_dim] ? 0 : 1); int64_t slicemul3 = (expand3[unroll_dim] ? 0 : 1); - auto output = i1.type().tensor(output_size).zero_(); + auto output = at::zeros(output_size, i1.options()); if (! 
sumdim[unroll_dim]) { for (int64_t k = 0; k < unroll_size; k++) { Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k * slicemul1, 1), diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 04bf617081387b..1ec850ce9a5926 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -88,7 +88,7 @@ std::tuple slogdet(const Tensor& self) { } Tensor inverse(const Tensor& self) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::inverse_out(result, self); } @@ -111,7 +111,7 @@ Tensor pinverse(const Tensor& self, double rcond) { "of floating types"); if (self.numel() == 0) { // Match NumPy - return self.type().tensor({self.size(1), self.size(0)}); + return at::empty({self.size(1), self.size(0)}, self.options()); } Tensor U, S, V; std::tie(U, S, V) = self.svd(); @@ -345,7 +345,7 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& Tensor baddbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::baddbmm_out_cpu(result, self, batch1, batch2, beta, alpha); } @@ -362,7 +362,7 @@ Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, S } Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::bmm_out_cpu(result, self, mat2); } diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index c976121e77ae3f..9e61db8543fbcf 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -321,7 +321,7 @@ Tensor sum(const Tensor& self, IntList dim, ScalarType dtype) { Tensor _sum(const Tensor &self, int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::_sum_out(result, self, dim, keepdim); } @@ -343,7 +343,7 @@ Tensor prod(const Tensor& self, int64_t dim, ScalarType dtype) { Tensor _prod(const Tensor &self, int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::_prod_out(result, self, dim, keepdim); } @@ -365,7 +365,7 @@ Tensor& logsumexp_out(Tensor& result, const Tensor &self, int64_t dim_, bool kee Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::logsumexp_out(result, self, dim, keepdim); } @@ -639,7 +639,7 @@ Tensor _norm(const Tensor &self, Scalar p) { } Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::norm_out(result, self, p, dim, keepdim); } @@ -648,7 +648,7 @@ Tensor norm(const Tensor& self, Scalar p) { } Tensor all(const Tensor& self, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::all_out(result, self, dim, keepdim); } @@ -665,7 +665,7 @@ Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { } Tensor any(const Tensor& 
self, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::any_out(result, self, dim, keepdim); } @@ -690,7 +690,7 @@ Tensor var(const Tensor& self, bool unbiased) { } Tensor var(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::var_out(result, self, dim, unbiased, keepdim); } @@ -715,7 +715,7 @@ Tensor std(const Tensor& self, bool unbiased) { } Tensor std(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::std_out(result, self, dim, unbiased, keepdim); } diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp index 1a089a9f473c17..26aeee9caf7194 100644 --- a/aten/src/ATen/native/RoiPooling.cpp +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -28,13 +28,13 @@ std::tuple RoiPooling2d_forward_cpu( auto inputWidth = input.size(3); // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) - auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto output = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options()); // TODO: need some mechanism for determining train vs. test // During training, we need to store the argmaxes for the pooling operation, so // the argmaxes Tensor should be the same size as the output Tensor - auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto argmaxes = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options().dtype(kInt)); AT_CHECK(input.is_contiguous(), "input must be contiguous"); AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 40c4ce39addeb4..b215180d746e0a 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -85,7 +85,7 @@ Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { } Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& other) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_ALL_TYPES(ret.type(), "where", [&] { where_cpu(ret, condition, self, other); }); @@ -93,8 +93,8 @@ Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& o } std::tuple kthvalue(const Tensor& self, int64_t k, int64_t dim, bool keepdim) { - Tensor values = self.type().tensor(); - Tensor indices = self.type().toScalarType(kLong).tensor(); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); return at::native::kthvalue_out(values, indices, self, k, dim, keepdim); } @@ -113,8 +113,8 @@ std::tuple kthvalue_out(Tensor& values, Tensor& indices, } std::tuple median(const Tensor& self, int64_t dim, bool keepdim) { - Tensor values = self.type().tensor(); - Tensor indices = self.type().toScalarType(kLong).tensor(); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); return at::native::median_out(values, indices, self, dim, keepdim); } @@ -133,8 +133,8 @@ std::tuple median_out(Tensor& values, Tensor& indices, } std::tuple mode(const Tensor& self, int64_t dim, bool 
keepdim) { - Tensor values = self.type().tensor(); - Tensor indices = self.type().toScalarType(kLong).tensor(); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); return at::native::mode_out(values, indices, self, dim, keepdim); } @@ -168,8 +168,8 @@ std::tuple _max_out_cpu(Tensor& max, Tensor& max_indices, } std::tuple max(const Tensor& self, int64_t dim, bool keepdim) { - Tensor max = self.type().tensor(); - Tensor max_indices = self.type().toScalarType(kLong).tensor(); + Tensor max = at::empty({0}, self.options()); + Tensor max_indices = at::empty({0}, self.options().dtype(kLong)); return at::native::max_out(max, max_indices, self, dim, keepdim); } @@ -211,8 +211,8 @@ std::tuple _min_out_cpu(Tensor& min, Tensor& min_indices, } std::tuple min(const Tensor& self, int64_t dim, bool keepdim) { - Tensor min = self.type().tensor(); - Tensor min_indices = self.type().toScalarType(kLong).tensor(); + Tensor min = at::empty({0}, self.options()); + Tensor min_indices = at::empty({0}, self.options().dtype(kLong)); return at::native::min_out(min, min_indices, self, dim, keepdim); } diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 178045d9fd0de4..2e37acc951a61e 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -118,6 +118,12 @@ Tensor& empty_out(Tensor& result, IntList size) { return result; } +Tensor empty_strided(IntList size, IntList stride, const TensorOptions& options) { + // Note [Native bindings for legacy TH factory functions] + return getFactoryType(options).tensor(size, stride); +} + + // Temporary type cast operators. These are needed to trace type-casts now since // Type's are not supported in the IR. Instead, we call down to these // specialized operators for each datatype. 
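Taken together, the replacements in the hunks above and below follow one pattern, summarized in the short sketch here (illustrative only: the function name, the tensor t, and the concrete sizes/strides are placeholders rather than code from this patch):

    #include <ATen/ATen.h>

    // Sketch of the Type.tensor -> at::empty / at::empty_strided migration.
    void migration_sketch() {
      at::Tensor t = at::ones({2, 3});
      auto a = at::empty({0}, t.options());                      // was: t.type().tensor()
      auto b = at::empty({4, 5}, t.options());                   // was: t.type().tensor({4, 5})
      auto c = at::empty_strided({4, 5}, {5, 1}, t.options());   // was: t.type().tensor({4, 5}, {5, 1})
      auto d = at::empty({4, 5}, t.options().dtype(at::kLong));  // was: t.type().toScalarType(kLong).tensor({4, 5})
    }

In each case the TensorOptions of an existing tensor (or of the relevant Type) carry the backend, dtype, and device that the old Type-based factory supplied implicitly.
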
diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index c3535a92a05723..97645c0d0256c5 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -153,7 +153,7 @@ void TensorIterator::allocate_outputs() { for (int dim = 0; dim < ndim(); dim++) { tensor_stride[dim] /= element_size; } - *op.tensor = op.type->tensor(tensor_shape, tensor_stride); + *op.tensor = at::empty_strided(tensor_shape, tensor_stride, op.type->options()); } } } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 634e7a443d21fd..c470f554c14234 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -133,7 +133,7 @@ Tensor expand_as(const Tensor& self, const Tensor& other) { } Tensor as_strided(const Tensor& self, IntList size, IntList stride, int64_t storage_offset) { - return self.type().tensor().set_(self.storage(), storage_offset, size, stride); + return at::empty({0}, self.options()).set_(self.storage(), storage_offset, size, stride); } Tensor &as_strided_(Tensor& self, IntList size, IntList stride, int64_t storage_offset) { @@ -196,7 +196,7 @@ Tensor repeat(const Tensor& self, IntList repeats) { Tensor xtensor = self.expand(padded_size); - Tensor result = self.type().tensor(target_size); + Tensor result = at::empty(target_size, self.options()); Tensor urtensor = at::alias(result); for (int64_t i = 0; i < xtensor.dim(); ++i) { // can't unfold with step 0, so make sure step is at least 1 diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 89a13e14b8b2e8..a8955a976828bf 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -31,17 +31,17 @@ namespace at { namespace native { Tensor clamp(const Tensor& self, Scalar min, Scalar max) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return clamp_out(result, self, min, max); } Tensor clamp_max(const Tensor& self, Scalar max) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return clamp_max_out(result, self, max); } Tensor clamp_min(const Tensor& self, Scalar min) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return clamp_min_out(result, self, min); } @@ -123,7 +123,7 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { #define IMPLEMENT_UNARY_OP_VEC(op) \ Tensor op(const Tensor& self) { \ - Tensor result = self.type().tensor(); \ + Tensor result = at::empty({0}, self.options()); \ return at::op##_out(result, self); \ } \ Tensor& _##op##__cpu(Tensor& self_) { \ @@ -143,7 +143,7 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { #define IMPLEMENT_UNARY_OP_TH(op) \ Tensor op(const Tensor& self) { \ - Tensor result = self.type().tensor(); \ + Tensor result = at::empty({0}, self.options()); \ return at::op##_out(result, self); \ } \ Tensor& _##op##__cpu(Tensor& self) { \ diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 02c143254ced76..f6128389f16f25 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -192,7 +192,7 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor const dim3 grid(grid_x, grid_y); const dim3 block(block_x, block_y); - Tensor buffer = result.type().tensor({n - 1, result.size(0), result.size(1)}); + Tensor buffer = at::empty({n - 1, result.size(0), 
result.size(1)}, result.options()); AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] { if (p == 1.0) { pdist_backward_kernel_cuda_impl::one><<>>(buffer.data(), grad.data(), self.data(), dist.data(), grad.stride(0), n, m, dist.numel(), p); diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index fc908714f18f28..50ea3a9bf32b20 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -182,7 +182,7 @@ void bernoulli_scalar_cuda_kernel( namespace at { namespace native { Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) { - Tensor ret = lambda.type().tensor(lambda.sizes()); + Tensor ret = at::empty(lambda.sizes(), lambda.options()); AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "poisson", [&] { poisson_cuda_kernel(ret, lambda, next_philox_seed(gen, 20)); }); @@ -190,7 +190,7 @@ Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) { } Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) { - Tensor ret = alpha.type().tensor(alpha.sizes()); + Tensor ret = at::empty(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "gamma", [&] { gamma_cuda_kernel(ret, alpha, next_philox_seed(gen, 10)); }); @@ -198,7 +198,7 @@ Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) { } Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "_standard_gamma_grad", [&] { gamma_grad_cuda_kernel(ret, self, output); }); diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 2d133a70dc23b6..6976565de059a9 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -97,7 +97,7 @@ void masked_scale_kernel(at::Tensor& ret, const at::Tensor src, const at::Tensor std::tuple fused_dropout_cuda(const Tensor& self, double p, Generator * gen){ Tensor ret = at::empty_like(self); - Tensor mask = self.type().toScalarType(kByte).tensor(self.sizes()); + Tensor mask = at::empty(self.sizes(), self.options().dtype(kByte)); const int64_t nelem = self.numel(); const int64_t block_size = 256; unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor/block_size; diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 27b079fe219e2e..ddc01923859b1c 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -349,7 +349,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, // FIXME: thrust::unique only removes consecutive elements that are equal. 
// We have race conditions when indices contain duplicates which are not // adjacent - auto unique_indices = indices.type().tensor(indices.numel()); + auto unique_indices = at::empty(indices.numel(), indices.options()); auto unique_data = device_ptr(unique_indices.data()); auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data); auto num_unique_indices = static_cast(end - unique_data); diff --git a/aten/src/ATen/native/cuda/RoiPooling.cu b/aten/src/ATen/native/cuda/RoiPooling.cu index 0fd3f1d6efd153..6c0a90d4c2f481 100644 --- a/aten/src/ATen/native/cuda/RoiPooling.cu +++ b/aten/src/ATen/native/cuda/RoiPooling.cu @@ -122,13 +122,13 @@ std::tuple RoiPooling2d_forward_cuda( auto inputWidth = input.size(3); // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) - auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto output = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options()); // TODO: need some mechanism for determining train vs. test // During training, we need to store the argmaxes for the pooling operation, so // the argmaxes Tensor should be the same size as the output Tensor - auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto argmaxes = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options().dtype(kInt)); AT_CHECK(input.is_contiguous(), "input must be contiguous"); AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); @@ -198,7 +198,7 @@ Tensor RoiPooling2d_backward_cuda( auto inputHeight = input.size(2); auto inputWidth = input.size(3); - auto gradInput = input.type().tensor(input.sizes()); + auto gradInput = at::empty(input.sizes(), input.options()); dim3 block(512); dim3 grid((gradInput.numel() + 512 - 1) / 512); diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 38b1dddb496276..51ab68a4f78f15 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -184,7 +184,7 @@ static inline Tensor _run_cufft( auto& ctx = at::globalContext(); // set output - auto output = input.type().tensor(output_sizes); + auto output = at::empty(output_sizes, input.options()); // set to current stream CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); diff --git a/aten/src/ATen/native/cuda/TensorCompare.cu b/aten/src/ATen/native/cuda/TensorCompare.cu index 8e0cf4e1b76c64..8f99241ca35a92 100644 --- a/aten/src/ATen/native/cuda/TensorCompare.cu +++ b/aten/src/ATen/native/cuda/TensorCompare.cu @@ -32,7 +32,7 @@ Tensor _s_where_cuda( const Tensor& condition, const Tensor& self, const Tensor& other) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_ALL_TYPES_AND_HALF(ret.type(), "where", [&] { where_cuda(ret, condition, self, other); }); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 309b54a299caaa..cbddd0ae87a131 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -49,14 +49,14 @@ Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { result.copy_(randperm_out_cuda(result_float, n, generator)); } else { if (n < 30000) { // For small inputs, we offload it to CPU instead. 
- auto result_cpu = result.type().cpu().tensor({n}); + auto result_cpu = at::empty({n}, result.options().device(kCPU)); randperm_out(result_cpu, n, generator); result.copy_(result_cpu); } else { // Generate random values for the keys array AT_DISPATCH_ALL_TYPES( result.type(), "randperm_out_cuda", [&] { - auto keys = result.type().tensor(result.sizes()).random_(generator); + auto keys = at::empty(result.sizes(), result.options()).random_(generator); auto result_data = thrust::device_ptr(result.data()); auto keys_data = thrust::device_ptr(keys.data()); diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index 67d8f39e2de71d..5700ca559f0fea 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -329,7 +329,7 @@ std::tuple weight_norm_cuda at::ScalarType::Float : g.type().scalarType(); // Will this create norms on the same device as g, regardless of what the thread's default // current device is? I believe so, because Type::* functions are DeviceGuard()ed. - auto norms = g.type().toScalarType(AccType).tensor(g.sizes(), g.strides()); + auto norms = at::empty_strided(g.sizes(), g.strides(), g.options().dtype(AccType)); const int ndims = v.dim(); diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index 463d4ffea3cf04..a12df78c767e2d 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -59,7 +59,7 @@ Tensor cudnn_affine_grid_generator_forward( checkContiguous(c, theta); checkSize(c, theta, {N, 2, 3}); - auto grid_t = theta->type().tensor(); + auto grid_t = at::empty({0}, theta->options()); grid_t.resize_({N, H, W, 2}); auto dataType = getCudnnDataType(*theta); @@ -82,7 +82,7 @@ Tensor cudnn_affine_grid_generator_backward( checkContiguous(c, grad_grid); checkSize(c, grad_grid, {N, H, W, 2}); - auto grad_theta_t = grad_grid->type().tensor(); + auto grad_theta_t = at::empty({0}, grad_grid->options()); grad_theta_t.resize_({N, 2, 3}); auto dataType = getCudnnDataType(grad_theta_t); diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index d54fe256b29152..427f7e00d9d909 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -94,7 +94,7 @@ std::tuple cudnn_batch_norm( #endif } - auto output_t = input->type().tensor(input->sizes()); + auto output_t = at::empty(input->sizes(), input->options()); TensorArg output{ output_t, "output", 0 }; auto handle = getCudnnHandle(); @@ -108,8 +108,8 @@ std::tuple cudnn_batch_norm( if (training) { int64_t num_features = input_t.size(1); - save_mean = weight_t.type().tensor({ num_features }); - save_var = weight_t.type().tensor({ num_features }); + save_mean = at::empty({ num_features }, weight_t.options()); + save_var = at::empty({ num_features }, weight_t.options()); AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTraining( handle, mode, &one, &zero, idesc.desc(), input->data_ptr(), @@ -190,9 +190,9 @@ std::tuple cudnn_batch_norm_backward( #endif } - auto grad_input_t = input->type().tensor(input->sizes()); - auto grad_weight_t = weight->type().tensor(weight->sizes()); - auto grad_bias_t = weight->type().tensor(weight->sizes()); + auto grad_input_t = at::empty(input->sizes(), input->options()); + auto grad_weight_t = at::empty(weight->sizes(), weight->options()); + auto grad_bias_t = at::empty(weight->sizes(), weight->options()); auto handle = 
getCudnnHandle(); auto dataType = getCudnnDataType(*input); diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index afbd7653aefa67..9638740c24a6ac 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -836,9 +836,10 @@ Tensor cudnn_convolution_forward( checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto output_t = input->type().tensor( + auto output_t = at::empty( conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation, groups)); + padding, stride, dilation, groups), + input->options()); // Avoid ambiguity of "output" when this is being used as backwards TensorArg output{ output_t, "result", 0 }; @@ -976,7 +977,7 @@ Tensor cudnn_convolution_backward_input( checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto grad_input_t = grad_output->type().tensor(input_size); + auto grad_input_t = at::empty(input_size, grad_output->options()); // Avoid "grad_input" when this is being used as transposed convolution TensorArg grad_input{ grad_input_t, "result", 0 }; @@ -1111,7 +1112,7 @@ Tensor cudnn_convolution_backward_weight( checkAllSameType(c, {grad_output, input}); checkAllSameGPU(c, {grad_output, input}); - auto grad_weight_t = grad_output->type().tensor(weight_size); + auto grad_weight_t = at::empty(weight_size, grad_output->options()); // For uniformity with everything else, although it seems grad_weight // would be unambiguous too. @@ -1179,8 +1180,8 @@ Tensor cudnn_convolution_backward_bias( TensorArg grad_output{ grad_output_t, "grad_output", 1 }; setCuDNNStreamToCurrent(); - auto grad_bias_t = grad_output->type().tensor( - { grad_output->size(output_channels_dim) }); + auto grad_bias_t = at::empty( + { grad_output->size(output_channels_dim) }, grad_output->options()); TensorArg grad_bias{ grad_bias_t, "result", 0 }; diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index e859344bcc3691..f9b77810365204 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -75,7 +75,7 @@ Tensor cudnn_grid_sampler_forward( checkGridSize(c, grid, input); checkDim(c, input, 4); - auto output_t = input->type().tensor(); + auto output_t = at::empty({0}, input->options()); output_t.resize_({input->size(0), input->size(1), grid->size(1), grid->size(2)}); TensorDescriptor idesc{ *input }; // input descriptor @@ -114,9 +114,9 @@ std::tuple cudnn_grid_sampler_backward( checkDim(c, input, 4); checkDim(c, grad_output, 4); - auto grad_input_t = input->type().tensor(); + auto grad_input_t = at::empty({0}, input->options()); grad_input_t.resize_(input->sizes()); - auto grad_grid_t = grid->type().tensor(); + auto grad_grid_t = at::empty({0}, grid->options()); grad_grid_t.resize_(grid->sizes()); TensorDescriptor idesc{ *input }; // input descriptor diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index 98c0cb7918f02f..28fd81f9a9a998 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -75,7 +75,7 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens algo, ctc_loss_desc.desc(), &workspace_size)); - Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? 
+ Tensor workspace = at::empty(workspace_size, log_probs->options().dtype(kByte)); Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 09c9365793ec75..5acdab7717b61f 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -464,7 +464,7 @@ namespace { mat_numel * num_linear_layers / 2, 1}; // Generate a new parameter tensor which is a view into the // weight_buf. - Tensor param = weight_buf.type().tensor().set_(weight_buf.storage(), offset, size); + Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); params.emplace_back(std::move(param)); layer_params_count++; } else { @@ -616,7 +616,7 @@ Tensor _cudnn_rnn_flatten_weight( x_desc.set(getCudnnDataType(any_param), x_geom.sizes(), x_geom.strides(), 5); auto num_weights = get_num_weights(handle, rnn_desc, x_desc, rnn.datatype); - auto weight_buf = any_param.type().tensor(num_weights).zero_(); + auto weight_buf = at::zeros(num_weights, any_param.options()); FilterDescriptor w_desc; w_desc.set(weight_buf, 3); @@ -691,13 +691,13 @@ std::tuple _cudnn_rnn( "rnn: cx is not contiguous"); auto x = input.contiguous(); - auto output = input.type().tensor(output_size); - auto hy = hx.type().tensor(hidden_size); + auto output = at::empty(output_size, input.options()); + auto hy = at::empty(hidden_size, hx.options()); Tensor cy; if (cx.defined()) { - cy = cx.type().tensor(hidden_size); + cy = at::empty(hidden_size, cx.options()); } else { - cy = hx.type().tensor(); // NB: Not allowed to return undefined tensors + cy = at::empty({0}, hx.options()); // NB: Not allowed to return undefined tensors } auto y = output; @@ -709,7 +709,7 @@ std::tuple _cudnn_rnn( FilterDescriptor w_desc; if (!weight_buf.defined()) { auto num_weights = get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], fn.rnn.datatype); - weight_buf = x.type().tensor(num_weights); + weight_buf = at::empty(num_weights, x.options()); w_desc.set(weight_buf, 3); weight_buf.zero_(); std::vector params; @@ -734,7 +734,7 @@ std::tuple _cudnn_rnn( x_descs_arr.data(), &workspace_size )); - Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); Tensor reserve; // NB: Previously, the test was for fn.requires_grad, but we don't have @@ -748,7 +748,7 @@ std::tuple _cudnn_rnn( x_descs_arr.data(), &reserve_size )); - reserve = input.type().toScalarType(kByte).tensor(reserve_size); + reserve = at::empty(reserve_size, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNForwardTraining( handle, descs.rnn_desc.desc(), @@ -764,7 +764,7 @@ std::tuple _cudnn_rnn( reserve.data_ptr(), reserve.size(0) )); } else { // inference - reserve = input.type().toScalarType(kByte).tensor(); + reserve = at::empty({0}, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNForwardInference( handle, descs.rnn_desc.desc(), @@ -836,12 +836,12 @@ std::tuple _cudnn_rnn_backward_input( auto dy = grad_output.contiguous(); auto y = output; auto w = weight_buf; - auto dx = input.type().tensor(input.sizes()); // TODO: more compact way of saying this + auto dx = at::empty(input.sizes(), input.options()); // TODO: more compact way of saying this auto dhy = grad_hy.contiguous().view(hidden_size); auto dcy = grad_cy.defined() ? 
grad_cy.contiguous().view(hidden_size) : Tensor(); - auto dhx = hx.type().tensor(hidden_size); + auto dhx = at::empty(hidden_size, hx.options()); AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); - auto dcx = cx.defined() ? cx.type().tensor(hidden_size) : Tensor(); + auto dcx = cx.defined() ? at::empty(hidden_size, cx.options()) : Tensor(); AT_CHECK(fn_train, "cudnn RNN backward can only be called in training mode"); @@ -881,7 +881,7 @@ std::tuple _cudnn_rnn_backward_input( &workspace_size )); // TODO: put this in the correct device??? - Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNBackwardData( handle, @@ -965,7 +965,7 @@ std::vector _cudnn_rnn_backward_weight( auto x = input.contiguous(); const auto& y = output; - auto dw = weight_buf.type().tensor(weight_buf.sizes()).zero_(); + auto dw = at::zeros(weight_buf.sizes(), weight_buf.options()); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors); fn.rnn.set_algo(algo); @@ -984,7 +984,7 @@ std::vector _cudnn_rnn_backward_weight( x_descs_arr.data(), &workspace_size )); - Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNBackwardWeights( handle, @@ -1001,7 +1001,7 @@ std::vector _cudnn_rnn_backward_weight( std::vector grad_weight_arr; grad_weight_arr.reserve( weight.numel() ); for (const auto& w : weight_arr) { - grad_weight_arr.emplace_back(w.type().tensor(w.sizes()).zero_()); + grad_weight_arr.emplace_back(at::zeros(w.sizes(), w.options())); } std::vector grad_params_arr; @@ -1155,7 +1155,7 @@ Tensor try_get_weight_buf( // Try to get parameter storage auto & any_param = parameters.at(0); auto param_storage = any_param.storage(); - auto weight_buf = any_param.type().tensor().set_(param_storage); + auto weight_buf = at::empty({0}, any_param.options()).set_(param_storage); if (weight_buf.size(0) < num_params) { return {}; } else if (weight_buf.size(0) > num_params) { diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 2c81d69d3b8435..1d92de58bb7ec0 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -207,7 +207,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, onumel *= osize; } } - Tensor output = input.type().tensor(output_sizes); + Tensor output = at::empty(output_sizes, input.options()); // precision DFTI_CONFIG_VALUE prec; diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index ddbd6977645e74..adfe15decbc9bb 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -70,8 +70,8 @@ at::Tensor mkldnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, IntList padding, IntList stride, IntList dilation, int64_t groups) { - auto output = input.type().tensor(conv_output_size( - input.sizes(), weight.sizes(), padding, stride, dilation, groups)); + auto output = at::empty(conv_output_size( + input.sizes(), weight.sizes(), padding, stride, dilation, groups), input.options()); auto cpu_engine = CpuEngine::Instance().get_engine(); @@ -182,7 +182,7 @@ Tensor mkldnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t 
groups, bool bias_defined) { - auto grad_input = grad_output.type().tensor(input_size); + auto grad_input = at::empty(input_size, grad_output.options()); auto cpu_engine = CpuEngine::Instance().get_engine(); @@ -294,11 +294,11 @@ std::tuple mkldnn_convolution_backward_weights( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - auto grad_weight = grad_output.type().tensor(weight_size); + auto grad_weight = at::empty(weight_size, grad_output.options()); Tensor grad_bias; if (bias_defined) { - grad_bias = grad_output.type().tensor({grad_output.size(1)}); + grad_bias = at::empty({grad_output.size(1)}, grad_output.options()); } auto cpu_engine = CpuEngine::Instance().get_engine(); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f54c9110c21f2b..37451065261c81 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -647,6 +647,8 @@ - func: empty_like(Tensor self, *, TensorOptions options) -> Tensor +- func: empty_strided(IntList size, IntList stride, *, TensorOptions options={}) -> Tensor + - func: erf(Tensor self) -> Tensor variants: function, method @@ -1887,11 +1889,13 @@ - func: native_tensor(Type self_ty) -> Tensor + variants: [] dispatch: SparseCPU: new_sparse SparseCUDA: new_sparse - func: native_tensor(Type self_ty, IntList size) -> Tensor + variants: [] dispatch: SparseCPU: new_with_size_sparse SparseCUDA: new_with_size_sparse @@ -1941,7 +1945,6 @@ - func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor variants: [] - - func: _native_sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor variants: [] dispatch: diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 49efed2a1e066b..83aee52cf81021 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -286,8 +286,8 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { SparseTensor dst = new_sparse(self.type()); _get_sparse_impl(dst)->resize_(sparseDims, denseDims, self.sizes()); // TODO: is there a more idiomatic way to do this? 
- LongTensor newIndices = indices.type().tensor(indices.sizes()); - Tensor newValues = values.type().tensor(values.sizes()); + LongTensor newIndices = at::empty(indices.sizes(), indices.options()); + Tensor newValues = at::empty(values.sizes(), values.options()); _alias_into_sparse(dst, newIndices, newValues); LongTensor indicesBuffer; @@ -348,7 +348,7 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse int64_t sparseDims = mask._sparseDims(); LongTensor mask_indices = mask._indices(); Tensor mask_values = mask._values(); - Tensor r_values = r._values().type().tensor(mask_values.sizes()); + Tensor r_values = at::empty(mask_values.sizes(), r._values().options()); _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); @@ -392,7 +392,7 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse } SparseTensor sparse_mask_cpu(const Tensor& t, SparseTensorRef mask) { - SparseTensor r = t.type().toSparse().tensor(); + SparseTensor r = at::empty({0}, t.options().layout(kSparse)); sparse_mask_out_cpu(r, t, mask.tref); return r; } diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index ec074b5a6c8a88..8a8668fc48b8a1 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -135,7 +135,7 @@ SparseTensor& pow_out_sparse_scalar(SparseTensor& r, const SparseTensor& t_, Sca } SparseTensor pow_sparse_scalar(const SparseTensor& t, Scalar value) { - SparseTensor r = t.type().tensor(); + SparseTensor r = at::empty({0}, t.options()); pow_out_sparse_scalar(r, t, value); return r; } @@ -208,7 +208,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S Tensor t_values = t._values(); LongTensor src_indices = src._indices(); Tensor s_values = src._values(); - LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + LongTensor r_indices = at::empty({sparseDims, max_nnz}, t_indices.options()); Tensor r_values = _new_values_with_size_of(s_values, max_nnz).zero_(); r.resize_as_(src); _get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); @@ -387,7 +387,7 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor Tensor t_values = t._values(); LongTensor src_indices = src._indices(); Tensor s_values = src._values(); - LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + LongTensor r_indices = at::empty({sparseDims, max_nnz}, t_indices.options()); Tensor r_values = _new_values_with_size_of(t_values, max_nnz).zero_(); r.resize_as_(src); _get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); @@ -570,7 +570,7 @@ Tensor s_addmm_sparse_dense_cpu( Scalar beta, Scalar alpha ) { - Tensor r = t.type().tensor(); + Tensor r = at::empty({0}, t.options()); s_addmm_out_sparse_dense_cpu(r, t, sparse, dense, beta, alpha); return r; } @@ -646,7 +646,7 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, } int64_t outNnz = i + 1; indices.resize_({1, outNnz}); - Tensor values = dense.type().tensor({outNnz, n}); + Tensor values = at::empty({outNnz, n}, dense.options()); std::vector new_size = _get_sparse_impl(newSparse)->sizes().vec(); new_size[0] = outNnz; @@ -660,7 +660,7 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, } SparseTensor hspmm_sparse_cpu(const SparseTensor& sparse, const 
Tensor& dense) { - SparseTensor r = sparse.type().tensor(); + SparseTensor r = at::empty({0}, sparse.options()); hspmm_out_sparse_cpu(r, sparse, dense); return r; } @@ -787,7 +787,7 @@ Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, // sparse, dense -> sparse Tensor smm(const Tensor& self, const Tensor& mat2) { - auto result = self.type().tensor(); + auto result = at::empty({0}, self.options()); at::sspaddmm_out(result, result, self, mat2, 0.0, 1.0); return result; } @@ -795,7 +795,7 @@ Tensor smm(const Tensor& self, const Tensor& mat2) { // sparse, sparse, dense, real, real -> sparse Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { - auto result = self.type().tensor(); + auto result = at::empty({0}, self.options()); at::sspaddmm_out(result, self, mat1, mat2, beta, alpha); return result; } diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index 3ce0eee53353e8..2626eedebaf5e2 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -110,7 +110,7 @@ inline LongTensor _newFlattenedIndices(const SparseTensor& self, bool forceClone inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { std::vector size = values.sizes().vec(); size[0] = nnz; - return values.type().tensor(size); + return at::empty(size, values.options()); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index 107a30f51c2a97..ab9fb15c628735 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -21,7 +21,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars } LongTensor mask_indices = mask._indices(); Tensor mask_values = mask._values(); - Tensor r_values = r._values().type().tensor(mask_values.sizes()); + Tensor r_values = at::empty(mask_values.sizes(), r._values().options()); _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); @@ -51,7 +51,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars } SparseTensor sparse_mask_cuda(const Tensor& t, SparseTensorRef mask) { - SparseTensor r = t.type().toSparse().tensor(); + SparseTensor r = at::empty({0}, t.options().layout(kSparse)); sparse_mask_out_cuda(r, t, mask.tref); return r; } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 712c28817bf6b1..036666bec82ac2 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -238,7 +238,7 @@ SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse } SparseTensor hspmm_sparse_cuda(const SparseTensor& sparse, const Tensor& dense) { - SparseTensor r = sparse.type().tensor(); + SparseTensor r = at::empty({0}, sparse.options()); hspmm_out_sparse_cuda(r, sparse, dense); return r; } diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index c6355127734b1b..4190e2fcfd1d98 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -4,7 +4,6 @@ #include #include -#include #include #include diff --git a/torch/csrc/autograd/functions/tensor.cpp 
b/torch/csrc/autograd/functions/tensor.cpp index d5a94d49985bca..493a1aadd17552 100644 --- a/torch/csrc/autograd/functions/tensor.cpp +++ b/torch/csrc/autograd/functions/tensor.cpp @@ -60,7 +60,7 @@ auto CopySlices::apply(variable_list&& inputs) -> variable_list { throw std::runtime_error(ERR_BACKWARD_TWICE); } - auto result = grad.type().tensor(base.sizes(), base.strides()); + auto result = at::empty_strided(base.sizes(), base.strides(), grad.options()); result.copy_(grad); auto offset = view.storage_offset() - base.storage_offset(); diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp index 3ba7ff94bc1fd7..4c6ac18453c062 100644 --- a/torch/csrc/autograd/python_legacy_variable.cpp +++ b/torch/csrc/autograd/python_legacy_variable.cpp @@ -46,7 +46,7 @@ static PyObject *THPVariable_pynew(PyTypeObject* type, PyObject *args, PyObject if (!data || data == Py_None) { // For legacy serialization code, create an empty tensor. This is also used // by nn.Parameter() with no arguments. - auto var = torch::tensors::get_default_tensor_type().tensor(); + auto var = at::empty({0}, torch::tensors::get_default_tensor_type().options()); tensor = static_cast(var).data(); } else if (THPVariable_Check(data)) { tensor = ((THPVariable*)data)->cdata.data(); diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 716a1d30c3c9cd..5531348ebdaf0d 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -47,7 +47,7 @@ std::vector broadcast(const Tensor& tensor, IntList devices) { tensors.push_back(tensor); for (auto device : devices.slice(1)) { _device_guard.set_index(device); - tensors.push_back(type.tensor(tensor.sizes())); + tensors.push_back(at::empty(tensor.sizes(), type.options())); } nccl::broadcast(tensors); } else { diff --git a/torch/csrc/jit/batched/BatchTensor.cpp b/torch/csrc/jit/batched/BatchTensor.cpp index a8432809124375..7d621679999f9f 100644 --- a/torch/csrc/jit/batched/BatchTensor.cpp +++ b/torch/csrc/jit/batched/BatchTensor.cpp @@ -14,14 +14,14 @@ BatchTensor::BatchTensor(at::Tensor data, at::Tensor mask, at::Tensor dims){ } BatchTensor::BatchTensor(at::Tensor data, int64_t batch_size){ - dims = data.type().toScalarType(at::kByte).tensor(data.dim()); + dims = at::empty(data.dim(), data.options().dtype(at::kByte)); dims.fill_(0); std::vector sizes(data.dim() + 1, -1); sizes[0] = batch_size; this->data = data.unsqueeze(0).expand(sizes); std::vector mask_sizes(data.dim() + 1, 1); mask_sizes[0] = batch_size; - mask = data.type().toScalarType(at::kByte).tensor(mask_sizes); + mask = at::empty(mask_sizes, data.options().dtype(at::kByte)); mask.fill_(1); } @@ -36,9 +36,9 @@ BatchTensor::BatchTensor(const std::vector datalist, at::Tensor dims } mask_sizes[i] = *dims[i - 1].toByteData() ? 
sizes[i] : 1; } - data = datalist[0].type().tensor(sizes); + data = at::empty(sizes, datalist[0].options()); data.fill_(0); - mask = datalist[0].type().toScalarType(at::kByte).tensor(mask_sizes); + mask = at::empty(mask_sizes, datalist[0].options().dtype(at::kByte)); mask.fill_(0); for(std::size_t i = 0; i < datalist.size(); i++){ auto data_item = data.narrow(0, i, 1); diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp index 54a3c57b83a754..6095bb13748470 100644 --- a/torch/csrc/jit/fusers/common/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -221,7 +221,7 @@ void FusedKernel::launch( outputs.clear(); outputs.reserve(outputDescriptors().size()); for(auto & od : outputDescriptors()) { - outputs.push_back(ref_type.toScalarType(od.scalar_type).tensor()); + outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type))); } launch_with_tensors(inputs, outputs); diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index c853db1b1632d6..e76dae7f961e52 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -533,7 +533,7 @@ struct ADTestSpec { variable_list get_grad_outputs(const variable_list& vars) { return fmap(vars, [](const Variable& v) -> Variable { - return v.type().tensor(v.sizes()).normal_(); + return at::randn(v.sizes(), v.options()); }); } diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 73b4adbf45a45b..2bdc07ad21e8bb 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -314,7 +314,7 @@ Tensor legacy_sparse_tensor_new(const Type& type, PyObject* args, PyObject* kwar auto deviceOptional = r.deviceOptional(0); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.tensor(); + return at::empty({0}, type.options()); } else if (r.idx == 1) { auto cdata = reinterpret_cast(r.toInt64(0)); return type.unsafeTensorFromTH(cdata, true); @@ -374,7 +374,7 @@ Tensor legacy_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { auto deviceOptional = r.deviceOptional(0); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.tensor(); + return at::empty({0}, type.options()); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); } else if (r.idx == 2) { @@ -420,7 +420,7 @@ Tensor legacy_tensor_new(const Type& type, PyObject* args, PyObject* kwargs) { auto deviceOptional = r.deviceOptional(0); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.tensor(); + return at::empty({0}, type.options()); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); } else if (r.idx == 2) { diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index e56d996a36ba33..a521c36eacf88a 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -518,7 +518,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { } deviceGuard.set_index(-1); #endif - entry->src[i] = key.type->tensor(srcSizes[i]); + entry->src[i] = at::empty(srcSizes[i], key.type->options()); } #ifdef USE_CUDA diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 84032a0e3945fa..29b7f3665d3ecd 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -79,7 +79,7 @@ inline at::Tensor newLikeFlat( at::DeviceGuard gpuGuard(device); std::vector 
sizes{static_cast(tensors[deviceIdx].size())}; sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end()); - return t.type().tensor(sizes); + return at::empty(sizes, t.options()); } inline at::Tensor newLikeFlat(std::vector& tensors) { @@ -90,7 +90,7 @@ inline at::Tensor newLikeFlat(std::vector& tensors) { at::DeviceGuard gpuGuard(t.is_cuda() ? t.get_device() : -1); std::vector sizes{static_cast(tensors.size())}; sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end()); - return t.type().tensor(sizes); + return at::empty(sizes, t.options()); } inline std::vector> getSizes( From a9e6a673aec6c479447c61f3bcc5c10ddd1a099f Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Sep 2018 10:39:10 -0700 Subject: [PATCH 02/51] Remove caffe2::Tensor::capacity_nbytes, at::Tensor::to##name##Data, (#11876) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11876 Modern C++ api instead of macros, item() is aligned with Python frontend. caffe2::Tensor::capacity_nbytes is effecitvely unused and confusing w.r.t. caffe2::Tensor::nbytes(). codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCByte "item" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCLong "item" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCInt "item" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCDouble "item" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCFloat "item" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toByteData "data" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toLongData "data" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toIntData "data" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toDoubleData "data" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toFloatData "data" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCByte "item" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCLong "item" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCInt "item" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCDouble "item" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCFloat "item" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toByteData "data" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toLongData "data" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toIntData "data" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toDoubleData "data" codemod -d hphp --extensions cc,cpp,cu,cuh,h,py,hpp,mm toFloatData "data" codemod -d caffe2 --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCComplexDouble "item>" codemod -d tc --extensions cc,cpp,cu,cuh,h,py,hpp,mm toCFloat "item" Reviewed By: ezyang Differential Revision: D9948572 fbshipit-source-id: 70c9f5390d92b82c85fdd5f8a5aebca338ab413c --- aten/src/ATen/core/Tensor.h | 14 +++-------- aten/src/ATen/core/TensorMethods.h | 10 ++++---- aten/src/ATen/native/Embedding.cpp | 2 +- aten/src/ATen/native/Indexing.cpp | 4 ++-- aten/src/ATen/native/LinearAlgebra.cpp | 4 ++-- aten/src/ATen/native/TensorCompare.cpp | 2 +- aten/src/ATen/native/UnaryOps.cpp | 4 ++-- aten/src/ATen/native/cuda/SummaryOps.cu | 2 +- aten/src/ATen/native/cudnn/RNN.cpp | 2 +- aten/src/ATen/templates/Tensor.h | 14 +++-------- aten/src/ATen/templates/TensorMethods.h | 10 ++++---- aten/src/ATen/test/atest.cpp | 3 +-- aten/src/ATen/test/basic.cpp | 24 +++++++++---------- aten/src/ATen/test/scalar_test.cpp | 10 ++++---- caffe2/core/tensor.cc | 2 +- caffe2/core/tensor.h | 8 +++---- caffe2/core/tensor_impl.h | 5 ---- 
.../mobile/contrib/opengl/test/opengl_test.cc | 10 ++++---- test/cpp/api/any.cpp | 6 ++--- test/cpp/api/integration.cpp | 14 +++++------ test/cpp/api/jit.cpp | 6 ++--- test/cpp/api/misc.cpp | 2 +- test/cpp/api/module.cpp | 6 ++--- test/cpp/api/modules.cpp | 16 ++++++------- test/cpp/api/optim.cpp | 8 +++---- test/cpp/api/parallel.cpp | 8 +++---- test/cpp/api/rnn.cpp | 12 +++++----- test/cpp/api/serialize.cpp | 18 +++++++------- test/cpp/api/tensor.cpp | 8 +++---- tools/autograd/templates/Functions.cpp | 10 ++++---- .../templates/python_variable_methods.cpp | 8 +++---- .../csrc/api/include/torch/optim/serialize.h | 2 +- torch/csrc/api/src/optim/lbfgs.cpp | 15 ++++++------ torch/csrc/api/src/optim/serialize.cpp | 2 +- torch/csrc/autograd/engine.cpp | 2 +- .../autograd/python_variable_indexing.cpp | 2 +- torch/csrc/jit/batched/BatchTensor.cpp | 8 +++---- torch/csrc/jit/register_prim_ops.cpp | 10 ++++---- torch/csrc/jit/test_jit.cpp | 16 ++++++------- torch/csrc/utils/pybind.h | 2 +- torch/csrc/utils/python_arg_parser.h | 2 +- 41 files changed, 146 insertions(+), 167 deletions(-) diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index 7445c332200da6..39d12e7d6499ba 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -172,20 +172,12 @@ struct AT_API Tensor { template T * data() const; + template + T item() const; + // Purposely not defined here to avoid inlining void print() const; - //toLongData(), toFloatData() etc. - #define TO_TYPE_DATA(T,name,_) \ - T * to##name##Data() const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) - #undef TO_TYPE_DATA - - #define TO_C_TYPE(T,name,_) \ - T toC##name () const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) - #undef TO_C_TYPE - // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and // dimension. 
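Not part of the patch itself: a minimal standalone sketch of the new accessor spelling described in the commit message above (templated item<T>() for one-element tensors, data<T>() for typed pointers), assuming an ATen build that already carries this change. The names and values below are illustrative only.

// sketch only -- illustrates the rename, not code from this patch
#include <ATen/ATen.h>
#include <cstdint>
#include <iostream>

int main() {
  at::Tensor t = at::ones({3, 4});            // default dtype is float

  // old spelling (removed by this patch): t.sum().toCFloat(), t.toFloatData()
  float total = t.sum().item<float>();        // one-element tensor -> C++ scalar
  float* raw  = t.data<float>();              // typed pointer into the tensor storage
  int64_t n   = t.numel();

  std::cout << total << " over " << n << " elements, first = " << raw[0] << "\n";
  return 0;
}

The same pattern shows up throughout the test hunks below, e.g. sum().toCFloat() becoming sum().item<float>().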
template diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index 789340dc1b91d0..c6197b4fc2d08b 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1241,16 +1241,16 @@ inline Device Tensor::device() const { " but found ", \ at::toString(type().scalarType())); \ return static_cast(this->data_ptr()); \ - } \ - inline T* Tensor::to##name##Data() const { \ - return data(); \ } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) #undef DEFINE_CAST -#define DEFINE_TO_C_TYPE(T,name,_) \ -inline T Tensor::toC##name () const { return _local_scalar().to##name (); } +#define DEFINE_TO_C_TYPE(T, name, _) \ + template <> \ + inline T Tensor::item() const { \ + return _local_scalar().to##name(); \ + } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) #undef DEFINE_TO_C_TYPE diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 761d16b3d1ba03..72518fbd4a0e85 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -168,7 +168,7 @@ Tensor & embedding_renorm_cpu_( continue; } auto row = self[sorted_indices[i]]; - auto norm = row.norm(norm_type).toCDouble(); + auto norm = row.norm(norm_type).item(); if (norm > max_norm) { auto scale = max_norm / (norm + 1e-7); row *= scale; diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 90fdf7b80a6c9a..bb06719d85a498 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -143,8 +143,8 @@ static Tensor unsqueezeN(const Tensor & src, int64_t before, int64_t after) { static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) { if (index.numel() != 0) { - auto max_idx = index.max().toCLong(); - auto min_idx = index.min().toCLong(); + auto max_idx = index.max().item(); + auto min_idx = index.min().item(); AT_CHECK(max_idx < dim_size, "index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); AT_CHECK(min_idx >= -dim_size, diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 1ec850ce9a5926..0cd08c5b2c491e 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -22,7 +22,7 @@ static inline std::tuple _lu_det_P_diag_U_info(const Tensor std::tie(lu, p, info) = self.unsqueeze(0).btrifact_with_info(); p.squeeze_(0); lu.squeeze_(0); - int int_info = info.squeeze_().toCInt(); + int int_info = info.squeeze_().item(); AT_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); auto n = self.size(0); auto num_exchanges = (at::arange(1, n + 1, p.type()) != p).nonzero().size(0); @@ -63,7 +63,7 @@ Tensor logdet(const Tensor& self) { } else { det = diag_U.prod().mul_(det_P); } - if (det.sign().toCDouble() <= 0) { + if (det.sign().item() <= 0) { return det.log_(); // in order to get proper -inf (det=0) or nan (det<0) } else { return diag_U.abs().log().sum(); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index b215180d746e0a..1cca4191fd0792 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -34,7 +34,7 @@ DEFINE_DISPATCH(max_kernel); DEFINE_DISPATCH(min_kernel); bool allclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { - return at::isclose(self, other, rtol, atol, equal_nan).all().toCByte(); + return at::isclose(self, other, rtol, atol, 
equal_nan).all().item(); } Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index a8955a976828bf..f6434b2c957c19 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -99,7 +99,7 @@ Tensor& fill_(Tensor& self, const Tensor& value) { Tensor mvlgamma(const Tensor& self, int64_t p) { AT_CHECK(at::isFloatingType(self.type().scalarType()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().toCByte(), + AT_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. + 0.5, 0.5, 0.5, self.options()); @@ -110,7 +110,7 @@ Tensor mvlgamma(const Tensor& self, int64_t p) { Tensor& mvlgamma_(Tensor& self, int64_t p) { AT_CHECK(at::isFloatingType(self.type().scalarType()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().toCByte(), + AT_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. + 0.5, 0.5, 0.5, self.options()); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 2ab983c17721ae..0ef8ebabf065aa 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -258,7 +258,7 @@ Tensor _bincount_cuda_template( AT_ERROR("input and weights should have the same length"); } - auto nbins = self.max().toCLong() + 1L; + auto nbins = self.max().item() + 1L; nbins = std::max(nbins, minlength); // alloc output counter on GPU Tensor output; diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 5acdab7717b61f..35af9919d46d2e 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1125,7 +1125,7 @@ DropoutState& get_dropout_state(const Type& tp, double dropout_p, bool train) { : ten_dropout_state_cache.at(device); if (train && dropout_p > 0 && !state.buffer.defined()) { std::unique_lock lock {state.mutex}; - int64_t seed = at::empty({}, at::kLong).random_().toCLong(); + int64_t seed = at::empty({}, at::kLong).random_().item(); state.buffer = at::_cudnn_init_dropout_state( tp.toScalarType(at::kByte), dropout_p, train, seed); // NB: CUDA binds the event to a device at creation time, so we can initialize it diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 85e7c84961d6ee..73fdcf4ecb6d9d 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -172,20 +172,12 @@ struct AT_API Tensor { template T * data() const; + template + T item() const; + // Purposely not defined here to avoid inlining void print() const; - //toLongData(), toFloatData() etc. - #define TO_TYPE_DATA(T,name,_) \ - T * to##name##Data() const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) - #undef TO_TYPE_DATA - - #define TO_C_TYPE(T,name,_) \ - T toC##name () const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) - #undef TO_C_TYPE - // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and // dimension. 
template diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 8283bea01f6bed..70f56bd37697d5 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -81,16 +81,16 @@ inline Device Tensor::device() const { " but found ", \ at::toString(type().scalarType())); \ return static_cast(this->data_ptr()); \ - } \ - inline T* Tensor::to##name##Data() const { \ - return data(); \ } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) #undef DEFINE_CAST -#define DEFINE_TO_C_TYPE(T,name,_) \ -inline T Tensor::toC##name () const { return _local_scalar().to##name (); } +#define DEFINE_TO_C_TYPE(T, name, _) \ + template <> \ + inline T Tensor::item() const { \ + return _local_scalar().to##name(); \ + } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) #undef DEFINE_TO_C_TYPE diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index 8dffa3d7c02c75..edb3f79fd2d55d 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -18,7 +18,7 @@ void trace() { trace += foo_a[i][i]; } - EXPECT_FLOAT_EQ(foo.trace().toCFloat(), trace); + EXPECT_FLOAT_EQ(foo.trace().item(), trace); } // TEST_CASE( "atest", "[]" ) { @@ -27,7 +27,6 @@ TEST(atest, atest) { manual_seed(123, at::kCUDA); auto foo = rand({12,6}); - EXPECT_EQ(foo.data(), foo.toFloatData()); EXPECT_EQ(foo.size(0), 12); EXPECT_EQ(foo.size(1), 6); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index c64fdec0089dff..c04518a14fc4d1 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -31,15 +31,15 @@ static void test(Type & type) { CATCH_SECTION( "ones and dot" ) { Tensor b0 = ones({1, 1}, type); - CATCH_REQUIRE(2 == (b0+b0).sum().toCDouble()); + CATCH_REQUIRE(2 == (b0+b0).sum().item()); Tensor b1 = ones({1, 2}, type); - CATCH_REQUIRE(4 == (b1+b1).sum().toCDouble()); + CATCH_REQUIRE(4 == (b1+b1).sum().item()); Tensor b = ones({3, 4}, type); - CATCH_REQUIRE(24 == (b+b).sum().toCDouble()); + CATCH_REQUIRE(24 == (b+b).sum().item()); CATCH_REQUIRE(12 == b.numel()); - CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).item() == 12); } CATCH_SECTION( "rand" ) { @@ -54,7 +54,7 @@ static void test(Type & type) { auto z = b.sort(1); auto z_sorted = std::get<0>(z); - CATCH_REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); + CATCH_REQUIRE(z_sorted[0][0].item() < z_sorted[0][1].item()); } if(type.backend() != Backend::CUDA) @@ -62,7 +62,7 @@ static void test(Type & type) { Tensor b = randperm(15, type); Tensor rv, ri; std::tie(rv, ri) = sort(b, 0); - CATCH_REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); + CATCH_REQUIRE(rv[0].item() <= rv[1].item()); } CATCH_SECTION( "context" ) { @@ -89,7 +89,7 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); } CATCH_SECTION( "loads of adds (with copy)" ) { @@ -102,7 +102,7 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? 
std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); } CATCH_SECTION( "isContiguous" ) { @@ -154,7 +154,7 @@ static void test(Type & type) { CATCH_SECTION( "abs(value)" ) { Tensor r = at::abs(type.scalarTensor(-3)); - CATCH_REQUIRE(r.toCInt() == 3); + CATCH_REQUIRE(r.item() == 3); } //TODO(zach): operator overloads @@ -195,7 +195,7 @@ static void test(Type & type) { auto f = rand({3,4}, type); f[2] = zeros({4}, type); f[1][0] = -1; - CATCH_REQUIRE(f[2][0].toCDouble() == 0); + CATCH_REQUIRE(f[2][0].item() == 0); } CATCH_SECTION( "tensor from TH" ) { @@ -206,14 +206,14 @@ static void test(Type & type) { CATCH_REQUIRE_NOTHROW(tt); } - CATCH_SECTION( "toCFloat" ) { + CATCH_SECTION( "item" ) { Tensor a = zeros({3,4}); Tensor b = ones({3,7}); Tensor c = cat({a,b},1); CATCH_REQUIRE(c.size(1) == 11); Tensor e = rand({}); - CATCH_REQUIRE(*e.data() == e.sum().toCFloat()); + CATCH_REQUIRE(*e.data() == e.sum().item()); } CATCH_SECTION( "to string" ) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 247830c3cc839c..10ffa9afc326ff 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -71,7 +71,7 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { auto t = ones({4,4}); auto wha2 = zeros({4,4}).add(t).sum(); - CATCH_REQUIRE( wha2.toCDouble() == 16.0 ); + CATCH_REQUIRE( wha2.item() == 16.0 ); CATCH_REQUIRE( t.sizes()[0] == 4 ); CATCH_REQUIRE( t.sizes()[1] == 4 ); @@ -116,10 +116,10 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { auto x = ones({1,2}, T); - _CATCH_REQUIRE_THROWS(x.toCFloat()); + _CATCH_REQUIRE_THROWS(x.item()); } auto float_one = ones({}, T); - CATCH_REQUIRE(float_one.toCFloat() == 1); - CATCH_REQUIRE(float_one.toCInt() == 1); - CATCH_REQUIRE((float_one.toCHalf() == 1)); + CATCH_REQUIRE(float_one.item() == 1); + CATCH_REQUIRE(float_one.item() == 1); + CATCH_REQUIRE((float_one.item() == 1)); } diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index e142e1a6b6a90c..caa0ba9ea55f49 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -86,7 +86,7 @@ vector GetTensorInfo( CHECK(tc); CHECK(tc->unsafeGetTensorImpl()); CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()); - *capacity = tc->capacity_nbytes(); + *capacity = tc->storage().capacity(); tc->ExtractDeviceOption(device); return tc->dims(); } diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 286718d4268ca2..1e4cac2788b560 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -262,10 +262,6 @@ class CAFFE2_API Tensor final { return impl_.get()->nbytes(); } - inline size_t capacity_nbytes() const { - return impl_.get()->capacity_nbytes(); - } - inline const vector& dims() const { return impl_.get()->dims(); } @@ -322,6 +318,10 @@ class CAFFE2_API Tensor final { const Storage& storage() { return impl_->storage(); } + + const Storage& storage() const { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 20c398f7e4c82e..53c812f55e297b 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -693,11 +693,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { ; } - // NB: This capacity may also include available space - // in the storage BEFORE the tensor data, if storage_offset != 0 - inline size_t 
capacity_nbytes() const { - return storage_.capacity(); - } /** * Returns the dimensions of the tensor as a vector. */ diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc index 49a875184c10d3..9da266c4e85051 100644 --- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc +++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc @@ -2039,19 +2039,19 @@ void compareModelsForOpenGL(std::string name, CAFFE_ENFORCE_EQ(input_type, "uint8_t"); t_gl->Resize(1, height, width, channel); uint8_t* input = t_gl->mutable_data(); - memcpy(input, t_cpu->mutable_data(), t_cpu->capacity_nbytes()); + memcpy(input, t_cpu->mutable_data(), t_cpu->storage().capacity()); } else if (name == "segmentation") { CAFFE_ENFORCE_EQ(input_order, "NCHW"); CAFFE_ENFORCE_EQ(input_type, "float"); t_gl->Resize(1, channel, height, width); float* input = t_gl->mutable_data(); - memcpy(input, t_cpu->mutable_data(), t_cpu->capacity_nbytes()); + memcpy(input, t_cpu->mutable_data(), t_cpu->storage().capacity()); } else if (name == "denoiser") { CAFFE_ENFORCE_EQ(input_order, "NCHW"); CAFFE_ENFORCE_EQ(input_type, "float"); t_gl->Resize(1, channel, height, width); float* input = t_gl->mutable_data(); - memcpy(input, t_cpu->mutable_data(), t_cpu->capacity_nbytes()); + memcpy(input, t_cpu->mutable_data(), t_cpu->storage().capacity()); } cws.RunNetOnce(truncatedPredictNet); @@ -2149,14 +2149,14 @@ void compareBatchedToTiledModels(std::string name, CAFFE_ENFORCE_EQ(input_type, "uint8_t"); t_tiling->Resize(1, height, width, channel); uint8_t* input = t_tiling->mutable_data(); - memcpy(input, t_batch->mutable_data(), t_batch->capacity_nbytes()); + memcpy(input, t_batch->mutable_data(), t_batch->storage().capacity()); } else if (name == "segmentation") { CAFFE_ENFORCE_EQ(input_order, "NCHW"); CAFFE_ENFORCE_EQ(input_type, "float"); t_tiling->Resize(1, channel, height, width); float* input = t_tiling->mutable_data(); - memcpy(input, t_batch->mutable_data(), t_batch->capacity_nbytes()); + memcpy(input, t_batch->mutable_data(), t_batch->storage().capacity()); } bws.RunNetOnce(bachedNet); diff --git a/test/cpp/api/any.cpp b/test/cpp/api/any.cpp index 0d8e98c4157ab9..22eda0d1004d23 100644 --- a/test/cpp/api/any.cpp +++ b/test/cpp/api/any.cpp @@ -71,7 +71,7 @@ TEST_F( ASSERT_TRUE( any.forward(std::string("a"), std::string("ab"), std::string("abc")) .sum() - .toCInt() == 6); + .item() == 6); } TEST_F(AnyModuleTest, WrongArgumentType) { @@ -232,10 +232,10 @@ TEST_F(AnyModuleTest, ConvertsVariableToTensorCorrectly) { // mismatch). AnyModule any(M{}); ASSERT_TRUE( - any.forward(torch::autograd::Variable(torch::ones(5))).sum().toCFloat() == + any.forward(torch::autograd::Variable(torch::ones(5))).sum().item() == 5); // at::Tensors that are not variables work too. - ASSERT_EQ(any.forward(at::ones(5)).sum().toCFloat(), 5); + ASSERT_EQ(any.forward(at::ones(5)).sum().item(), 5); } namespace torch { diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index 131b0440a41a11..b2d10097b23939 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -63,10 +63,10 @@ class CartPole { } void step(int action) { - auto x = state[0].toCFloat(); - auto x_dot = state[1].toCFloat(); - auto theta = state[2].toCFloat(); - auto theta_dot = state[3].toCFloat(); + auto x = state[0].item(); + auto x_dot = state[1].item(); + auto theta = state[2].item(); + auto theta_dot = state[3].item(); auto force = (action == 1) ? 
force_mag : -force_mag; auto costheta = std::cos(theta); @@ -222,7 +222,7 @@ bool test_mnist( torch::NoGradGuard guard; auto result = std::get<1>(forward_op(tedata).max(1)); torch::Tensor correct = (result == telabel).toType(torch::kFloat32); - return correct.sum().toCFloat() > telabel.size(0) * 0.8; + return correct.sum().item() > telabel.size(0) * 0.8; } struct IntegrationTest : torch::test::SeedingFixture {}; @@ -251,7 +251,7 @@ TEST_F(IntegrationTest, CartPole) { auto out = forward(state); auto probs = torch::Tensor(std::get<0>(out)); auto value = torch::Tensor(std::get<1>(out)); - auto action = probs.multinomial(1)[0].toCInt(); + auto action = probs.multinomial(1)[0].item(); // Compute the log prob of a multinomial distribution. // This should probably be actually implemented in autogradpp... auto p = probs / probs.sum(-1, true); @@ -274,7 +274,7 @@ TEST_F(IntegrationTest, CartPole) { std::vector policy_loss; std::vector value_loss; for (auto i = 0U; i < saved_log_probs.size(); i++) { - auto r = rewards[i] - saved_values[i].toCFloat(); + auto r = rewards[i] - saved_values[i].item(); policy_loss.push_back(-r * saved_log_probs[i]); value_loss.push_back( torch::smooth_l1_loss(saved_values[i], torch::ones(1) * rewards[i])); diff --git a/test/cpp/api/jit.cpp b/test/cpp/api/jit.cpp index 34b3e8f630c2a1..9aa6968df71f55 100644 --- a/test/cpp/api/jit.cpp +++ b/test/cpp/api/jit.cpp @@ -20,10 +20,10 @@ TEST(TorchScriptTest, CanCompileMultipleFunctions) { auto a = torch::ones(1); auto b = torch::ones(1); - ASSERT_EQ(1, module->run_method("test_mul", a, b).toTensor().toCLong()); + ASSERT_EQ(1, module->run_method("test_mul", a, b).toTensor().item()); - ASSERT_EQ(2, module->run_method("test_relu", a, b).toTensor().toCLong()); + ASSERT_EQ(2, module->run_method("test_relu", a, b).toTensor().item()); ASSERT_TRUE( - 0x200 == module->run_method("test_while", a, b).toTensor().toCLong()); + 0x200 == module->run_method("test_while", a, b).toTensor().item()); } diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index ca716d0ac0c956..b85cb9dcc1a86f 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -49,5 +49,5 @@ TEST(NNInitTest, CanInitializeTensorThatRequiresGrad) { tensor.fill_(1), "a leaf Variable that requires grad " "has been used in an in-place operation"); - ASSERT_EQ(torch::nn::init::ones_(tensor).sum().toCInt(), 12); + ASSERT_EQ(torch::nn::init::ones_(tensor).sum().item(), 12); } diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp index f2bca9501ae648..70d05d4240e77d 100644 --- a/test/cpp/api/module.cpp +++ b/test/cpp/api/module.cpp @@ -41,13 +41,13 @@ TEST_F(ModuleTest, ZeroGrad) { for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); ASSERT_TRUE(grad.defined()); - ASSERT_NE(grad.sum().toCFloat(), 0); + ASSERT_NE(grad.sum().item(), 0); } module->zero_grad(); for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); ASSERT_TRUE(grad.defined()); - ASSERT_EQ(grad.sum().toCFloat(), 0); + ASSERT_EQ(grad.sum().item(), 0); } } @@ -72,7 +72,7 @@ TEST_F(ModuleTest, ZeroGradWithUndefined) { ASSERT_TRUE(module.x.grad().defined()); ASSERT_FALSE(module.y.grad().defined()); - ASSERT_EQ(module.x.grad().sum().toCFloat(), 0); + ASSERT_EQ(module.x.grad().sum().item(), 0); } TEST_F(ModuleTest, CanGetName) { diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 11e54a97a1885f..fd9416eb3b9b6a 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -134,7 +134,7 @@ TEST_F(ModulesTest, SimpleContainer) { 
ASSERT_EQ(x.ndimension(), 2); ASSERT_EQ(x.size(0), 1000); ASSERT_EQ(x.size(1), 100); - ASSERT_EQ(x.min().toCFloat(), 0); + ASSERT_EQ(x.min().item(), 0); } TEST_F(ModulesTest, EmbeddingBasic) { @@ -181,12 +181,12 @@ TEST_F(ModulesTest, Dropout) { y.backward(); ASSERT_EQ(y.ndimension(), 1); ASSERT_EQ(y.size(0), 100); - ASSERT_LT(y.sum().toCFloat(), 130); // Probably - ASSERT_GT(y.sum().toCFloat(), 70); // Probably + ASSERT_LT(y.sum().item(), 130); // Probably + ASSERT_GT(y.sum().item(), 70); // Probably dropout->eval(); y = dropout->forward(x); - ASSERT_EQ(y.sum().toCFloat(), 100); + ASSERT_EQ(y.sum().item(), 100); } TEST_F(ModulesTest, Parameters) { @@ -228,15 +228,15 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) { TEST_F(ModulesTest, FunctionalWithTorchFunction) { auto functional = Functional(torch::relu); - ASSERT_EQ(functional(torch::ones({})).toCFloat(), 1); - ASSERT_EQ(functional(torch::ones({})).toCFloat(), 1); - ASSERT_EQ(functional(torch::ones({}) * -1).toCFloat(), 0); + ASSERT_EQ(functional(torch::ones({})).item(), 1); + ASSERT_EQ(functional(torch::ones({})).item(), 1); + ASSERT_EQ(functional(torch::ones({}) * -1).item(), 0); } TEST_F(ModulesTest, FunctionalArgumentBinding) { auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); - ASSERT_EQ(functional(torch::ones({})).toCFloat(), 0); + ASSERT_EQ(functional(torch::ones({})).item(), 0); } TEST_F(ModulesTest, BatchNormStateful) { diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp index 03f7ed92a9b35c..944a31ca7e997b 100644 --- a/test/cpp/api/optim.cpp +++ b/test/cpp/api/optim.cpp @@ -44,7 +44,7 @@ bool test_optimizer_xor(Options options) { auto labels = torch::empty({kBatchSize}); for (size_t i = 0; i < kBatchSize; i++) { inputs[i] = torch::randint(2, {2}, torch::kInt64); - labels[i] = inputs[i][0].toCLong() ^ inputs[i][1].toCLong(); + labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); } inputs.set_requires_grad(true); optimizer.zero_grad(); @@ -54,7 +54,7 @@ bool test_optimizer_xor(Options options) { optimizer.step(); - running_loss = running_loss * 0.99 + loss.toCFloat() * 0.01; + running_loss = running_loss * 0.99 + loss.item() * 0.01; if (epoch > kMaximumNumberOfEpochs) { std::cout << "Loss is too high after epoch " << epoch << ": " << running_loss << std::endl; @@ -286,14 +286,14 @@ TEST(OptimTest, ZeroGrad) { for (const auto& parameter : model->parameters()) { ASSERT_TRUE(parameter->grad().defined()); - ASSERT_GT(parameter->grad().sum().toCFloat(), 0); + ASSERT_GT(parameter->grad().sum().item(), 0); } optimizer.zero_grad(); for (const auto& parameter : model->parameters()) { ASSERT_TRUE(parameter->grad().defined()); - ASSERT_EQ(parameter->grad().sum().toCFloat(), 0); + ASSERT_EQ(parameter->grad().sum().item(), 0); } } diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp index 71bcc542f8439f..a1910782364479 100644 --- a/test/cpp/api/parallel.cpp +++ b/test/cpp/api/parallel.cpp @@ -38,7 +38,7 @@ TEST_F(ParallelTest, DifferentiableScatter_MultiCUDA) { ASSERT_TRUE(input.grad().defined()); ASSERT_TRUE(input.grad().device().is_cpu()); - ASSERT_EQ(input.grad().sum().toCInt(), 10); + ASSERT_EQ(input.grad().sum().item(), 10); } TEST_F(ParallelTest, DifferentiableGather_MultiCUDA) { @@ -62,11 +62,11 @@ TEST_F(ParallelTest, DifferentiableGather_MultiCUDA) { ASSERT_TRUE(a.grad().defined()); ASSERT_EQ(a.grad().device(), torch::Device(torch::kCUDA, 0)); - ASSERT_EQ(a.grad().sum().toCInt(), 5); + ASSERT_EQ(a.grad().sum().item(), 5); ASSERT_TRUE(b.grad().defined()); 
ASSERT_EQ(b.grad().device(), torch::Device(torch::kCUDA, 1)); - ASSERT_EQ(b.grad().sum().toCInt(), 5); + ASSERT_EQ(b.grad().sum().item(), 5); } TEST_F(ParallelTest, Replicate_MultiCUDA) { @@ -226,6 +226,6 @@ TEST_F(ParallelTest, DataParallelUsesAllAvailableCUDADevices_CUDA) { const auto device_count = torch::cuda::device_count(); ASSERT_EQ(output.numel(), device_count); for (size_t i = 0; i < device_count; ++i) { - ASSERT_EQ(output[i].toCInt(), i); + ASSERT_EQ(output[i].item(), i); } } diff --git a/test/cpp/api/rnn.cpp b/test/cpp/api/rnn.cpp index 96ffd37eb0f628..e0d511fb099381 100644 --- a/test/cpp/api/rnn.cpp +++ b/test/cpp/api/rnn.cpp @@ -56,7 +56,7 @@ bool test_RNN_xor(Func&& model_maker, bool cuda = false) { loss.backward(); optimizer.step(); - running_loss = running_loss * 0.99 + loss.toCFloat() * 0.01; + running_loss = running_loss * 0.99 + loss.item() * 0.01; if (epoch > max_epoch) { return false; } @@ -81,7 +81,7 @@ void check_lstm_sizes(RNNOutput output) { ASSERT_EQ(output.state.size(3), 64); // 64 hidden dims // Something is in the hiddens - ASSERT_GT(output.state.norm().toCFloat(), 0); + ASSERT_GT(output.state.norm().item(), 0); } struct RNNTest : torch::test::SeedingFixture {}; @@ -103,7 +103,7 @@ TEST_F(RNNTest, CheckOutputSizes) { torch::Tensor diff = next.state - output.state; // Hiddens changed - ASSERT_GT(diff.abs().sum().toCFloat(), 1e-3); + ASSERT_GT(diff.abs().sum().item(), 1e-3); } TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { @@ -137,7 +137,7 @@ TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { 0.6620, 0.7860, 0.6501, 0.7741, 0.7889, 0.9003, 0.7769, 0.8905, 0.7635, 0.8794, 0.7484, 0.8666}; for (size_t i = 0; i < 3 * 4 * 2; i++) { - ASSERT_LT(std::abs(flat[i].toCFloat() - c_out[i]), 1e-3); + ASSERT_LT(std::abs(flat[i].item() - c_out[i]), 1e-3); } ASSERT_EQ(out.state.ndimension(), 4); // (hx, cx) x layers x B x 2 @@ -163,7 +163,7 @@ TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { 1.0931, 1.4911}; for (size_t i = 0; i < 16; i++) { - ASSERT_LT(std::abs(flat[i].toCFloat() - h_out[i]), 1e-3); + ASSERT_LT(std::abs(flat[i].item() - h_out[i]), 1e-3); } } @@ -206,7 +206,7 @@ TEST_F(RNNTest, Sizes_CUDA) { torch::Tensor diff = next.state - output.state; // Hiddens changed - ASSERT_GT(diff.abs().sum().toCFloat(), 1e-3); + ASSERT_GT(diff.abs().sum().item(), 1e-3); } TEST_F(RNNTest, EndToEndLSTM_CUDA) { diff --git a/test/cpp/api/serialize.cpp b/test/cpp/api/serialize.cpp index a37c00c2e3eff9..0612029f53bcab 100644 --- a/test/cpp/api/serialize.cpp +++ b/test/cpp/api/serialize.cpp @@ -90,7 +90,7 @@ TEST(Serialize, XOR) { auto labels = torch::empty({batch_size}); for (size_t i = 0; i < batch_size; i++) { inputs[i] = torch::randint(2, {2}, torch::kInt64); - labels[i] = inputs[i][0].toCLong() ^ inputs[i][1].toCLong(); + labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); } auto x = model->forward(inputs); return torch::binary_cross_entropy(x, labels); @@ -112,7 +112,7 @@ TEST(Serialize, XOR) { loss.backward(); optimizer.step(); - running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01; + running_loss = running_loss * 0.99 + loss.sum().item() * 0.01; ASSERT_LT(epoch, 3000); epoch++; } @@ -122,7 +122,7 @@ TEST(Serialize, XOR) { torch::load(model2, tempfile.str()); auto loss = getLoss(model2, 100); - ASSERT_LT(loss.toCFloat(), 0.1); + ASSERT_LT(loss.item(), 0.1); } TEST(Serialize, Optim) { @@ -188,9 +188,9 @@ TEST(Serialize, Optim) { const auto& name = p.key; // Model 1 and 3 should be the same ASSERT_TRUE( - param1[name].norm().toCFloat() == 
param3[name].norm().toCFloat()); + param1[name].norm().item() == param3[name].norm().item()); ASSERT_TRUE( - param1[name].norm().toCFloat() != param2[name].norm().toCFloat()); + param1[name].norm().item() != param2[name].norm().item()); } } @@ -202,7 +202,7 @@ TEST(Serialize, Optim) { // auto labels = torch::empty({batch_size}); // for (size_t i = 0; i < batch_size; i++) { // inputs[i] = torch::randint(2, {2}, torch::kInt64); -// labels[i] = inputs[i][0].toCLong() ^ inputs[i][1].toCLong(); +// labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); // } // auto x = model->forward(inputs); // return torch::binary_cross_entropy(x, labels); @@ -224,7 +224,7 @@ TEST(Serialize, Optim) { // loss.backward(); // optimizer.step(); // -// running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01; +// running_loss = running_loss * 0.99 + loss.sum().item() * 0.01; // ASSERT_LT(epoch, 3000); // epoch++; // } @@ -234,7 +234,7 @@ TEST(Serialize, Optim) { // torch::load(model2, tempfile.str()); // // auto loss = getLoss(model2, 100); -// ASSERT_LT(loss.toCFloat(), 0.1); +// ASSERT_LT(loss.item(), 0.1); // // model2->to(torch::kCUDA); // torch::test::TempFile tempfile2; @@ -242,5 +242,5 @@ TEST(Serialize, Optim) { // torch::load(model3, tempfile2.str()); // // loss = getLoss(model3, 100); -// ASSERT_LT(loss.toCFloat(), 0.1); +// ASSERT_LT(loss.item(), 0.1); // } diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp index ad14298d86c967..3996132cc84796 100644 --- a/test/cpp/api/tensor.cpp +++ b/test/cpp/api/tensor.cpp @@ -104,7 +104,7 @@ TEST(TensorTest, ContainsCorrectValueForSingleValue) { auto tensor = at::tensor(123); ASSERT_EQ(tensor.numel(), 1); ASSERT_EQ(tensor.dtype(), at::kInt); - ASSERT_EQ(tensor[0].toCInt(), 123); + ASSERT_EQ(tensor[0].item(), 123); tensor = at::tensor(123.456f); ASSERT_EQ(tensor.numel(), 1); @@ -189,7 +189,7 @@ TEST(TensorTest, FromBlob) { auto tensor = torch::from_blob(v.data(), v.size(), torch::kInt32); ASSERT_TRUE(tensor.is_variable()); ASSERT_EQ(tensor.numel(), 3); - ASSERT_EQ(tensor[0].toCInt(), 1); - ASSERT_EQ(tensor[1].toCInt(), 2); - ASSERT_EQ(tensor[2].toCInt(), 3); + ASSERT_EQ(tensor[0].item(), 1); + ASSERT_EQ(tensor[1].item(), 2); + ASSERT_EQ(tensor[2].item(), 3); } diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 36cb420fb1be9e..f30701a4065176 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -219,7 +219,7 @@ Tensor prod_backward(Tensor grad, const Tensor& input, Tensor result, int64_t di Tensor zero_mask = (input == 0); Tensor slice_zero_count = zero_mask.sum(dim, true); - int64_t total_zeros = slice_zero_count.sum().toCLong(); + int64_t total_zeros = slice_zero_count.sum().item(); if (total_zeros == 0) { return (grad * result) / input; } else { @@ -321,7 +321,7 @@ Tensor cumprod_backward(const Tensor &grad, const Tensor &input, int64_t dim) { } // Simple case with nonzero elements in the input - if ((input != 0).all().toCByte()) { + if ((input != 0).all().item()) { Tensor result = at::cumprod(input, dim); return sum_scan_exclusive(result * grad, dim) / input; } @@ -1600,7 +1600,7 @@ Tensor symeig_backward(const std::vector &grads, cons // Invertible case is derived from Jacobi's formula, and also can be found at: // http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) { - auto det_val = det.toCDouble(); + auto det_val = det.item(); if (det_val != 0 /* invertible */) { return 
grad * det * self.inverse().t(); } else /* otherwise det = \prod(sigma) = 0, use svd */ { @@ -1612,7 +1612,7 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) } Tensor logdet_backward(const Tensor & grad, const Tensor& self, const Tensor& logdet) { - auto logdet_val = logdet.toCDouble(); + auto logdet_val = logdet.item(); if (logdet_val != -INFINITY /* det != 0, invertible */) { return grad * self.inverse().t(); } else /* otherwise det = \prod(sigma) = 0, use svd */ { @@ -1628,7 +1628,7 @@ Tensor slogdet_backward(const std::vector &grads, const Tensor& self, const Tensor& signdet, const Tensor& logabsdet) { AT_ASSERTM(!grads[0].defined(), "slogdet's sign output should never have gradient"); - auto signdet_val = signdet.toCDouble(); + auto signdet_val = signdet.item(); if (signdet_val != 0 /* det != 0, invertible */) { return grads[1] * self.inverse().t(); } else /* otherwise det = \prod(sigma) = 0, use svd */ { diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index d92ad3dbf7688b..c10de2c19f6f72 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -156,7 +156,7 @@ static double dispatch_to_CDouble(const Tensor & self) { if (self.numel() != 1) { throw ValueError("only one element tensors can be converted to Python scalars"); } - return self.toCDouble(); + return self.item(); } static std::complex dispatch_to_CComplexDouble(const Tensor & self) { @@ -165,7 +165,7 @@ static std::complex dispatch_to_CComplexDouble(const Tensor & self) { if (self.numel() != 1) { throw ValueError("only one element tensors can be converted to Python scalars"); } - return self.toCComplexDouble(); + return self.item>(); } static int64_t dispatch_to_CLong(const Tensor & self) { @@ -174,7 +174,7 @@ static int64_t dispatch_to_CLong(const Tensor & self) { if (self.numel() != 1) { throw ValueError("only one element tensors can be converted to Python scalars"); } - return self.toCLong(); + return self.item(); } static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { @@ -190,7 +190,7 @@ static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { jit::tracer::warn("Converting a tensor to a Python integer", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (isFloatingType(self_.type().scalarType())) { - // we can't dispatch to toCLong here because we want to avoid ATen overflow checks; + // we can't dispatch to item here because we want to avoid ATen overflow checks; // the python integral type (long in python2) can't overflow. 
return THPUtils_packDoubleAsInt(dispatch_to_CDouble(self_)); } else { diff --git a/torch/csrc/api/include/torch/optim/serialize.h b/torch/csrc/api/include/torch/optim/serialize.h index 163ebbdcf098b5..1c85fa74e0062f 100644 --- a/torch/csrc/api/include/torch/optim/serialize.h +++ b/torch/csrc/api/include/torch/optim/serialize.h @@ -51,7 +51,7 @@ void serialize( BufferContainer& buffers) { torch::Tensor size_tensor; archive.read(key + "/size", size_tensor); - const size_t size = size_tensor.toCLong(); + const size_t size = size_tensor.item(); for (size_t index = 0; index < size; ++index) { buffers.emplace_back(); archive.read( diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index 7f6104876bcf09..37c4b1dcaf4251 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -31,7 +31,7 @@ void LBFGS::add_grad(const torch::Tensor& step_size, const Tensor& update) { Tensor& pd = autograd::Variable(parameter).data(); pd.add_( update.slice(0, offset, offset + numel, 1).view_as(pd), - step_size.toCFloat()); + step_size.item()); offset += numel; } } @@ -45,7 +45,7 @@ torch::Tensor LBFGS::step(LossClosure closure) { Tensor flat_grad = gather_flat_grad(); Tensor abs_grad_sum = flat_grad.abs().sum(); - if (abs_grad_sum.toCFloat() <= options.tolerance_grad_) { + if (abs_grad_sum.item() <= options.tolerance_grad_) { return loss; } @@ -65,7 +65,7 @@ torch::Tensor LBFGS::step(LossClosure closure) { Tensor s = d.mul(t); Tensor ys = y.dot(s); - if (ys.toCFloat() > 1e-10) { + if (ys.item() > 1e-10) { // updating memory if (old_dirs.size() == options.history_size_) { @@ -140,14 +140,15 @@ torch::Tensor LBFGS::step(LossClosure closure) { break; } else if (current_evals >= options.max_eval_) { break; - } else if (abs_grad_sum.toCFloat() <= options.tolerance_grad_) { + } else if (abs_grad_sum.item() <= options.tolerance_grad_) { break; - } else if (gtd.toCFloat() > -options.tolerance_grad_) { + } else if (gtd.item() > -options.tolerance_grad_) { break; - } else if (d.mul(t).abs_().sum().toCFloat() <= options.tolerance_change_) { + } else if ( + d.mul(t).abs_().sum().item() <= options.tolerance_change_) { break; } else if ( - std::abs(loss.toCFloat() - prev_loss.toCFloat()) < + std::abs(loss.item() - prev_loss.item()) < options.tolerance_change_) { break; } diff --git a/torch/csrc/api/src/optim/serialize.cpp b/torch/csrc/api/src/optim/serialize.cpp index fbda6af91f32c6..24f9096c6ac36a 100644 --- a/torch/csrc/api/src/optim/serialize.cpp +++ b/torch/csrc/api/src/optim/serialize.cpp @@ -31,7 +31,7 @@ void serialize( serialize(archive, key, tensors); steps.clear(); for (const auto& step : tensors) { - steps.push_back(step.toCLong()); + steps.push_back(step.item()); } } } // namespace detail diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index d0ecc017b42b50..1847bb65b08f8a 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -407,7 +407,7 @@ auto Engine::evaluate_function(FunctionTask& task) -> void { for (int i = 0; i < num_outputs; ++i) { auto& output = outputs[i]; at::DeviceGuard guard(output); - if (output.defined() && output.ne(output).any().toCByte()) { + if (output.defined() && output.ne(output).any().item()) { std::stringstream ss; ss << "Function '" << fn.name() << "' returned nan values in its " << i << "th output."; throw std::runtime_error(ss.str()); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 
b50dddace66c50..abb588bee8fc54 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -173,7 +173,7 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis result = applySelect(result, dim, THPUtils_unpackLong(obj)); } else { result = result.unsqueeze(dim); - handle_var(boolToIndexingTensor(result, var.toCByte() != 0)); + handle_var(boolToIndexingTensor(result, var.item() != 0)); } } else { handle_var(var); diff --git a/torch/csrc/jit/batched/BatchTensor.cpp b/torch/csrc/jit/batched/BatchTensor.cpp index 7d621679999f9f..564b4b7e4449b4 100644 --- a/torch/csrc/jit/batched/BatchTensor.cpp +++ b/torch/csrc/jit/batched/BatchTensor.cpp @@ -34,7 +34,7 @@ BatchTensor::BatchTensor(const std::vector datalist, at::Tensor dims for(auto x : datalist){ sizes[i] = std::max(sizes[i], x.size(i)); } - mask_sizes[i] = *dims[i - 1].toByteData() ? sizes[i] : 1; + mask_sizes[i] = *dims[i - 1].data() ? sizes[i] : 1; } data = at::empty(sizes, datalist[0].options()); data.fill_(0); @@ -44,7 +44,7 @@ BatchTensor::BatchTensor(const std::vector datalist, at::Tensor dims auto data_item = data.narrow(0, i, 1); auto mask_item = mask.narrow(0, i, 1); for(int64_t j = 0; j < dims.size(0); j++){ - if(*dims[j].toByteData()){ + if(*dims[j].data()){ data_item = data_item.narrow(j + 1, 0, datalist[i].size(j + 1)); mask_item = mask_item.narrow(j + 1, 0, datalist[i].size(j + 1)); } @@ -62,12 +62,12 @@ std::vector BatchTensor::examples() { data = data.sum(d, /*keepdim=*/true); while(data.dim() >= 1) data = data[0]; - return *data.toLongData(); + return *data.data(); }; for(int64_t i = 0; i < data.size(0); i++){ auto data_tmp = data.narrow(0, i, 1); for(int64_t d = 0; d < dims.size(0); d++){ - if(*dims[d].toByteData()){ + if(*dims[d].data()){ data_tmp = data_tmp.narrow(d + 1, 0, mask_sum(mask[i], d)); } } diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 1f8618121f1e28..71168cd3ee3d4d 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -70,7 +70,7 @@ RegisterOperators reg({ at::Tensor a; pop(stack, a); at::DeviceGuard guard(a); - push(stack, a.toCLong()); + push(stack, a.item()); return 0; }; } else { @@ -78,7 +78,7 @@ RegisterOperators reg({ at::Tensor a; pop(stack, a); at::DeviceGuard guard(a); - push(stack, a.toCDouble()); + push(stack, a.item()); return 0; }; } @@ -92,7 +92,7 @@ RegisterOperators reg({ pop(stack, a); checkImplicitTensorToNum(a, /*to int*/true); at::DeviceGuard guard(a); - push(stack, a.toCLong()); + push(stack, a.item()); return 0; }; } else { @@ -101,7 +101,7 @@ RegisterOperators reg({ pop(stack, a); checkImplicitTensorToNum(a, /*to int*/false); at::DeviceGuard guard(a); - push(stack, a.toCDouble()); + push(stack, a.item()); return 0; }; } @@ -727,7 +727,7 @@ RegisterOperators reg2({ pop(stack, t); std::vector elems; for(int i = 0; i < t.size(0); i++){ - elems.push_back(*t[i].toIntData()); + elems.push_back(*t[i].data()); } push(stack, jit::IntList::create(elems)); return 0; diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index e76dae7f961e52..9942437c9eb288 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -148,7 +148,7 @@ static void fusionTests() { auto outputs = debugLaunchGraph(graph, 0, {a,b}); CATCH_REQUIRE(outputs.size() == 1); auto o2 = a*b; - float max_diff = (o2 - outputs[0]).abs().max().toCDouble(); + float max_diff = (o2 - outputs[0]).abs().max().item(); //std::cout << 
"max diff: " << max_diff << "\n"; CATCH_REQUIRE(max_diff == 0); }; @@ -202,7 +202,7 @@ static void fusionTests() { auto outputs = debugLaunchGraph(graph, 0, inputs); CATCH_REQUIRE(outputs.size() == graph.outputs().size()); CATCH_REQUIRE(out0.is_same_size(outputs.front())); - float max_diff = (outputs.front() - out0).abs().max().toCDouble(); + float max_diff = (outputs.front() - out0).abs().max().item(); CATCH_REQUIRE(max_diff < 1e-6); }; @@ -236,9 +236,9 @@ static void fusionTests() { auto outputs = debugLaunchGraph(graph, 0, {a,b}); CATCH_REQUIRE(outputs.size() == 2); - float max_diff = (o_r - outputs[0]).abs().max().toCDouble(); + float max_diff = (o_r - outputs[0]).abs().max().item(); CATCH_REQUIRE(max_diff == 0); - float max_diff2 = (o2_r - outputs[1]).abs().max().toCDouble(); + float max_diff2 = (o2_r - outputs[1]).abs().max().item(); CATCH_REQUIRE(max_diff2 == 0); }; testConcat(0); @@ -325,16 +325,16 @@ at::Tensor t_def(at::Tensor x) { bool checkRtol(const at::Tensor& diff, const std::vector inputs) { double maxValue = 0.0; for (auto& tensor : inputs) { - maxValue = fmax(tensor.abs().max().toCFloat(), maxValue); + maxValue = fmax(tensor.abs().max().item(), maxValue); } - return diff.abs().max().toCFloat() < 2e-6 * maxValue; + return diff.abs().max().item() < 2e-6 * maxValue; } bool almostEqual(const at::Tensor & a, const at::Tensor & b) { return checkRtol(a - b,{a, b}); } bool exactlyEqual(const at::Tensor & a, const at::Tensor & b) { - return (a - b).abs().max().toCFloat() == 0.f; + return (a - b).abs().max().item() == 0.f; } std::pair @@ -873,7 +873,7 @@ void testControlFlow() { }; auto L = [](int64_t l) { return IValue(autograd::make_variable(scalar_to_tensor(at::Scalar(l)))); }; - auto V = [](IValue t) { return std::move(t).toTensor().toCLong(); }; + auto V = [](IValue t) { return std::move(t).toTensor().item(); }; auto run_binary = [&](const std::string & name, int64_t a, int64_t b) { return V(run(name, {L(a), L(b)})[0]); }; diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h index 2188681906d531..85fc443bb40b75 100644 --- a/torch/csrc/utils/pybind.h +++ b/torch/csrc/utils/pybind.h @@ -72,7 +72,7 @@ template<> struct type_caster { for (int idx = 0; idx < size; idx++) { PyObject* obj = tuple ? 
PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx); if (THPVariable_Check(obj)) { - v_value[idx] = THPVariable_Unpack(obj).toCLong(); + v_value[idx] = THPVariable_Unpack(obj).item(); } else if (PyLong_Check(obj)) { // use THPUtils_unpackLong after it is safe to include python_numbers.h v_value[idx] = THPUtils_unpackLong(obj); diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 9ff25d2d4e5134..d4c15fd9482f08 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -300,7 +300,7 @@ inline std::vector PythonArgs::intlistWithDefault(int i, std::vector(); continue; } else { res[idx] = THPUtils_unpackIndex(obj); From 1a1d79e761fd09d27276a88dfe4ec59b1cc3067d Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Sep 2018 10:48:55 -0700 Subject: [PATCH 03/51] Remove TIndex typedef from core/common.h (#11993) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11993 See title Reviewed By: ezyang Differential Revision: D10006069 fbshipit-source-id: 5e2aac993968307c850e431c00052cb1a339ced2 --- caffe2/core/common.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 048d634df80dfa..a5d4cf60b603c0 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -30,10 +30,6 @@ namespace caffe2 { -// Data type for caffe2 Index/Size. We use size_t to be safe here as well as for -// large matrices that are common in sparse math. -typedef int64_t TIndex; - // Note(Yangqing): NVCC does not play well with unordered_map on some platforms, // forcing us to use std::map instead of unordered_map. This may affect speed // in some cases, but in most of the computation code we do not access map very From a6f1ae7f20656d91528fd5964e1da09b8b440e35 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Mon, 24 Sep 2018 11:02:46 -0700 Subject: [PATCH 04/51] set up c10 scaffolding. Move macros proper first. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11939 Reviewed By: orionr, dzhulgakov Differential Revision: D10004629 Pulled By: Yangqing fbshipit-source-id: ba50a96820d35c7922d81c78c4cbe849c85c251c --- CMakeLists.txt | 6 ++ CONTRIBUTING.md | 6 +- aten/src/ATen/CPUGeneral.h | 8 +- aten/src/ATen/CPUTypeDefault.h | 2 +- aten/src/ATen/Context.h | 14 +-- aten/src/ATen/DLConvertor.h | 6 +- aten/src/ATen/ExpandUtils.h | 9 +- aten/src/ATen/SparseTensorImpl.h | 2 +- aten/src/ATen/TensorGeometry.h | 2 +- aten/src/ATen/TensorUtils.h | 94 +++++++++++++------ aten/src/ATen/Utils.h | 2 +- aten/src/ATen/core/ATenGeneral.h | 1 - aten/src/ATen/core/Formatting.h | 11 ++- aten/src/ATen/core/Generator.h | 2 +- aten/src/ATen/core/Macros.h | 43 +-------- aten/src/ATen/core/OptionsGuard.h | 7 +- aten/src/ATen/core/Registry.h | 15 +-- aten/src/ATen/core/Scalar.h | 4 +- aten/src/ATen/core/Storage.h | 4 +- aten/src/ATen/core/StorageImpl.h | 2 +- aten/src/ATen/core/Tensor.h | 4 +- aten/src/ATen/core/TensorImpl.h | 2 +- aten/src/ATen/core/TensorOptions.h | 2 +- aten/src/ATen/core/Type.h | 3 +- aten/src/ATen/core/UndefinedTensorImpl.h | 4 +- aten/src/ATen/core/VariableHooksInterface.h | 8 +- aten/src/ATen/core/context_base.h | 6 +- aten/src/ATen/core/ivalue.cpp | 3 +- aten/src/ATen/core/ivalue.h | 10 +- aten/src/ATen/cuda/CUDAContext.h | 27 +++--- aten/src/ATen/detail/CUDAHooksInterface.h | 8 +- aten/src/ATen/detail/ComplexHooksInterface.h | 6 +- aten/src/ATen/function_wrapper.py | 2 +- aten/src/ATen/native/DispatchStub.h | 8 +- aten/src/ATen/native/TensorIterator.h | 8 +- aten/src/ATen/templates/NativeFunctions.h | 34 +++---- aten/src/ATen/templates/Tensor.h | 4 +- aten/src/ATen/templates/Type.h | 3 +- aten/src/ATen/templates/TypeDefault.h | 2 +- .../ATen/templates/TypeExtendedInterface.h | 2 +- aten/src/TH/THAllocator.h | 12 ++- aten/src/THC/THCAllocator.h | 4 +- c10/CMakeLists.txt | 38 ++++++++ c10/c10_dummy.cpp | 7 ++ c10/c10_dummy.h | 7 ++ c10/macros/Export.h | 76 +++++++++++++++ c10/macros/Legacy.h | 20 ++++ c10/macros/Macros.h | 32 +++++++ c10/macros/cmake_macros.h.in | 6 ++ caffe2/CMakeLists.txt | 4 +- caffe2/core/common.h | 44 +-------- caffe2/core/logging.h | 1 + caffe2/perfkernels/CMakeLists.txt | 4 +- modules/observers/macros.h | 7 ++ modules/observers/net_observer_reporter.h | 1 + .../observers/net_observer_reporter_print.h | 1 + modules/observers/observer_config.h | 1 + modules/observers/perf_observer.h | 1 + setup.py | 2 + 59 files changed, 412 insertions(+), 242 deletions(-) create mode 100644 c10/CMakeLists.txt create mode 100644 c10/c10_dummy.cpp create mode 100644 c10/c10_dummy.h create mode 100644 c10/macros/Export.h create mode 100644 c10/macros/Legacy.h create mode 100644 c10/macros/Macros.h create mode 100644 c10/macros/cmake_macros.h.in create mode 100644 modules/observers/macros.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 60d69ef2d9bc68..7bf587dc30fc97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,11 @@ cmake_minimum_required(VERSION 3.5 FATAL_ERROR) # ---[ Project and semantic versioning. 
project(Caffe2 CXX C) +set(CMAKE_CXX_STANDARD 11) +if (NOT MSVC) + set(CMAKE_C_STANDARD 11) +endif() + set(CAFFE2_VERSION_MAJOR 0) set(CAFFE2_VERSION_MINOR 8) set(CAFFE2_VERSION_PATCH 2) @@ -294,6 +299,7 @@ include_directories(BEFORE ${PROJECT_BINARY_DIR}) include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/) # ---[ Main build +add_subdirectory(c10) add_subdirectory(caffe2) # --[ Documentation diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f46fa1cf62a7b..f0be7e770b97e4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -262,9 +262,9 @@ than Linux, which are worth keeping in mind when fixing these problems. 1. Symbols are NOT exported by default on Windows; instead, you have to explicitly mark a symbol as exported/imported in a header file with `__declspec(dllexport)` / `__declspec(dllimport)`. We have codified this pattern into a set of macros - which follow the convention `*_API`, e.g., `AT_API` inside ATen. (Every separate - shared library needs a unique macro name, because symbol visibility is on a per - shared library basis.) + which follow the convention `*_API`, e.g., `CAFFE2_API` inside Caffe2 and ATen. + (Every separate shared library needs a unique macro name, because symbol visibility + is on a per shared library basis. See c10/macros/Macros.h for more details.) The upshot is if you see an "unresolved external" error in your Windows build, this is probably because you forgot to mark a function with `*_API`. However, there is diff --git a/aten/src/ATen/CPUGeneral.h b/aten/src/ATen/CPUGeneral.h index b406669053dd82..04bd0aacb528f6 100644 --- a/aten/src/ATen/CPUGeneral.h +++ b/aten/src/ATen/CPUGeneral.h @@ -1,12 +1,12 @@ #pragma once -// Using AT_API is crucial as otherwise you'll see +// Using CAFFE2_API is crucial as otherwise you'll see // linking errors using MSVC // See https://msdn.microsoft.com/en-us/library/a90k134d.aspx -// This header adds this if using AT_API +// This header adds this if using CAFFE2_API #include "ATen/core/ATenGeneral.h" namespace at { -AT_API void set_num_threads(int); -AT_API int get_num_threads(); +CAFFE2_API void set_num_threads(int); +CAFFE2_API int get_num_threads(); } diff --git a/aten/src/ATen/CPUTypeDefault.h b/aten/src/ATen/CPUTypeDefault.h index c9776b7b0a2ccf..6a854c98d16e35 100644 --- a/aten/src/ATen/CPUTypeDefault.h +++ b/aten/src/ATen/CPUTypeDefault.h @@ -3,7 +3,7 @@ namespace at { -struct AT_API CPUTypeDefault : public TypeDefault { +struct CAFFE2_API CPUTypeDefault : public TypeDefault { CPUTypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) : TypeDefault(type_id, is_variable, is_undefined) {} Allocator* allocator() const override; diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 4e147cffabbe86..6a2f28cf9eb32b 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -24,8 +24,8 @@ namespace at { struct Tensor; -class AT_API Context { -public: +class CAFFE2_API Context { + public: Context(); TypeExtendedInterface* getNonVariableTypeRaw(Backend p, ScalarType s) { return static_cast(globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s)); @@ -133,7 +133,7 @@ class AT_API Context { friend struct Type; }; -AT_API Context & globalContext(); +CAFFE2_API Context& globalContext(); static inline void init() { globalContext(); @@ -153,11 +153,11 @@ static inline TypeExtendedInterface& getNonVariableType(DeviceType p, ScalarType return globalContext().getNonVariableType(deviceTypeToBackend(p), s); } -AT_API TypeExtendedInterface& getType(TensorOptions options); -AT_API 
TypeExtendedInterface& getType(const TensorImpl*); -AT_API TypeExtendedInterface& getType(const Tensor&); +CAFFE2_API TypeExtendedInterface& getType(TensorOptions options); +CAFFE2_API TypeExtendedInterface& getType(const TensorImpl*); +CAFFE2_API TypeExtendedInterface& getType(const Tensor&); -AT_API Allocator* getCPUAllocator(); +CAFFE2_API Allocator* getCPUAllocator(); static inline TypeExtendedInterface& CPU(ScalarType s) { return getNonVariableType(Backend::CPU, s); diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index 5ed9899fc5500e..d254fb568fd095 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -10,8 +10,8 @@ namespace at { -AT_API ScalarType toScalarType(const DLDataType& dtype); -AT_API DLManagedTensor * toDLPack(const Tensor& src); -AT_API Tensor fromDLPack(const DLManagedTensor* src); +CAFFE2_API ScalarType toScalarType(const DLDataType& dtype); +CAFFE2_API DLManagedTensor* toDLPack(const Tensor& src); +CAFFE2_API Tensor fromDLPack(const DLManagedTensor* src); } //namespace at diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 3453155da5b1d8..cd95271adf427a 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -9,9 +9,12 @@ namespace at { -AT_API std::vector infer_size(IntList a, IntList b); -AT_API std::tuple, std::vector > inferExpandGeometry( - IntList tensor_sizes, IntList tensor_strides, IntList sizes); +CAFFE2_API std::vector infer_size(IntList a, IntList b); +CAFFE2_API std::tuple, std::vector> +inferExpandGeometry( + IntList tensor_sizes, + IntList tensor_strides, + IntList sizes); // avoid copy-construction of Tensor by using a reference_wrapper. inline void check_defined(std::initializer_list> tensors, const char *api_name) { diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 9f9569ac06bbf3..7ffb68a4963c04 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -5,7 +5,7 @@ #include "ATen/core/Error.h" namespace at { -struct AT_API SparseTensorImpl : public TensorImpl { +struct CAFFE2_API SparseTensorImpl : public TensorImpl { // Stored in COO format, indices + values. // INVARIANTS: diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 9e7c6f6b440e57..c989d2ca8f7d0b 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -5,7 +5,7 @@ namespace at { -struct AT_API TensorGeometry { +struct CAFFE2_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} explicit TensorGeometry(IntList sizes) diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 2443bde4b482cb..f65093a586004b 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -12,7 +12,7 @@ namespace at { // make sense. These are particularly useful for native functions, // which do NO argument checking by default. -struct AT_API TensorArg { +struct CAFFE2_API TensorArg { Tensor tensor; const char* name; int pos; // 1-indexed @@ -22,7 +22,7 @@ struct AT_API TensorArg { const Tensor& operator*() const { return tensor; } }; -struct AT_API TensorGeometryArg { +struct CAFFE2_API TensorGeometryArg { TensorGeometry tensor; const char* name; int pos; // 1-indexed @@ -49,40 +49,80 @@ using CheckedFrom = const char*; // not TensorGeometryArg, because the Tensor to TensorGeometry // conversion will blow up if you have undefined tensors. 
-AT_API std::ostream& operator<<(std::ostream & out, TensorGeometryArg t); -AT_API void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim); +CAFFE2_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t); +CAFFE2_API void checkDim( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim); // NB: this is an inclusive-exclusive range -AT_API void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end); -AT_API void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); -AT_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t); -AT_API void checkAllContiguous(CheckedFrom c, at::ArrayRef ts); -AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntList sizes); -AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size); -AT_API void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel); -AT_API void checkSameNumel(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); -AT_API void checkAllSameNumel(CheckedFrom c, ArrayRef tensors); -AT_API void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType s); -AT_API void checkScalarTypes(CheckedFrom c, const TensorArg& t, at::ArrayRef l); -AT_API void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); -AT_API void checkAllSameGPU(CheckedFrom c, ArrayRef tensors); -AT_API void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); -AT_API void checkAllSameType(CheckedFrom c, ArrayRef tensors); -AT_API void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); -AT_API void checkDefined(CheckedFrom c, const TensorArg& t); -AT_API void checkAllDefined(CheckedFrom c, at::ArrayRef t); +CAFFE2_API void checkDimRange( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim_start, + int64_t dim_end); +CAFFE2_API void checkSameDim( + CheckedFrom c, + const TensorGeometryArg& t1, + const TensorGeometryArg& t2); +CAFFE2_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t); +CAFFE2_API void checkAllContiguous(CheckedFrom c, at::ArrayRef ts); +CAFFE2_API void checkSize( + CheckedFrom c, + const TensorGeometryArg& t, + IntList sizes); +CAFFE2_API void checkSize( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim, + int64_t size); +CAFFE2_API void checkNumel( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t numel); +CAFFE2_API void checkSameNumel( + CheckedFrom c, + const TensorGeometryArg& t1, + const TensorGeometryArg& t2); +CAFFE2_API void checkAllSameNumel(CheckedFrom c, ArrayRef tensors); +CAFFE2_API void checkScalarType( + CheckedFrom c, + const TensorArg& t, + ScalarType s); +CAFFE2_API void checkScalarTypes( + CheckedFrom c, + const TensorArg& t, + at::ArrayRef l); +CAFFE2_API void checkSameGPU( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +CAFFE2_API void checkAllSameGPU(CheckedFrom c, ArrayRef tensors); +CAFFE2_API void checkSameType( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +CAFFE2_API void checkAllSameType(CheckedFrom c, ArrayRef tensors); +CAFFE2_API void checkSameSize( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +CAFFE2_API void checkDefined(CheckedFrom c, const TensorArg& t); +CAFFE2_API void checkAllDefined(CheckedFrom c, at::ArrayRef t); // FixMe: does TensorArg slow things down? 
-AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend backend); +CAFFE2_API void checkBackend( + CheckedFrom c, + at::ArrayRef t, + at::Backend backend); // Methods for getting data_ptr if tensor is defined -AT_API void * maybe_data_ptr(const Tensor& tensor); -AT_API void * maybe_data_ptr(const TensorArg& tensor); +CAFFE2_API void* maybe_data_ptr(const Tensor& tensor); +CAFFE2_API void* maybe_data_ptr(const TensorArg& tensor); // Return if the tensor geometry represented by `sizes` and `strides` is contiguous // Although we cache is_contiguous in tensor now, this is till useful because it // allows checking if a particular geometry is contiguous without explicitly // constructing a tensor, e.g., when you want to choose a kernel strategy based // on whether a subgeometry is contiguous. -AT_API bool geometry_is_contiguous(IntList sizes, IntList strides); - +CAFFE2_API bool geometry_is_contiguous(IntList sizes, IntList strides); } diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index c4473d1471ab7d..21ade98cba79c8 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -24,7 +24,7 @@ namespace at { -AT_API int _crash_if_asan(int); +CAFFE2_API int _crash_if_asan(int); static inline const Storage& checked_storage( const Storage& expr, diff --git a/aten/src/ATen/core/ATenGeneral.h b/aten/src/ATen/core/ATenGeneral.h index cbc1d6f13692f2..78cfe5fe4d9f1b 100644 --- a/aten/src/ATen/core/ATenGeneral.h +++ b/aten/src/ATen/core/ATenGeneral.h @@ -3,6 +3,5 @@ #include "ATen/core/Macros.h" // TODO: Merge the *_API macros. -#define AT_API AT_CORE_API #define AT_EXPORT AT_CORE_EXPORT #define AT_IMPORT AT_CORE_IMPORT diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h index c6ac26b8a9e0e3..4906770271f274 100644 --- a/aten/src/ATen/core/Formatting.h +++ b/aten/src/ATen/core/Formatting.h @@ -8,10 +8,13 @@ namespace at { -AT_API std::ostream& operator<<(std::ostream & out, IntList list); -AT_API std::ostream& operator<<(std::ostream & out, Backend b); -AT_API std::ostream& operator<<(std::ostream & out, const Type & t); -AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +CAFFE2_API std::ostream& operator<<(std::ostream& out, IntList list); +CAFFE2_API std::ostream& operator<<(std::ostream& out, Backend b); +CAFFE2_API std::ostream& operator<<(std::ostream& out, const Type& t); +CAFFE2_API std::ostream& print( + std::ostream& stream, + const Tensor& tensor, + int64_t linesize); static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { return print(out,t,80); } diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index b8894c4307b043..fce3d35636c274 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -5,7 +5,7 @@ namespace at { -struct AT_API Generator { +struct CAFFE2_API Generator { Generator() {}; Generator(const Generator& other) = delete; Generator(Generator&& other) = delete; diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 244124475bc08f..cb48b68782ab03 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -3,41 +3,7 @@ #include #include -// You can use the definition AT_CORE_STATIC_WINDOWS to control whether -// or not we apply __declspec. 
You will want to set this as -// -DAT_CORE_STATIC_WINDOWS=1 when compiling code which links -// against ATen/core on Windows, when ATen/core is built as a -// static library (in which case, saying the symbol is coming -// from a DLL would be incorrect). - -#ifdef _WIN32 -#if !defined(AT_CORE_STATIC_WINDOWS) -#define AT_CORE_EXPORT __declspec(dllexport) -#define AT_CORE_IMPORT __declspec(dllimport) -#else // !defined(AT_CORE_STATIC_WINDOWS) -#define AT_CORE_EXPORT -#define AT_CORE_IMPORT -#endif // !defined(AT_CORE_STATIC_WINDOWS) -#else // _WIN32 -#if defined(__GNUC__) -#define AT_CORE_EXPORT __attribute__((__visibility__("default"))) -#else // defined(__GNUC__) -#define AT_CORE_EXPORT -#endif // defined(__GNUC__) -#define AT_CORE_IMPORT AT_CORE_EXPORT -#endif // _WIN32 - -// AT_CORE_API is a macro that, depends on whether you are building the -// main library or not, resolves to either AT_CORE_EXPORT or -// AT_CORE_IMPORT. -// - -// TODO: unify the controlling macros. -#if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) -#define AT_CORE_API AT_CORE_EXPORT -#else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) -#define AT_CORE_API AT_CORE_IMPORT -#endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#include "c10/macros/Macros.h" #ifdef __CUDACC__ // Designates functions callable from the host (CPU) and the device (GPU) @@ -50,13 +16,6 @@ #define AT_DEVICE #endif -// Disable the copy and assignment operator for a class. Note that this will -// disable the usage of the class in std containers. -#define AT_DISABLE_COPY_AND_ASSIGN(classname) \ - classname(const classname&) = delete; \ - classname& operator=(const classname&) = delete - - #if defined(__ANDROID__) #define AT_ANDROID 1 #define AT_MOBILE 1 diff --git a/aten/src/ATen/core/OptionsGuard.h b/aten/src/ATen/core/OptionsGuard.h index b359638d53a61a..fc078db6bf90b5 100644 --- a/aten/src/ATen/core/OptionsGuard.h +++ b/aten/src/ATen/core/OptionsGuard.h @@ -20,7 +20,7 @@ struct DefaultTensorOptions { /// Defined in OptionsGuard.cpp because we can't use optional in headers, due /// to Windows and other compilers. /// TODO: The inability to use optional in headers is no longer true - AT_API static TensorOptions& get(); + CAFFE2_API static TensorOptions& get(); private: /// This is an optional because of compiler bugs that mis-initialize static @@ -64,8 +64,9 @@ struct OptionsGuard { #else // AT_MOBILE struct DefaultTensorOptions { - AT_API static const TensorOptions& get(); -private: + CAFFE2_API static const TensorOptions& get(); + + private: static TensorOptions options_; }; diff --git a/aten/src/ATen/core/Registry.h b/aten/src/ATen/core/Registry.h index 8f3caffe491542..98a3e4a18c7258 100644 --- a/aten/src/ATen/core/Registry.h +++ b/aten/src/ATen/core/Registry.h @@ -44,7 +44,7 @@ inline void PrintOffendingKey(const std::string& key) { * objects. */ template -class AT_API Registry { +class CAFFE2_API Registry { public: typedef std::function Creator; @@ -114,7 +114,7 @@ class AT_API Registry { }; template -class AT_API Registerer { +class CAFFE2_API Registerer { public: Registerer( const SrcType& key, @@ -152,11 +152,12 @@ class AT_API Registerer { * declaration, as well as creating a convenient typename for its corresponding * registerer. */ -#define AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) 
\ - AT_API Registry, __VA_ARGS__>* RegistryName(); \ - typedef Registerer, __VA_ARGS__> \ - Registerer##RegistryName; \ +#define AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + CAFFE2_API Registry, __VA_ARGS__>* \ + RegistryName(); \ + typedef Registerer, __VA_ARGS__> \ + Registerer##RegistryName; \ extern template class Registerer, __VA_ARGS__>; #define AT_DEFINE_TYPED_REGISTRY( \ diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index de01a56ce33748..45b99fdb34cd7e 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -14,8 +14,8 @@ namespace at { struct Tensor; -class AT_API Scalar { -public: +class CAFFE2_API Scalar { + public: Scalar() : Scalar(int64_t(0)) {} #define DEFINE_IMPLICIT_CTOR(type,name,member) \ diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index ab201be88d630e..cd42b33d12e2b0 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -4,8 +4,8 @@ namespace at { -struct AT_API Storage { -public: +struct CAFFE2_API Storage { + public: Storage() {} Storage(c10::intrusive_ptr ptr) : storage_impl_(std::move(ptr)) {} Storage( diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index cc63bd00906669..bba2df4e0d1bec 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -10,7 +10,7 @@ namespace at { struct Type; -struct AT_API StorageImpl : public c10::intrusive_ptr_target { +struct CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl( caffe2::TypeMeta data_type, diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index 39d12e7d6499ba..fca7fe3189f019 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -37,7 +37,7 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. 
-struct AT_API Tensor { +struct CAFFE2_API Tensor { Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) : tensor_impl_(std::move(tensor_impl)) { @@ -648,7 +648,7 @@ struct AT_API Tensor { c10::intrusive_ptr tensor_impl_; }; -struct AT_API WeakTensor { +struct CAFFE2_API WeakTensor { WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} // XXX: this can return undefined tensors diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index d2f98ff52780f8..0c257cecdd80fb 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -20,7 +20,7 @@ struct Tensor; } // namespace at namespace at { -struct AT_API TensorImpl : public c10::intrusive_ptr_target { +struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); diff --git a/aten/src/ATen/core/TensorOptions.h b/aten/src/ATen/core/TensorOptions.h index 2b589e9b13f481..4ae7b3452bddf1 100644 --- a/aten/src/ATen/core/TensorOptions.h +++ b/aten/src/ATen/core/TensorOptions.h @@ -47,7 +47,7 @@ namespace at { /// at::zeros({2,2}, at::device({at::kCUDA, 1})); // place on device 1 /// at::zeros({2,2}, at::requires_grad()); /// -struct AT_API TensorOptions { +struct CAFFE2_API TensorOptions { TensorOptions() : TensorOptions(/*use_thread_local_default_options=*/true) {} /// Constructs the `TensorOptions` with defaults taken from the thread local diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 2d19e0de588416..ee40e616f00236 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -76,7 +76,7 @@ enum class TypeID { NumOptions }; -struct AT_API Type { +struct CAFFE2_API Type { explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} @@ -613,7 +613,6 @@ struct AT_API Type { TensorTypeId type_id_; bool is_variable_; bool is_undefined_; - }; } // namespace at diff --git a/aten/src/ATen/core/UndefinedTensorImpl.h b/aten/src/ATen/core/UndefinedTensorImpl.h index 6c734950d90cad..7a6866187c5f21 100644 --- a/aten/src/ATen/core/UndefinedTensorImpl.h +++ b/aten/src/ATen/core/UndefinedTensorImpl.h @@ -4,8 +4,8 @@ namespace at { -struct AT_API UndefinedTensorImpl final : public TensorImpl { -public: +struct CAFFE2_API UndefinedTensorImpl final : public TensorImpl { + public: // Without this, we get: // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in device code // (ostensibly because the constexpr tricks MSVC into trying to compile this diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index 09c972255ea6f5..e8fd4da9e27536 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -20,8 +20,7 @@ namespace at { // // We may choose to absorb autograd into ATen, in which case this interface is obsolete. 
// -struct AT_API VariableHooksInterface { - +struct CAFFE2_API VariableHooksInterface { // This should never actually be implemented, but it is used to // squelch -Werror=non-virtual-dtor virtual ~VariableHooksInterface() {} @@ -34,18 +33,17 @@ struct AT_API VariableHooksInterface { // no-op if Variable not available; it'll get handled (if at all) when // libtorch.so gets loaded } - }; // NB: dummy argument to suppress "ISO C++11 requires at least one argument // for the "..." in a variadic macro" -struct AT_API VariableHooksArgs {}; +struct CAFFE2_API VariableHooksArgs {}; AT_DECLARE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) #define REGISTER_VARIABLE_HOOKS(clsname) AT_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) namespace detail { - AT_API const VariableHooksInterface& getVariableHooks(); +CAFFE2_API const VariableHooksInterface& getVariableHooks(); } } // namespace at diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 0a653ba0a12379..45b38387b46ca6 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -192,9 +192,9 @@ using at::BaseContext; using at::BaseStaticContext; using StaticContextMap = std::unordered_map; -AT_API StaticContextMap& GetStaticContexts(); -AT_API void set_static_context(at::DeviceType t, BaseStaticContext* ptr); -AT_API BaseStaticContext* get_static_context(at::DeviceType t); +CAFFE2_API StaticContextMap& GetStaticContexts(); +CAFFE2_API void set_static_context(at::DeviceType t, BaseStaticContext* ptr); +CAFFE2_API BaseStaticContext* get_static_context(at::DeviceType t); template struct StaticContextFunctionRegisterer { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 3d2b56893e7188..9e69f70d025861 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -6,7 +6,8 @@ namespace torch { namespace jit { -AT_API c10::intrusive_ptr ConstantString::create(std::string str_) { +CAFFE2_API c10::intrusive_ptr ConstantString::create( + std::string str_) { return c10::make_intrusive(std::move(str_)); } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 914598f6ceb426..ef88c8c746093c 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -14,7 +14,7 @@ template using Shared = c10::intrusive_ptr; // string -struct AT_API ConstantString final : c10::intrusive_ptr_target { +struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { private: const std::string str_; public: @@ -27,7 +27,7 @@ struct AT_API ConstantString final : c10::intrusive_ptr_target { operator const std::string & () const { return string(); } - AT_API friend std::ostream& operator<<( + CAFFE2_API friend std::ostream& operator<<( std::ostream& out, const ConstantString& v); }; @@ -67,7 +67,7 @@ using DoubleList = ConstantList; #define TORCH_FORALL_TAGS(_) \ _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) -struct AT_API IValue final { +struct CAFFE2_API IValue final { IValue() : payload{0} , tag(Tag::None) @@ -277,7 +277,9 @@ struct AT_API IValue final { template T to() const &; - AT_API friend std::ostream& operator<<(std::ostream& out, const IValue& v); + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const IValue& v); private: // NOTE: IValue tags are intentionally private. 
In the future we may encode diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h index 3a75483aa3e700..564a918d943c01 100644 --- a/aten/src/ATen/cuda/CUDAContext.h +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -35,32 +35,31 @@ manage their own state. There is only a single CUDA context/state. */ /* Device info */ -AT_API int64_t getNumGPUs(); +CAFFE2_API int64_t getNumGPUs(); -AT_API int64_t current_device(); +CAFFE2_API int64_t current_device(); -AT_API void set_device(int64_t device); +CAFFE2_API void set_device(int64_t device); -AT_API cudaDeviceProp* getCurrentDeviceProperties(); +CAFFE2_API cudaDeviceProp* getCurrentDeviceProperties(); -AT_API cudaDeviceProp* getDeviceProperties(int64_t device); +CAFFE2_API cudaDeviceProp* getDeviceProperties(int64_t device); /* Streams */ -AT_API CUDAStream createCUDAStream( - const bool isHighPriority = false -, int64_t device = -1); +CAFFE2_API CUDAStream +createCUDAStream(const bool isHighPriority = false, int64_t device = -1); -AT_API CUDAStream getDefaultCUDAStream(int64_t device = -1); -AT_API CUDAStream getCurrentCUDAStream(int64_t device = -1); +CAFFE2_API CUDAStream getDefaultCUDAStream(int64_t device = -1); +CAFFE2_API CUDAStream getCurrentCUDAStream(int64_t device = -1); -AT_API void setCurrentCUDAStream(CUDAStream stream); -AT_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); +CAFFE2_API void setCurrentCUDAStream(CUDAStream stream); +CAFFE2_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); -AT_API Allocator* getCUDADeviceAllocator(); +CAFFE2_API Allocator* getCUDADeviceAllocator(); /* Handles */ #ifndef __HIP_PLATFORM_HCC__ - AT_API cusparseHandle_t getCurrentCUDASparseHandle(); +CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle(); #endif diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 4e60ee1597cc41..69149932ac7b98 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -47,7 +47,7 @@ constexpr const char* CUDA_HELP = // TODO: Consider putting the stub definitions in another class, so that one // never forgets to implement each virtual function in the real implementation // in CUDAHooks. This probably doesn't buy us much though. -struct AT_API CUDAHooksInterface { +struct CAFFE2_API CUDAHooksInterface { // This should never actually be implemented, but it is used to // squelch -Werror=non-virtual-dtor virtual ~CUDAHooksInterface() {} @@ -129,14 +129,14 @@ struct AT_API CUDAHooksInterface { // NB: dummy argument to suppress "ISO C++11 requires at least one argument // for the "..." in a variadic macro" -struct AT_API CUDAHooksArgs {}; +struct CAFFE2_API CUDAHooksArgs {}; AT_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) #define REGISTER_CUDA_HOOKS(clsname) \ AT_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) namespace detail { -AT_API const CUDAHooksInterface& getCUDAHooks(); +CAFFE2_API const CUDAHooksInterface& getCUDAHooks(); /// This class exists to let us access `cudaSetDevice`, `cudaGetDevice` and CUDA /// error handling functions, when CUDA is available. These functions will first @@ -144,7 +144,7 @@ AT_API const CUDAHooksInterface& getCUDAHooks(); /// the `cudaSetDevice`/`cudaGetDevice` functions. This allows us to access them /// with only a single pointer indirection, while virtual dispatch would require /// two (one for the virtual call, one for `cudaSetDevice`/`cudaGetDevice`). 
-struct AT_API DynamicCUDAInterface { +struct CAFFE2_API DynamicCUDAInterface { static void (*set_device)(int32_t); static void (*get_device)(int32_t*); static void (*unchecked_set_device)(int32_t); diff --git a/aten/src/ATen/detail/ComplexHooksInterface.h b/aten/src/ATen/detail/ComplexHooksInterface.h index 80ecfb6f26f833..e5d5c3ec2a83fa 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.h +++ b/aten/src/ATen/detail/ComplexHooksInterface.h @@ -7,7 +7,7 @@ namespace at { class Context; -struct AT_API ComplexHooksInterface { +struct CAFFE2_API ComplexHooksInterface { virtual ~ComplexHooksInterface() {} virtual void registerComplexTypes(Context*) const { @@ -15,13 +15,13 @@ struct AT_API ComplexHooksInterface { } }; -struct AT_API ComplexHooksArgs {}; +struct CAFFE2_API ComplexHooksArgs {}; AT_DECLARE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) #define REGISTER_COMPLEX_HOOKS(clsname) \ AT_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) namespace detail { -AT_API const ComplexHooksInterface& getComplexHooks(); +CAFFE2_API const ComplexHooksInterface& getComplexHooks(); } } diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 323701d69e8376..189cadf0b6d1c6 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -154,7 +154,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add a native declaration for a native function NATIVE_DECLARATION = CodeTemplate("""\ -AT_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); +CAFFE2_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); """) # special method definition for factory functions in Functions.h diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 42ef6a4f6bb5f1..c803ecd3f353b5 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -49,10 +49,10 @@ enum class CPUCapability { CPUCapability get_cpu_capability(); template -struct AT_API DispatchStub; +struct CAFFE2_API DispatchStub; template -struct AT_API DispatchStub { +struct CAFFE2_API DispatchStub { using FnPtr = rT (*) (Args...); template @@ -114,9 +114,9 @@ struct RegisterDispatch { // adding parentheses and using helper struct to get rid of the parentheses, do // not work with MSVC. So do a `using`-declaration if you need to pass in such // `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. -#define DECLARE_DISPATCH(fn, name) \ +#define DECLARE_DISPATCH(fn, name) \ struct name : DispatchStub {}; \ - extern AT_API struct name name + extern CAFFE2_API struct name name #define DEFINE_DISPATCH(name) struct name name diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 3faedbec6bb320..7d97d7f7f6635b 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -50,7 +50,7 @@ namespace at { -struct AT_API OperandInfo { +struct CAFFE2_API OperandInfo { OperandInfo() {} OperandInfo(const Tensor& t) : tensor(const_cast(&t)) {} @@ -82,7 +82,7 @@ struct AT_API OperandInfo { struct SplitUntil32Bit; -struct AT_API TensorIterator { +struct CAFFE2_API TensorIterator { struct Builder; friend struct Builder; @@ -212,8 +212,8 @@ struct TensorIterator::Builder { /// A container-like struct that acts as if it contains splits of a /// TensorIterator that can use 32-bit indexing. Taken together the splits cover /// the original TensorIterator. 
-struct AT_API SplitUntil32Bit { - struct AT_API iterator { +struct CAFFE2_API SplitUntil32Bit { + struct CAFFE2_API iterator { iterator() {}; iterator(const TensorIterator& iter); iterator(iterator&&) = default; diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 4190e2fcfd1d98..0f7b8ba27ef9e1 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -48,23 +48,23 @@ inline Tensor from_blob( } // These functions are defined in native/TensorFactories.cpp. -#define TENSOR(T, S, _1) \ - AT_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ - inline Tensor tensor( \ - std::initializer_list values, const TensorOptions& options) { \ - return native::tensor(ArrayRef(values), options); \ - } \ - inline Tensor tensor(T value, const TensorOptions& options) { \ - return native::tensor(ArrayRef(value), options); \ - } \ - inline Tensor tensor(ArrayRef values) { \ - return native::tensor(std::move(values), at::dtype(k##S)); \ - } \ - inline Tensor tensor(std::initializer_list values) { \ - return native::tensor(ArrayRef(values)); \ - } \ - inline Tensor tensor(T value) { \ - return native::tensor(ArrayRef(value)); \ +#define TENSOR(T, S, _1) \ + CAFFE2_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ + inline Tensor tensor( \ + std::initializer_list values, const TensorOptions& options) { \ + return native::tensor(ArrayRef(values), options); \ + } \ + inline Tensor tensor(T value, const TensorOptions& options) { \ + return native::tensor(ArrayRef(value), options); \ + } \ + inline Tensor tensor(ArrayRef values) { \ + return native::tensor(std::move(values), at::dtype(k##S)); \ + } \ + inline Tensor tensor(std::initializer_list values) { \ + return native::tensor(ArrayRef(values)); \ + } \ + inline Tensor tensor(T value) { \ + return native::tensor(ArrayRef(value)); \ } AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(TENSOR) #undef TENSOR diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 73fdcf4ecb6d9d..2e5fb25f597418 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -37,7 +37,7 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. 
-struct AT_API Tensor { +struct CAFFE2_API Tensor { Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) : tensor_impl_(std::move(tensor_impl)) { @@ -262,7 +262,7 @@ struct AT_API Tensor { c10::intrusive_ptr tensor_impl_; }; -struct AT_API WeakTensor { +struct CAFFE2_API WeakTensor { WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} // XXX: this can return undefined tensors diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 0e00a5d3499fcd..2db006c82d5834 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -47,7 +47,7 @@ enum class TypeID { NumOptions }; -struct AT_API Type { +struct CAFFE2_API Type { explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} @@ -140,7 +140,6 @@ struct AT_API Type { TensorTypeId type_id_; bool is_variable_; bool is_undefined_; - }; } // namespace at diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index e4a75abb48993e..73c1f0f1d27cdd 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -6,7 +6,7 @@ namespace at { -struct AT_API TypeDefault : public TypeExtendedInterface { +struct CAFFE2_API TypeDefault : public TypeExtendedInterface { explicit TypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) : TypeExtendedInterface(type_id, is_variable, is_undefined) {} diff --git a/aten/src/ATen/templates/TypeExtendedInterface.h b/aten/src/ATen/templates/TypeExtendedInterface.h index 82cb658c9eeea8..03af27f146b66f 100644 --- a/aten/src/ATen/templates/TypeExtendedInterface.h +++ b/aten/src/ATen/templates/TypeExtendedInterface.h @@ -3,7 +3,7 @@ namespace at { -struct AT_API TypeExtendedInterface : public Type { +struct CAFFE2_API TypeExtendedInterface : public Type { explicit TypeExtendedInterface(TensorTypeId type_id, bool is_variable, bool is_undefined) : Type(type_id, is_variable, is_undefined) {} ${pure_virtual_extended_type_method_declarations} diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h index 578d689400baf4..4cb316adc01744 100644 --- a/aten/src/TH/THAllocator.h +++ b/aten/src/TH/THAllocator.h @@ -32,8 +32,8 @@ TH_API THAllocator* getTHDefaultAllocator(void); // the non-file descriptor constructor enum WithFd { WITH_FD }; -class AT_API THMapAllocator { -public: +class CAFFE2_API THMapAllocator { + public: THMapAllocator(const char *filename, int flags, size_t size); THMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); THMapAllocator(const THMapAllocator&) = delete; @@ -82,12 +82,14 @@ class AT_API THMapAllocator { }; // Base-from-member idiom -struct AT_API THRefcountedMapAllocatorArgCheck { +struct CAFFE2_API THRefcountedMapAllocatorArgCheck { THRefcountedMapAllocatorArgCheck(int flags); }; -class AT_API THRefcountedMapAllocator : private THRefcountedMapAllocatorArgCheck, public THMapAllocator { -public: +class CAFFE2_API THRefcountedMapAllocator + : private THRefcountedMapAllocatorArgCheck, + public THMapAllocator { + public: THRefcountedMapAllocator(const char *filename, int flags, size_t size); THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h index 8dadcc034b2c2f..323f745a4ac30f 100644 --- a/aten/src/THC/THCAllocator.h +++ b/aten/src/THC/THCAllocator.h @@ -7,8 +7,8 @@ THC_API THAllocator* getTHCudaHostAllocator(void); // IPC doesn't 
support (re)allocation #ifdef __cplusplus -class AT_API THCIpcDeleter { -public: +class CAFFE2_API THCIpcDeleter { + public: THCIpcDeleter(void* data, int device) : data_(data), device_(device) {}; ~THCIpcDeleter(); static at::DataPtr makeDataPtr(void* data, int device); diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt new file mode 100644 index 00000000000000..9d9c714e5b0377 --- /dev/null +++ b/c10/CMakeLists.txt @@ -0,0 +1,38 @@ +# Main build file for the C10 library. +# +# Note that the C10 library should maintain minimal dependencies - especially, +# it should not depend on any library that is implementation specific or +# backend specific. It should in particular NOT be dependent on any generated +# protobuf header files, because protobuf header files will transitively force +# one to link against a specific protobuf version. + +# ---[ Configure macro file. +set(C10_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in cmake_macros.h.in +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/macros/cmake_macros.h.in + ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h) + +# Note: if you want to add ANY dependency to the c10 library, make sure you +# check with the core PyTorch developers as the dependendency will be +# transitively passed on to all libraries dependent on PyTorch. +file(GLOB_RECURSE C10_SRCS *.cpp) +file(GLOB_RECURSE C10_HEADERS *.h) +add_library(c10 ${C10_SRCS} ${C10_HEADERS}) +# If building shared library, set dllimport/dllexport proper. +target_compile_options(c10 PRIVATE "-DC10_BUILD_MAIN_LIB") +target_include_directories( + c10 PUBLIC + $ + $ + $) + +# ---[ Installation +# Note: for now, we will put all export path into one single Caffe2Targets group +# to deal with the cmake deployment need. Inside the Caffe2Targets set, the +# individual libraries like libc10.so and libcaffe2.so are still self-contained. +install(TARGETS c10 EXPORT Caffe2Targets DESTINATION lib) +install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + DESTINATION include + FILES_MATCHING PATTERN "*.h") +install(FILES ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h + DESTINATION include/c10/macros) diff --git a/c10/c10_dummy.cpp b/c10/c10_dummy.cpp new file mode 100644 index 00000000000000..df4e73171da3ff --- /dev/null +++ b/c10/c10_dummy.cpp @@ -0,0 +1,7 @@ +#include "c10/c10_dummy.h" + +namespace c10 { +bool HasC10() { + return true; +} +} // namespace c10 diff --git a/c10/c10_dummy.h b/c10/c10_dummy.h new file mode 100644 index 00000000000000..cf6c6b30c14bbf --- /dev/null +++ b/c10/c10_dummy.h @@ -0,0 +1,7 @@ +#pragma once + +#include "c10/macros/Macros.h" + +namespace c10 { +C10_API bool HasC10(); +} diff --git a/c10/macros/Export.h b/c10/macros/Export.h new file mode 100644 index 00000000000000..8e593e0100bbf9 --- /dev/null +++ b/c10/macros/Export.h @@ -0,0 +1,76 @@ +/* Header file to define the common scaffolding for exported symbols. + * + * Export is by itself a quite tricky situation to deal with, and if you are + * hitting this file, make sure you start with the background here: + * - Linux: https://gcc.gnu.org/wiki/Visibility + * - Windows: + * https://docs.microsoft.com/en-us/cpp/cpp/dllexport-dllimport?view=vs-2017 + * + * Do NOT include this file directly. Instead, use c10/macros/Macros.h + */ + +#pragma once + +// You do not need to edit this part of file unless you are changing the core +// pytorch export abstractions. +// +// This part defines the C10 core export and import macros. 
This is controlled +// by whether we are building shared libraries or not, which is determined +// during build time and codified in c10/core/cmake_macros.h. +// When the library is built as a shared lib, EXPORT and IMPORT will contain +// visibility attributes. If it is being built as a static lib, then EXPORT +// and IMPORT basically have no effect. + +// As a rule of thumb, you should almost NEVER mix static and shared builds for +// libraries that depend on c10. AKA, if c10 is built as a static library, we +// recommend everything dependent on c10 to be built statically. If c10 is built +// as a shared library, everything dependent on it should be built as shared. In +// the PyTorch project, all native libraries shall use the macro +// C10_BUILD_SHARED_LIB to check whether pytorch is building shared or static +// libraries. + +#ifdef _WIN32 +#if defined(C10_BUILD_SHARED_LIBS) +#define C10_EXPORT __declspec(dllexport) +#define C10_IMPORT __declspec(dllimport) +#else +#define C10_EXPORT +#define C10_IMPORT +#endif +#else // _WIN32 +#if defined(__GNUC__) +#define C10_EXPORT __attribute__((__visibility__("default"))) +#else // defined(__GNUC__) +#define C10_EXPORT +#endif // defined(__GNUC__) +#define C10_IMPORT C10_EXPORT +#endif // _WIN32 + +// Definition of an adaptive XX_API macro, that depends on whether you are +// building the library itself or not, routes to XX_EXPORT and XX_IMPORT. +// Basically, you will need to do this for each shared library that you are +// building, and the instruction is as follows: assuming that you are building +// a library called libawesome.so. You should: +// (1) for your cmake target (usually done by "add_library(awesome, ...)"), +// define a macro called AWESOME_BUILD_MAIN_DLL using +// target_compile_options. +// (2) define the AWESOME_API macro similar to the one below. +// And in the source file of your awesome library, use AWESOME_API to +// annotate public symbols. + +// Here, for the C10 library, we will define the macro C10_API for both import +// and export. + +// This one is being used by libc10.so +#ifdef C10_BUILD_MAIN_DLL +#define C10_API C10_EXPORT +#else +#define C10_API C10_IMPORT +#endif + +// This one is being used by libcaffe2.so +#ifdef CAFFE2_BUILD_MAIN_LIB +#define CAFFE2_API C10_EXPORT +#else +#define CAFFE2_API C10_IMPORT +#endif diff --git a/c10/macros/Legacy.h b/c10/macros/Legacy.h new file mode 100644 index 00000000000000..eb17bdb7940dc5 --- /dev/null +++ b/c10/macros/Legacy.h @@ -0,0 +1,20 @@ +/* A centralized location to provide legacy macro support, and a warning about + * when this legacy compatibility symbol is going to removed in the future. + * + * Do NOT include this file directly. Instead, use c10/macros/Macros.h + */ + +#pragma once + +// Note: this is for caffe2/*. Will need to codemod to use direct C10. +#define CAFFE2_EXPORT C10_EXPORT +#define CAFFE2_IMPORT C10_IMPORT + +// Note: this is for aten/src/*. Will need to codemod. +#define AT_CORE_API CAFFE2_API +#define AT_CORE_EXPORT C10_EXPORT +#define AT_CORE_IMPORT C10_IMPORT + +// Note: this is for both aten and c2, due to cross reference between c2 and +// aten that we try to unentangle. Will need to codemod. +#define AT_DISABLE_COPY_AND_ASSIGN C10_DISABLE_COPY_AND_ASSIGN diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h new file mode 100644 index 00000000000000..2b438d670f00de --- /dev/null +++ b/c10/macros/Macros.h @@ -0,0 +1,32 @@ +/* Main entry for c10/macros. 
+ * + * In your code, include c10/macros/Macros.h directly, instead of individual + * files in this folder. + */ + +#pragma once + +// For build systems that do not directly depend on CMake and directly build +// from the source directory (such as Buck), one may not have a cmake_macros.h +// file at all. In this case, the build system is responsible for providing +// correct macro definitions corresponding to the cmake_macros.h.in file. +// +// In such scenarios, one should define the macro +// C10_USING_CUSTOM_GENERATED_MACROS +// to inform this header that it does not need to include the cmake_macros.h +// file. + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include "c10/macros/cmake_macros.h" +#endif // C10_USING_CUSTOM_GENERATED_MACROS + +#include "c10/macros/Export.h" + +// Disable the copy and assignment operator for a class. Note that this will +// disable the usage of the class in std containers. +#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ + classname(const classname&) = delete; \ + classname& operator=(const classname&) = delete + +// Finally, file that provides legacy support for macros +#include "c10/macros/Legacy.h" diff --git a/c10/macros/cmake_macros.h.in b/c10/macros/cmake_macros.h.in new file mode 100644 index 00000000000000..73bc803f063551 --- /dev/null +++ b/c10/macros/cmake_macros.h.in @@ -0,0 +1,6 @@ +// Automatically generated header file for the C10 library. +// Do not include this file directly. Instead, include c10/macros/Macros.h. + +#pragma once + +#cmakedefine C10_BUILD_SHARED_LIBS diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 19aa3fbbc27539..885ca028fb2464 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -194,7 +194,6 @@ target_include_directories(caffe2_protos INTERFACE $) target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf) # Compile exposed libraries. -list(APPEND Caffe2_CPU_SRCs $) add_library(caffe2 ${Caffe2_CPU_SRCS}) if (NOT WIN32) target_compile_options(caffe2 PRIVATE "-fvisibility=hidden") @@ -206,6 +205,7 @@ if (${CAFFE2_LINK_LOCAL_PROTOBUF}) else() target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) endif() +target_link_libraries(caffe2 PUBLIC c10) target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) @@ -333,7 +333,7 @@ if(USE_CUDA) # NB: This must be target_compile_definitions, not target_compile_options, # as the latter is not respected by nvcc if (MSVC) - target_compile_definitions(caffe2_gpu PRIVATE "-DCAFFE2_CUDA_BUILD_MAIN_LIB") + target_compile_definitions(caffe2_gpu PRIVATE "-DCAFFE2_CUDA_BUILD_MAIN_LIB") endif() # Set standard properties on the target diff --git a/caffe2/core/common.h b/caffe2/core/common.h index a5d4cf60b603c0..2582a605adee55 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -26,7 +26,7 @@ // is automatically generated by the cmake script during build. #include "caffe2/core/macros.h" -#include "ATen/core/Macros.h" +#include "c10/macros/Macros.h" namespace caffe2 { @@ -90,48 +90,6 @@ using std::vector; #define CAFFE2_NORETURN __attribute__((noreturn)) #endif -// Defines CAFFE2_EXPORT and CAFFE2_IMPORT. On Windows, this corresponds to -// different declarations (dllexport and dllimport). On Linux/Mac, it just -// resolves to the same "default visibility" setting. 
-#if defined(_MSC_VER) -#if defined(CAFFE2_BUILD_SHARED_LIBS) -#define CAFFE2_EXPORT __declspec(dllexport) -#define CAFFE2_IMPORT __declspec(dllimport) -#else -#define CAFFE2_EXPORT -#define CAFFE2_IMPORT -#endif -#else -#if defined(__GNUC__) -#define CAFFE2_EXPORT __attribute__((__visibility__("default"))) -#else -#define CAFFE2_EXPORT -#endif -#define CAFFE2_IMPORT CAFFE2_EXPORT -#endif - -// CAFFE2_API is a macro that, depends on whether you are building the -// main caffe2 library or not, resolves to either CAFFE2_EXPORT or -// CAFFE2_IMPORT. -// -// This is used in e.g. Caffe2's protobuf files: when building the main library, -// it is defined as CAFFE2_EXPORT to fix a Windows global-variable-in-dll -// issue, and for anyone dependent on Caffe2 it will be defined as -// CAFFE2_IMPORT. - -#ifdef CAFFE2_BUILD_MAIN_LIB -#define CAFFE2_API CAFFE2_EXPORT -#else -#define CAFFE2_API CAFFE2_IMPORT -#endif - -#ifdef CAFFE2_BUILD_OBSERVER_LIB -#define CAFFE2_OBSERVER_API CAFFE2_EXPORT -#else -#define CAFFE2_OBSERVER_API CAFFE2_IMPORT -#endif - - #if defined(_MSC_VER) #define NOMINMAX #endif diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h index 37fcd939c4d61e..288c34afd5dbe7 100644 --- a/caffe2/core/logging.h +++ b/caffe2/core/logging.h @@ -8,6 +8,7 @@ #include #include +#include "caffe2/core/common.h" #include "caffe2/core/flags.h" // CAFFE2_LOG_THRESHOLD is a compile time flag that would allow us to turn off diff --git a/caffe2/perfkernels/CMakeLists.txt b/caffe2/perfkernels/CMakeLists.txt index 3781bbb6afb6b0..a5701da807f4f7 100644 --- a/caffe2/perfkernels/CMakeLists.txt +++ b/caffe2/perfkernels/CMakeLists.txt @@ -17,8 +17,8 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${common_srcs}) if (NOT MSVC AND CAFFE2_COMPILER_SUPPORTS_AVX2_EXTENSIONS) add_library(Caffe2_perfkernels_avx OBJECT ${avx_srcs}) add_library(Caffe2_perfkernels_avx2 OBJECT ${avx2_srcs}) - add_dependencies(Caffe2_perfkernels_avx Caffe2_PROTO) - add_dependencies(Caffe2_perfkernels_avx2 Caffe2_PROTO) + add_dependencies(Caffe2_perfkernels_avx Caffe2_PROTO c10) + add_dependencies(Caffe2_perfkernels_avx2 Caffe2_PROTO c10) if (MSVC) set_target_properties( Caffe2_perfkernels_avx PROPERTIES COMPILE_FLAGS "/arch:AVX") diff --git a/modules/observers/macros.h b/modules/observers/macros.h new file mode 100644 index 00000000000000..e69b055d2a1d55 --- /dev/null +++ b/modules/observers/macros.h @@ -0,0 +1,7 @@ +#include "c10/macros/Macros.h" + +#ifdef CAFFE2_BUILD_OBSERVER_LIB +#define CAFFE2_OBSERVER_API C10_EXPORT +#else +#define CAFFE2_OBSERVER_API C10_IMPORT +#endif diff --git a/modules/observers/net_observer_reporter.h b/modules/observers/net_observer_reporter.h index 3650e4584f9920..5619b69a636e7d 100644 --- a/modules/observers/net_observer_reporter.h +++ b/modules/observers/net_observer_reporter.h @@ -4,6 +4,7 @@ #include "caffe2/core/common.h" #include "caffe2/core/net.h" +#include "observers/macros.h" namespace caffe2 { diff --git a/modules/observers/net_observer_reporter_print.h b/modules/observers/net_observer_reporter_print.h index eb712b8e71ea2c..098a7f7573399d 100644 --- a/modules/observers/net_observer_reporter_print.h +++ b/modules/observers/net_observer_reporter_print.h @@ -1,5 +1,6 @@ #pragma once +#include "observers/macros.h" #include "observers/net_observer_reporter.h" #include "caffe2/core/common.h" diff --git a/modules/observers/observer_config.h b/modules/observers/observer_config.h index e1a6b3a0ead8b9..cc967263a66b9b 100644 --- a/modules/observers/observer_config.h +++ b/modules/observers/observer_config.h @@ 
-1,5 +1,6 @@ #pragma once +#include "observers/macros.h" #include "observers/net_observer_reporter.h" #include "caffe2/core/common.h" diff --git a/modules/observers/perf_observer.h b/modules/observers/perf_observer.h index 6fb4063ffe4801..11fb870a619614 100644 --- a/modules/observers/perf_observer.h +++ b/modules/observers/perf_observer.h @@ -4,6 +4,7 @@ #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/timer.h" +#include "observers/macros.h" #include diff --git a/setup.py b/setup.py index 381123b2b9ced8..ff05793ce0f316 100644 --- a/setup.py +++ b/setup.py @@ -1208,6 +1208,8 @@ def make_relative_rpath(path): 'lib/include/ATen/cudnn/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/caffe2/utils/*.h', + 'lib/include/c10/*.h', + 'lib/include/c10/macros/*.h', 'lib/include/torch/*.h', 'lib/include/torch/csrc/*.h', 'lib/include/torch/csrc/api/include/torch/detail/ordered_dict.h', From ffbac7d0bb4e0772cab0054f79339478124ea9aa Mon Sep 17 00:00:00 2001 From: Syed Tousif Ahmed Date: Mon, 24 Sep 2018 11:50:41 -0700 Subject: [PATCH 05/51] Miscellaneous updates for CUDA 10 (#12017) Summary: This PR has some updates related to CUDA 10. - https://github.com/pytorch/pytorch/commit/c2195e98647fec9d8227ecb85de28f2ca9a8e29a ensures that the repo successfully builts on CUDA 10. Addresses https://github.com/pytorch/pytorch/issues/11888 - https://github.com/pytorch/pytorch/commit/423d8d3524e29d20d9f7298702f8c068f5c8ad46 follows up on the cufft max plan number bug: https://github.com/pytorch/pytorch/issues/11089, which has been fixed in CUDA 10. Pull Request resolved: https://github.com/pytorch/pytorch/pull/12017 Differential Revision: D10013405 Pulled By: soumith fbshipit-source-id: 5bc6d7f71d5133f7821b407b1ac6c51bef0f6fa8 --- aten/src/ATen/native/cuda/CuFFTPlanCache.h | 20 ++++++++++++++++++-- caffe2/utils/GpuDefs.cuh | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 8715a9ef460ee6..c8ea2c3c691964 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -346,6 +346,7 @@ class CuFFTConfig { // be fine for now. // TODO: When CUDA 10 comes out, check if the bug is fixed or if we need another // number for CUDA 10. +// Update: bug related to cuFFT plan cache max size has been fixed in CUDA 10. constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; static_assert(CUFFT_MAX_PLAN_NUM >= 0 && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), "CUFFT_MAX_PLAN_NUM not in size_t range"); @@ -389,12 +390,17 @@ class CuFFTParamsLRUCache { // Miss // remove if needed + // bug related to cuFFT plan cache max size has been fixed + // in CUDA 10. Hence, when compiling with CUDA 10, just + // don't do the erase. + #if CUDA_VERSION < 10000 if (_usage_list.size() >= _max_size) { auto last = _usage_list.end(); last--; _cache_map.erase(last->first); _usage_list.pop_back(); } + #endif // construct new plan at list front, then insert into _cache_map _usage_list.emplace_front(std::piecewise_construct, @@ -414,7 +420,8 @@ class CuFFTParamsLRUCache { void resize(int64_t new_size) { _set_max_size(new_size); - + // no-op when compiling with CUDA 10. 
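  // (Aside, not part of the upstream diff: CUDA_VERSION is defined by cuda.h as
  // major * 1000 + minor * 10, so the value 10000 in these guards corresponds to
  // CUDA 10.0; the plan-cache size limit and LRU eviction below are therefore
  // compiled in only for pre-CUDA-10 toolkits, where the cuFFT bug still applies.)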
+ #if CUDA_VERSION < 10000 auto cur_size = _usage_list.size(); if (cur_size > _max_size) { auto delete_it = _usage_list.end(); @@ -424,17 +431,26 @@ class CuFFTParamsLRUCache { } _usage_list.erase(delete_it, _usage_list.end()); } + #endif } size_t size() const { return _cache_map.size(); } - size_t max_size() const noexcept { return _max_size; } + size_t max_size() const noexcept { + #if CUDA_VERSION < 10000 + return _max_size; + #else + return size(); + #endif + } private: // Only sets size and does value check. Does not resize the data structures. void _set_max_size(int64_t new_size) { + #if CUDA_VERSION < 10000 AT_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + #endif AT_CHECK(new_size >= 0, "cuFFT plan cache size must be non-negative, but got ", new_size); _max_size = static_cast(new_size); diff --git a/caffe2/utils/GpuDefs.cuh b/caffe2/utils/GpuDefs.cuh index cf54f9e851bfac..0f94ae9e018ba5 100644 --- a/caffe2/utils/GpuDefs.cuh +++ b/caffe2/utils/GpuDefs.cuh @@ -8,7 +8,7 @@ namespace caffe2 { // Static definition of GPU warp size for unrolling and code generation #ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ <= 700 +#if __CUDA_ARCH__ <= 750 constexpr int kWarpSize = 32; #else #error Unknown __CUDA_ARCH__; please define parameters for compute capability From 51414822f59806fbe5cc4adf65155fa0ef437564 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Mon, 24 Sep 2018 13:16:13 -0700 Subject: [PATCH 06/51] Stop moving constants into DifferentiableSubgraphs (#11809) Summary: Or even taking them as inputs. This prevents optimizations to happen either inside the differentiable subgraphs, or in the surrounding graph. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11809 Differential Revision: D10009680 Pulled By: apaszke fbshipit-source-id: face638566228e470a6deec48dc2aa3a1cce26d4 --- test/expect/TestJit.test_cpp_cuda.expect | 16 +++++++------ test/test_jit.py | 15 ++++++++++++ torch/csrc/jit/constants.cpp | 3 +++ torch/csrc/jit/init.cpp | 4 ++++ .../jit/passes/create_autodiff_subgraphs.cpp | 23 ++++++++++++++----- .../jit/passes/create_autodiff_subgraphs.h | 6 +++-- 6 files changed, 52 insertions(+), 15 deletions(-) diff --git a/test/expect/TestJit.test_cpp_cuda.expect b/test/expect/TestJit.test_cpp_cuda.expect index 451f1f9329601c..8453308a0dfb54 100644 --- a/test/expect/TestJit.test_cpp_cuda.expect +++ b/test/expect/TestJit.test_cpp_cuda.expect @@ -65,6 +65,8 @@ graph(%0 : Dynamic %3 : Dynamic %4 : Dynamic) { %23 : Dynamic, %24 : Dynamic = prim::DifferentiableGraph_0(%0, %3, %1, %4, %2) + %7 : int = prim::Constant[value=1]() + %19 : int = prim::Constant[value=1]() return (%24, %23); } with prim::DifferentiableGraph_0 = graph(%1 : Dynamic @@ -74,20 +76,20 @@ with prim::DifferentiableGraph_0 = graph(%1 : Dynamic %17 : Dynamic) { %0 : Dynamic = aten::mm(%1, %2) %3 : Dynamic = aten::mm(%4, %5) - %6 : int = prim::Constant[value=1]() - %7 : Dynamic = aten::add(%0, %3, %6) - %8 : Dynamic, %9 : Dynamic, %10 : Dynamic, %11 : Dynamic = prim::ConstantChunk[chunks=4, dim=1](%7) + %7 : int = prim::Constant[value=1]() + %6 : Dynamic = aten::add(%0, %3, %7) + %8 : Dynamic, %9 : Dynamic, %10 : Dynamic, %11 : Dynamic = prim::ConstantChunk[chunks=4, dim=1](%6) %12 : Dynamic = aten::sigmoid(%8) %13 : Dynamic = aten::sigmoid(%11) %14 : Dynamic = aten::tanh(%10) %15 : Dynamic = aten::sigmoid(%9) %16 : Dynamic = aten::mul(%15, %17) %18 : Dynamic = aten::mul(%12, %14) - %19 : int = prim::Constant[value=1]() - %20 : Dynamic 
= aten::add(%16, %18, %19) - %21 : Dynamic = aten::tanh(%20) + %20 : int = prim::Constant[value=1]() + %19 : Dynamic = aten::add(%16, %18, %20) + %21 : Dynamic = aten::tanh(%19) %22 : Dynamic = aten::mul(%13, %21) - return (%20, %22); + return (%19, %22); } testDifferentiate diff --git a/test/test_jit.py b/test/test_jit.py index be0dccf8afc411..c8268f03fa737c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -531,6 +531,21 @@ def forward(self, input): input = torch.rand(3, 4) self.assertEqual(2 * input + 1, m(input)) + def test_diff_subgraph_clones_constants(self): + @torch.jit.script + def f(x, y): + return x + x + y + x + y + x + y + x + y + x + + def count_constants(graph): + return sum(node.kind() == 'prim::Constant' for node in graph.nodes()) + + graph = f.graph.copy() + self.run_pass('cse', graph) + self.run_pass('create_autodiff_subgraphs', graph) + nodes = list(graph.nodes()) + self.assertEqual(count_constants(graph), 1) + self.assertEqual(count_constants(nodes[1].g('Subgraph')), 1) + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 4cdb193d8434d8..f1844d2bac6651 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -16,6 +16,9 @@ Value* insertConstant( if(!ref.defined()) { throw constant_not_supported_error("undefined tensors cannot become constants"); } + if (ref.is_variable()) { + ref = autograd::Variable(ref).data(); + } n->output()->inferTypeFrom(ref); // note: before t_ because of std::move(ref) n->t_(attr::value, std::move(ref)); } else if(val.isInt()) { diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 751035a00c0bae..98a7b010419324 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -14,6 +14,7 @@ #include "torch/csrc/jit/passes/erase_number_types.h" #include "torch/csrc/jit/passes/onnx/prepare_division_for_onnx.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" +#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" #include "torch/csrc/jit/passes/peephole.h" #include "torch/csrc/jit/passes/canonicalize.h" #include "torch/csrc/jit/passes/onnx/peephole.h" @@ -106,6 +107,9 @@ void initJITBindings(PyObject *module) { return ConstantPropagation(g); }) .def("_jit_pass_erase_shape_information", EraseShapeInformation) + .def("_jit_pass_create_autodiff_subgraphs", [](Graph& graph) { + CreateAutodiffSubgraphs(graph); + }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp index 3554c22ddc70e8..d1d73a36ea8346 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp @@ -1,8 +1,11 @@ -#include +#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" + #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/autodiff.h" #include "torch/csrc/jit/assertions.h" +#include + namespace torch { namespace jit { struct Graph; @@ -30,6 +33,11 @@ Node* mergeNodes(Block * block, Symbol group_node_kind, ArrayRef nodes) { if(value_map.count(v) > 0) { return value_map[v]; } + if (auto value = toIValue(v)) { + Value * nv = new_graph->insertConstant(*value); + value_map[v] = nv; + 
return nv; + } Value * nv = new_graph->addInput()->setType(v->type()); group_node->addInput(v); value_map[v] = nv; @@ -69,8 +77,6 @@ Node* mergeNodes(Block * block, Symbol group_node_kind, ArrayRef nodes) { return group_node; } -} - void CreateAutodiffSubgraphs(Block * block, size_t threshold, std::vector& diff_graphs) { // This implementation is not optimal, but it is simple. // It just scans through the list in order looking for runs of @@ -90,8 +96,12 @@ void CreateAutodiffSubgraphs(Block * block, size_t threshold, std::vector for(Node * node : block->nodes()) { // Note: nodes() iterator stays valid since it is // always pointing _after_ the nodes that mergeNodes // mutates. - if(isDifferentiable(node)) { - groupable.push_back(node); + if (isDifferentiable(node)) { + // Constants are generally cheap to clone, so it's better to replicate them, + // instead of moving them out from the original graph. + if (node->kind() != prim::Constant) { + groupable.push_back(node); + } } else { if(groupable.size() >= threshold) { diff_graphs.push_back(mergeNodes(block, prim::DifferentiableGraph, groupable)); @@ -107,11 +117,12 @@ void CreateAutodiffSubgraphs(Block * block, size_t threshold, std::vector } } +} // anonymous namespace + std::vector CreateAutodiffSubgraphs(Graph & graph, size_t threshold) { std::vector diff_nodes; CreateAutodiffSubgraphs(graph.block(), threshold, diff_nodes); return diff_nodes; } - }} diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.h b/torch/csrc/jit/passes/create_autodiff_subgraphs.h index 44a6683dc4ce35..1908b03e2568fb 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.h +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.h @@ -1,10 +1,12 @@ #pragma once + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/WindowsTorchApiMacro.h" + #include namespace torch { namespace jit { -struct Graph; - // insert GraphExecutor nodes that group together // subgraphs that are differentiable by the jit's autodiff passes // threshold - minimum number of nodes that will appear in a block From 1c09bfde1b311f836ecc1b0f491895b2ac83a297 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 24 Sep 2018 13:54:35 -0700 Subject: [PATCH 07/51] Make promoteType(half, integer) -> half (#11941) Summary: Changes the result type of half type and any integer type to return half type (instead of float or double). This is based on top of #11808. The first new commit is "Make promoteType(half, integer) -> half". I'll rebase on top of master once that PR lands. 
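For illustration, a minimal sketch of the new behavior (not part of the patch itself; it assumes a CUDA device and mirrors the updated mixed-type test in test_cuda.py):

```python
import torch

x = torch.tensor(1.5, dtype=torch.float16, device='cuda')
y = torch.tensor(3, dtype=torch.int32, device='cuda')

# With this change promoteTypes(f2, i4) is f2, so the result should stay in
# half precision rather than being promoted to float or double.
z = x * y
print(z.dtype)  # expected: torch.float16
print(z)        # expected value: 4.5
```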
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11941 Differential Revision: D10014122 Pulled By: colesbury fbshipit-source-id: 16a5eb3406a5712069201d872d8736d0599e9411 --- aten/src/ATen/core/ScalarType.h | 8 ++++---- test/test_cuda.py | 29 +++++++++++++---------------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index 6fe88bfadb05f5..5a88fadf00de67 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -178,10 +178,10 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { /* u1 i1 i2 i4 i8 f2 f4 f8 */ /* u1 */ { u1, i2, i2, i4, i8, f2, f4, f8 }, /* i1 */ { i2, i1, i2, i4, i8, f2, f4, f8 }, - /* i2 */ { i2, i2, i2, i4, i8, f4, f4, f8 }, - /* i4 */ { i4, i4, i4, i4, i8, f8, f4, f8 }, - /* i8 */ { i8, i8, i8, i8, i8, f8, f4, f8 }, - /* f2 */ { f2, f2, f4, f8, f8, f2, f4, f8 }, + /* i2 */ { i2, i2, i2, i4, i8, f2, f4, f8 }, + /* i4 */ { i4, i4, i4, i4, i8, f2, f4, f8 }, + /* i8 */ { i8, i8, i8, i8, i8, f2, f4, f8 }, + /* f2 */ { f2, f2, f2, f2, f2, f2, f4, f8 }, /* f4 */ { f4, f4, f4, f4, f4, f4, f4, f8 }, /* f8 */ { f8, f8, f8, f8, f8, f8, f8, f8 }, }; diff --git a/test/test_cuda.py b/test/test_cuda.py index 36678940aebdb6..8f59afc0891806 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -909,23 +909,20 @@ def test_type_conversions(self): self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage) def test_mul_intertype_scalar(self): - x = torch.tensor(1.5, device='cuda') - y = torch.tensor(3, dtype=torch.int32, device='cuda') - - self.assertEqual(x * y, 4.5) - self.assertEqual(y * x, 4.5) - with self.assertRaisesRegex(RuntimeError, 'expected type'): - y *= x - x *= y - self.assertEqual(x, 4.5) - - x = torch.tensor(1.5, device='cuda', dtype=torch.float16) - self.assertEqual(x * y, 4.5) - # half * int currently promotes to double - with self.assertRaisesRegex(RuntimeError, 'expected type'): + def test_mul(dtype): + x = torch.tensor(1.5, dtype=dtype, device='cuda') + y = torch.tensor(3, dtype=torch.int32, device='cuda') + + self.assertEqual(x * y, 4.5) + self.assertEqual(y * x, 4.5) + with self.assertRaisesRegex(RuntimeError, 'expected type'): + y *= x x *= y - with self.assertRaisesRegex(RuntimeError, 'expected type'): - y *= x + self.assertEqual(x, 4.5) + + test_mul(torch.float16) + test_mul(torch.float32) + test_mul(torch.float64) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") @skipIfRocm From e05d689c49f650601243693ed433a23930d31e46 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 24 Sep 2018 14:28:54 -0700 Subject: [PATCH 08/51] Unify C++ API with C++ extensions (#11510) Summary: Currently the C++ API and C++ extensions are effectively two different, entirely orthogonal code paths. This PR unifies the C++ API with the C++ extension API by adding an element of Python binding support to the C++ API. This means the `torch/torch.h` included by C++ extensions, which currently routes to `torch/csrc/torch.h`, can now be rerouted to `torch/csrc/api/include/torch/torch.h` -- i.e. the main C++ API header. This header then includes Python binding support conditioned on a define (`TORCH_WITH_PYTHON_BINDINGS`), *which is only passed when building a C++ extension*. Currently stacked on top of https://github.com/pytorch/pytorch/pull/11498 Why is this useful? 1. One less codepath. 
In particular, there has been trouble again and again due to the two `torch/torch.h` header files and ambiguity when both ended up in the include path. This is now fixed. 2. I have found that it is quite common to want to bind a C++ API module back into Python. This could be for simple experimentation, or to have your training loop in Python but your models in C++. This PR makes this easier by adding pybind11 support to the C++ API. 3. The C++ extension API simply becomes richer by gaining access to the C++ API headers. soumith ezyang apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11510 Reviewed By: ezyang Differential Revision: D9998835 Pulled By: goldsborough fbshipit-source-id: 7a94b44a9d7e0377b7f1cfc99ba2060874d51535 --- cmake/TorchConfig.cmake.in | 16 ++- setup.py | 17 ++- .../complex_registration_extension.cpp | 2 +- test/cpp_extensions/cpp_api_extension.cpp | 38 +++++++ test/cpp_extensions/cuda_extension.cpp | 2 +- test/cpp_extensions/cudnn_extension.cpp | 2 +- test/cpp_extensions/doubler.h | 2 +- test/cpp_extensions/extension.cpp | 2 +- test/cpp_extensions/half_support.cu | 2 +- test/cpp_extensions/jit_extension.cpp | 2 +- test/cpp_extensions/jit_extension2.cpp | 2 +- test/test_cpp_extensions.py | 51 ++++++++- torch/CMakeLists.txt | 2 +- .../api/include/torch/nn/modules/dropout.h | 8 +- .../api/include/torch/nn/modules/sequential.h | 2 + torch/csrc/api/include/torch/nn/pimpl-inl.h | 47 ++++++++ torch/csrc/api/include/torch/nn/pimpl.h | 43 ++++--- torch/csrc/api/include/torch/python.h | 107 ++++++++++++++++++ torch/csrc/api/include/torch/torch.h | 6 + torch/csrc/tensor/python_tensor.cpp | 2 +- torch/{csrc/torch.h => extension.h} | 1 - torch/utils/cpp_extension.py | 40 ++++--- 22 files changed, 320 insertions(+), 76 deletions(-) create mode 100644 test/cpp_extensions/cpp_api_extension.cpp create mode 100644 torch/csrc/api/include/torch/nn/pimpl-inl.h create mode 100644 torch/csrc/api/include/torch/python.h rename torch/{csrc/torch.h => extension.h} (79%) diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 066a7e63f9c57a..2b847815603a9f 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -24,9 +24,13 @@ endif() # Include directories. if (EXISTS "${TORCH_INSTALL_PREFIX}/lib/include") - set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") + set(TORCH_INCLUDE_DIRS + ${TORCH_INSTALL_PREFIX}/lib/include + ${TORCH_INSTALL_PREFIX}/lib/include/torch/csrc/api/include) else() - set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/include") + set(TORCH_INCLUDE_DIRS + ${TORCH_INSTALL_PREFIX}/include + ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include) endif() # Library dependencies. 
@@ -45,7 +49,7 @@ if (@USE_CUDA@) set(TORCH_CUDA_LIBRARIES ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib ${CUDA_LIBRARIES}) - list(APPEND TORCH_INCLUDE_DIRS "${NVTOOLEXT_HOME}/include") + list(APPEND TORCH_INCLUDE_DIRS ${NVTOOLEXT_HOME}/include) elseif(APPLE) set(TORCH_CUDA_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib @@ -66,8 +70,8 @@ endif() set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") set_target_properties(torch PROPERTIES - IMPORTED_LOCATION ${TORCH_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${TORCH_INCLUDE_DIRS} - INTERFACE_COMPILE_OPTIONS ${TORCH_CXX_FLAGS} + IMPORTED_LOCATION "${TORCH_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${TORCH_INCLUDE_DIRS}" + INTERFACE_COMPILE_OPTIONS "${TORCH_CXX_FLAGS}" CXX_STANDARD 11 ) diff --git a/setup.py b/setup.py index ff05793ce0f316..5132d357bdc8bb 100644 --- a/setup.py +++ b/setup.py @@ -471,18 +471,9 @@ def check_file(f): if not same: shutil.copyfile(orig_file, sym_file) - # Copy headers necessary to compile C++ extensions. - # - # This is not perfect solution as build does not depend on any of - # the auto-generated code and auto-generated files will not be - # included in this copy. If we want to use auto-generated files, - # we need to find a better way to do this. - # More information can be found in conversation thread of PR #5772 - self.copy_tree('torch/lib/tmp_install/share', 'torch/share') self.copy_tree('third_party/pybind11/include/pybind11/', 'torch/lib/include/pybind11') - self.copy_file('torch/csrc/torch.h', 'torch/lib/include/torch/torch.h') build_dep_cmds = {} @@ -1212,7 +1203,13 @@ def make_relative_rpath(path): 'lib/include/c10/macros/*.h', 'lib/include/torch/*.h', 'lib/include/torch/csrc/*.h', - 'lib/include/torch/csrc/api/include/torch/detail/ordered_dict.h', + 'lib/include/torch/csrc/api/include/torch/*.h', + 'lib/include/torch/csrc/api/include/torch/detail/*.h', + 'lib/include/torch/csrc/api/include/torch/nn/*.h', + 'lib/include/torch/csrc/api/include/torch/nn/modules/*.h', + 'lib/include/torch/csrc/api/include/torch/nn/parallel/*.h', + 'lib/include/torch/csrc/api/include/torch/optim/*.h', + 'lib/include/torch/csrc/api/include/torch/serialize/*.h', 'lib/include/torch/csrc/autograd/*.h', 'lib/include/torch/csrc/autograd/generated/*.h', 'lib/include/torch/csrc/cuda/*.h', diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index db75e3f67f7772..21b05d060b190c 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/test/cpp_extensions/cpp_api_extension.cpp b/test/cpp_extensions/cpp_api_extension.cpp new file mode 100644 index 00000000000000..066ad64160fa5c --- /dev/null +++ b/test/cpp_extensions/cpp_api_extension.cpp @@ -0,0 +1,38 @@ +#include +#include +#include + +struct Net : torch::nn::Module { + Net(int64_t in, int64_t out) + : fc(in, out), + bn(torch::nn::BatchNormOptions(out).stateful(true)), + dropout(0.5) { + register_module("fc", fc); + register_module("bn", bn); + register_module("dropout", dropout); + } + + torch::Tensor forward(torch::Tensor x) { + return dropout->forward(bn->forward(torch::relu(fc->forward(x)))); + } + + void set_bias(torch::Tensor bias) { + fc->bias = bias; + } + + torch::Tensor get_bias() const { + return fc->bias; + } + + torch::nn::Linear fc; + torch::nn::BatchNorm bn; + torch::nn::Dropout dropout; +}; + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + 
torch::python::bind_module(m, "Net") + .def(py::init()) + .def("forward", &Net::forward) + .def("set_bias", &Net::set_bias) + .def("get_bias", &Net::get_bias); +} diff --git a/test/cpp_extensions/cuda_extension.cpp b/test/cpp_extensions/cuda_extension.cpp index 963850acc2795d..9946b4f9cb97d4 100644 --- a/test/cpp_extensions/cuda_extension.cpp +++ b/test/cpp_extensions/cuda_extension.cpp @@ -1,4 +1,4 @@ -#include +#include // Declare the function from cuda_extension.cu. It will be compiled // separately with nvcc and linked with the object file of cuda_extension.cpp diff --git a/test/cpp_extensions/cudnn_extension.cpp b/test/cpp_extensions/cudnn_extension.cpp index 7c3be3e4716302..498e01a116a152 100644 --- a/test/cpp_extensions/cudnn_extension.cpp +++ b/test/cpp_extensions/cudnn_extension.cpp @@ -10,7 +10,7 @@ * 5) Return something (optional). */ -#include +#include #include // for TensorDescriptor #include // for CUDNN_CHECK diff --git a/test/cpp_extensions/doubler.h b/test/cpp_extensions/doubler.h index 2b22dca1284cd5..d9e6aaea8c3465 100644 --- a/test/cpp_extensions/doubler.h +++ b/test/cpp_extensions/doubler.h @@ -1,4 +1,4 @@ -#include +#include struct Doubler { Doubler(int A, int B) { diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp index 8e79397296910e..3ba27d92f32d7b 100644 --- a/test/cpp_extensions/extension.cpp +++ b/test/cpp_extensions/extension.cpp @@ -1,4 +1,4 @@ -#include +#include at::Tensor sigmoid_add(at::Tensor x, at::Tensor y) { return x.sigmoid() + y.sigmoid(); diff --git a/test/cpp_extensions/half_support.cu b/test/cpp_extensions/half_support.cu index a3621bfe7c55fb..9d420438fb5268 100644 --- a/test/cpp_extensions/half_support.cu +++ b/test/cpp_extensions/half_support.cu @@ -1,4 +1,4 @@ -#include +#include #include diff --git a/test/cpp_extensions/jit_extension.cpp b/test/cpp_extensions/jit_extension.cpp index e62be5b38ba1dd..576e7fc9a1d3c1 100644 --- a/test/cpp_extensions/jit_extension.cpp +++ b/test/cpp_extensions/jit_extension.cpp @@ -1,4 +1,4 @@ -#include +#include #include "doubler.h" diff --git a/test/cpp_extensions/jit_extension2.cpp b/test/cpp_extensions/jit_extension2.cpp index e197308c3d59e4..cfd472137187a5 100644 --- a/test/cpp_extensions/jit_extension2.cpp +++ b/test/cpp_extensions/jit_extension2.cpp @@ -1,4 +1,4 @@ -#include +#include using namespace at; diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index 3702205e4c4491..e5b1121784f070 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -23,6 +23,9 @@ TEST_CUDNN = TEST_CUDA and CUDNN_HEADER_EXISTS and torch.backends.cudnn.is_available() +IS_WINDOWS = sys.platform == 'win32' + + class TestCppExtension(common.TestCase): def setUp(self): if sys.platform != 'win32': @@ -189,7 +192,7 @@ def test_inline_jit_compile_extension_multiple_sources_and_no_functions(self): ''' cpp_source2 = ''' - #include + #include at::Tensor sin_add(at::Tensor x, at::Tensor y); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("sin_add", &sin_add, "sin(x) + sin(y)"); @@ -265,7 +268,7 @@ def test_lenient_flag_handling_in_jit_extensions(self): cpp_sources=cpp_source, functions='tanh_add', extra_cflags=['-g\n\n', '-O0 -Wall'], - extra_include_paths=[' cpp_extensions\n', '../'], + extra_include_paths=[' cpp_extensions\n'], verbose=True) x = torch.zeros(100, dtype=torch.float32) @@ -341,6 +344,50 @@ def compile(code): module = compile('int f() { return 789; }') self.assertEqual(module.f(), 789) + @unittest.skipIf(IS_WINDOWS, "C++ API not yet supported on 
Windows") + def test_cpp_api_extension(self): + here = os.path.abspath(__file__) + pytorch_root = os.path.dirname(os.path.dirname(here)) + api_include = os.path.join(pytorch_root, 'torch', 'csrc', 'api', 'include') + module = torch.utils.cpp_extension.load( + name='cpp_api_extension', + sources='cpp_extensions/cpp_api_extension.cpp', + extra_include_paths=api_include, + extra_cflags=[] if IS_WINDOWS else ['-UTORCH_API_INCLUDE_EXTENSION_H'], + verbose=True) + + net = module.Net(3, 5) + + self.assertTrue(net.training) + net.eval() + self.assertFalse(net.training) + net.train() + self.assertTrue(net.training) + net.eval() + + input = torch.randn(2, 3, dtype=torch.float32) + output = net.forward(input) + self.assertEqual(output, net.forward(input)) + self.assertEqual(list(output.shape), [2, 5]) + + bias = net.get_bias() + self.assertEqual(list(bias.shape), [5]) + net.set_bias(bias + 1) + self.assertEqual(net.get_bias(), bias + 1) + output2 = net.forward(input) + + self.assertNotEqual(output + 1, output2) + + self.assertEqual(len(net.parameters()), 4) + + p = net.named_parameters() + self.assertEqual(type(p), dict) + self.assertEqual(len(p), 4) + self.assertIn('fc.weight', p) + self.assertIn('fc.bias', p) + self.assertIn('bn.weight', p) + self.assertIn('bn.bias', p) + if __name__ == '__main__': common.run_tests() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index be13aaa61b97b5..af5bfc0fdc8ef5 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -411,7 +411,7 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch FILES_MATCHING PATTERN "*.h") -install(FILES "${TORCH_SRC_DIR}/script.h" +install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) install(TARGETS torch diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index bfe230a597215f..48b89642864ed3 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -12,20 +12,18 @@ namespace nn { /// Options for `Dropout` and `FeatureDropout`. struct DropoutOptions { - DropoutOptions(double rate); + /* implicit */ DropoutOptions(double rate = 0.5); /// The probability with which a particular component of the input is set to /// zero. /// Changes to this parameter at runtime are effective. - TORCH_ARG(double, rate) = 0.5; + TORCH_ARG(double, rate); }; namespace detail { template class DropoutImplBase : public torch::nn::Cloneable { public: - explicit DropoutImplBase(double rate) - : DropoutImplBase(DropoutOptions(rate)) {} - explicit DropoutImplBase(DropoutOptions options_); + explicit DropoutImplBase(DropoutOptions options_ = DropoutOptions()); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index e4839ac41a910b..3ee80042020b19 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -92,6 +92,8 @@ class SequentialImpl : public Cloneable { using Iterator = std::vector::iterator; using ConstIterator = std::vector::const_iterator; + SequentialImpl() = default; + /// Constructs the `Sequential` from a variadic list of modules. template explicit SequentialImpl(Modules&&... 
modules) { diff --git a/torch/csrc/api/include/torch/nn/pimpl-inl.h b/torch/csrc/api/include/torch/nn/pimpl-inl.h new file mode 100644 index 00000000000000..9da1c38a8372d1 --- /dev/null +++ b/torch/csrc/api/include/torch/nn/pimpl-inl.h @@ -0,0 +1,47 @@ +// This class exists only to do SFINAE on abstract types `T` that are really +// `ModuleHolder`, because there's no good way to say that `T` is a +// `ModuleHolder` over some unknown type `ModuleType`. With this, you can do +// `enable_if_t>`. +struct ModuleHolderIndicator {}; + +// A type trait that is true for types that are `ModuleHolder`s. +template +using is_module_holder = std::is_base_of>; + +template +using disable_if_module_holder_t = disable_if_t::value>; + +// A collection of templates that answer the question whether a type `T` is a +// `ModuleHolder`, and if so whether its contained type is of type `C`. This is +// tricky because it is hard to short circuit in template metaprogramming. A +// naive and incorrect solution to this problem would be something like +// `disable_if::value && typename T::ContainedType == C>`. +// This would disable all types that are not `ModuleHolder`s, because even +// though the `is_module_holder::value` may be `false` for such types the +// `T::ContainedType` access would be ill-formed and thus fail the whole +// expression by the rules of SFINAE. Instead we have to use template +// specialization to statically branch on the first condition +// (`is_module_holder`) and are only then allowed to query +// `T::ContainedType` in the branch for which the condition was true. + +// Base template. +template +struct is_module_holder_of_impl; + +// False branch. `T` is not a `ModuleHolder` and thus not a `ModuleHolder` with +// contained type `C`. +template +struct is_module_holder_of_impl : std::false_type {}; + +// True branch. `T` is a `ModuleHolder` and thus we can legit access its +// `ContainedType` and compare it against `C`. +template +struct is_module_holder_of_impl + : std::is_same {}; + +// Helper template. +template +struct is_module_holder_of : is_module_holder_of_impl< + detail::is_module_holder::value, + torch::decay_t, + torch::decay_t> {}; diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index 48c331e148686f..ecdd36af231870 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -10,17 +10,8 @@ namespace torch { namespace detail { -/// This class exists only to do SFINAE on abstract types `T` that are really -/// `ModuleHolder`, because there's no good way to say that `T` is a -/// `ModuleHolder` over some unknown type `ModuleType`. With this, you can do -/// `enable_if_t>`. -struct ModuleHolderIndicator {}; - -template -using is_module_holder = std::is_base_of>; - -template -using disable_if_module_holder_t = disable_if_t::value>; +// Dump all the template metaprogramming in this file. +#include "pimpl-inl.h" } // namespace detail namespace nn { @@ -40,7 +31,9 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { using ContainedType = Contained; /// Default constructs the contained module if if has a default constructor, - /// else produces a static error. NOTE: This uses the behavior of template + /// else produces a static error. + /// + /// NOTE: This uses the behavior of template /// classes in C++ that constructors (or any methods) are only compiled when /// actually used. 
ModuleHolder() : impl_(default_construct()) { @@ -58,9 +51,16 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { /// Constructs the `ModuleHolder` with a contained module, forwarding all /// arguments to its constructor. - template - explicit ModuleHolder(Ts&&... ts) - : impl_(new Contained(std::forward(ts)...)) {} + template < + typename Head, + typename... Tail, + typename = torch::disable_if_t< + detail::is_module_holder_of::value && + (sizeof...(Tail) == 0)>> + explicit ModuleHolder(Head&& head, Tail&&... tail) + : impl_(new Contained( + std::forward(head), + std::forward(tail)...)) {} /// Constructs the `ModuleHolder` from a pointer to the contained type. /// Example: `Linear(std::make_shared(...))`. @@ -158,15 +158,10 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { /// Defines a class `Name` which inherits from `nn::ModuleHolder` to provide a /// wrapper over a `std::shared_ptr`. -#define TORCH_MODULE_IMPL(Name, Impl) \ - class Name : public torch::nn::ModuleHolder { /* NOLINT */ \ - public: \ - using torch::nn::ModuleHolder::ModuleHolder; \ - Name(const Name&) = default; /* NOLINT */ \ - Name(Name&&) = default; /* NOLINT */ \ - Name(Name& other) : Name(static_cast(other)) {} /* NOLINT */ \ - Name& operator=(const Name&) = default; /* NOLINT */ \ - Name& operator=(Name&&) = default; /* NOLINT */ \ +#define TORCH_MODULE_IMPL(Name, Impl) \ + class Name : public torch::nn::ModuleHolder { /* NOLINT */ \ + public: \ + using torch::nn::ModuleHolder::ModuleHolder; \ } /// Like `TORCH_MODULE_IMPL`, but defaults the `Impl` name to `Impl`. diff --git a/torch/csrc/api/include/torch/python.h b/torch/csrc/api/include/torch/python.h new file mode 100644 index 00000000000000..ba1da4599f439b --- /dev/null +++ b/torch/csrc/api/include/torch/python.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace python { +namespace detail { +template +std::vector cursor_to_vector(const Cursor& cursor) { + std::vector vector; + vector.reserve(cursor.size()); + cursor.map( + std::back_inserter(vector), [](const Tensor& tensor) { return tensor; }); + return vector; +} + +template +std::unordered_map cursor_to_map(const Cursor& cursor) { + std::unordered_map map; + map.reserve(cursor.size()); + cursor.map_items( + std::inserter(map, map.end()), + [](const std::string& key, const Tensor& tensor) { + return std::make_pair(key, tensor); + }); + return map; +} +} // namespace detail + +/// Adds method bindings for a pybind11 `class_` that binds an `nn::Module` +/// subclass. +/// +/// Say you have a pybind11 class object created with `py::class_(m, +/// "Net")`. This function will add all the necessary `.def()` calls to bind the +/// `nn::Module` base class' methods, such as `train()`, `eval()` etc. into +/// Python. 
The exact list of supported methods and their Python signatures are: +/// - `train()` +/// - `eval()` +/// - `is_training() -> bool` +/// - `zero_grad()` +/// - `cuda()` +/// - `cpu()` +/// - `parameters() -> List` +/// - `named_parameters() -> Dict` +/// - `buffers() -> List` +/// - `named_buffers() -> Dict` +template +py::class_ add_module_bindings(py::class_ module) { + return module.def("train", [](M& module) { module.train(); }) + .def("eval", [](M& module) { module.eval(); }) + .def("clone", [](M& module) { return module.clone(); }) + .def_property_readonly( + "training", [](M& module) { return module.is_training(); }) + .def_property_readonly( + "training", [](M& module) { return module.is_training(); }) + .def("zero_grad", [](M& module) { module.zero_grad(); }) + .def("cuda", [](M& module) { module.to(torch::kCUDA); }) + .def("cpu", [](M& module) { module.to(torch::kCPU); }) + .def( + "parameters", + [](M& module) { + return detail::cursor_to_vector(module.parameters()); + }) + .def( + "named_parameters", + [](M& module) { return detail::cursor_to_map(module.parameters()); }) + .def( + "buffers", + [](M& module) { return detail::cursor_to_vector(module.buffers()); }) + .def("named_buffers", [](M& module) { + return detail::cursor_to_map(module.buffers()); + }); +} + +/// Creates a pybind11 class object for an `nn::Module` subclass type and adds +/// default bindings. +/// +/// After adding the default bindings, the class object is returned, such that +/// you can add more bindings. +/// +/// Example usage: +/// \rst +/// .. code-block:: +/// struct Net : torch::nn::Module { +/// Net(int in, int out) { } +/// torch::Tensor forward(torch::Tensor x) { return x; } +/// }; +/// +/// PYBIND11_MODULE(my_module, m) { +/// torch::python::bind_module(m, "Net") +/// .def(py::init()) +/// .def("forward", &Net::forward); +/// } +/// \endrst +template +py::class_ bind_module(py::module module, const char* name) { + return add_module_bindings(py::class_(module, name)); +} +} // namespace python +} // namespace torch diff --git a/torch/csrc/api/include/torch/torch.h b/torch/csrc/api/include/torch/torch.h index 9b6eae58d9c72f..38bd5a571283f0 100644 --- a/torch/csrc/api/include/torch/torch.h +++ b/torch/csrc/api/include/torch/torch.h @@ -1,8 +1,14 @@ #pragma once #include +#include #include #include #include #include #include + +#ifdef TORCH_API_INCLUDE_EXTENSION_H +#include +#warning "Including torch/torch.h for C++ extensions is deprecated. 
Please include torch/extension.h" +#endif // defined(TORCH_API_INCLUDE_EXTENSION_H) diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index 4a40cf243f3a68..1b85b1810b6607 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -3,7 +3,6 @@ #include #include -#include "torch/csrc/torch.h" #include "torch/csrc/Dtype.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" @@ -17,6 +16,7 @@ #include "torch/csrc/utils/python_strings.h" #include "torch/csrc/utils/tensor_new.h" #include "torch/csrc/utils/tensor_types.h" +#include "torch/csrc/variable_tensor_functions.h" #include diff --git a/torch/csrc/torch.h b/torch/extension.h similarity index 79% rename from torch/csrc/torch.h rename to torch/extension.h index 5761b8ef57f642..828aefd572ae77 100644 --- a/torch/csrc/torch.h +++ b/torch/extension.h @@ -2,6 +2,5 @@ #include -#include #include #include diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 50d3f74b2a2c69..4b1c4cbc32bc09 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -170,6 +170,7 @@ class BuildExtension(build_ext): def build_extensions(self): self._check_abi() for extension in self.extensions: + self._add_compile_flag(extension, '-DTORCH_API_INCLUDE_EXTENSION_H') self._define_torch_extension_name(extension) self._add_gnu_abi_flag_if_binary(extension) @@ -290,6 +291,13 @@ def _check_abi(self): compiler = os.environ.get('CXX', 'c++') check_compiler_abi_compatibility(compiler) + def _add_compile_flag(self, extension, flag): + if isinstance(extension.extra_compile_args, dict): + for args in extension.extra_compile_args.values(): + args.append(flag) + else: + extension.extra_compile_args.append(flag) + def _define_torch_extension_name(self, extension): # pybind11 doesn't support dots in the names # so in order to support extensions in the packages @@ -298,11 +306,7 @@ def _define_torch_extension_name(self, extension): names = extension.name.split('.') name = names[-1] define = '-DTORCH_EXTENSION_NAME={}'.format(name) - if isinstance(extension.extra_compile_args, dict): - for args in extension.extra_compile_args.values(): - args.append(define) - else: - extension.extra_compile_args.append(define) + self._add_compile_flag(extension, define) def _add_gnu_abi_flag_if_binary(self, extension): # If the version string looks like a binary build, @@ -310,14 +314,9 @@ def _add_gnu_abi_flag_if_binary(self, extension): # if the extension is compiled with gcc >= 5.1, # then we have to define _GLIBCXX_USE_CXX11_ABI=0 # so that the std::string in the API is resolved to - # non-C++11 symbols. - define = '-D_GLIBCXX_USE_CXX11_ABI=0' + # non-C++11 symbols if _is_binary_build(): - if isinstance(extension.extra_compile_args, dict): - for args in extension.extra_compile_args.values(): - args.append(define) - else: - extension.extra_compile_args.append(define) + self._add_compile_flag(extension, '-D_GLIBCXX_USE_CXX11_ABI=0') def CppExtension(name, sources, *args, **kwargs): @@ -427,10 +426,12 @@ def include_paths(cuda=False): here = os.path.abspath(__file__) torch_path = os.path.dirname(os.path.dirname(here)) lib_include = os.path.join(torch_path, 'lib', 'include') - # Some internal (old) Torch headers don't properly prefix their includes, - # so we need to pass -Itorch/lib/include/TH as well. paths = [ lib_include, + # Remove this once torch/torch.h is officially no longer supported for C++ extensions. 
+ os.path.join(lib_include, 'torch', 'csrc', 'api', 'include'), + # Some internal (old) Torch headers don't properly prefix their includes, + # so we need to pass -Itorch/lib/include/TH as well. os.path.join(lib_include, 'TH'), os.path.join(lib_include, 'THC') ] @@ -580,7 +581,7 @@ def load_inline(name, the necessary header includes, as well as the (pybind11) binding code. More precisely, strings passed to ``cpp_sources`` are first concatenated into a single ``.cpp`` file. This file is then prepended with ``#include - ``. + ``. Furthermore, if the ``functions`` argument is supplied, bindings will be automatically generated for each function specified. ``functions`` can @@ -630,7 +631,7 @@ def load_inline(name, if isinstance(cuda_sources, str): cuda_sources = [cuda_sources] - cpp_sources.insert(0, '#include ') + cpp_sources.insert(0, '#include ') # If `functions` is supplied, we create the pybind11 bindings for the user. # Here, `functions` is (or becomes, after some processing) a map from @@ -854,7 +855,9 @@ def _build_extension_module(name, build_directory, verbose): # Python 2 and 3 compatible way of getting the error object. _, error, _ = sys.exc_info() # error.output contains the stdout and stderr of the build attempt. - message = "Error building extension '{}': {}".format(name, error.output.decode()) + message = "Error building extension '{}'".format(name) + if hasattr(error, 'output') and error.output: + message += ": {}".format(error.output.decode()) raise_from(RuntimeError(message), None) @@ -890,7 +893,7 @@ def _write_ninja_file(path, sources = [os.path.abspath(file) for file in sources] user_includes = [os.path.abspath(file) for file in extra_include_paths] - # include_paths() gives us the location of torch/torch.h + # include_paths() gives us the location of torch/extension.h system_includes = include_paths(with_cuda) # sysconfig.get_paths()['include'] gives us the location of Python.h system_includes.append(sysconfig.get_paths()['include']) @@ -901,6 +904,7 @@ def _write_ninja_file(path, system_includes.clear() common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)] + common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H') common_cflags += ['-I{}'.format(include) for include in user_includes] common_cflags += ['-isystem {}'.format(include) for include in system_includes] From 70e4b3ef59f8ebb7dd359e00fa136d52d88160ed Mon Sep 17 00:00:00 2001 From: Hoa Dinh Date: Mon, 24 Sep 2018 15:26:56 -0700 Subject: [PATCH 09/51] Revert D10006069: Remove TIndex typedef from core/common.h Differential Revision: D10006069 Original commit changeset: 5e2aac993968 fbshipit-source-id: fbd8d3860635211e641ca14eaff7a64882e0d6bd --- caffe2/core/common.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 2582a605adee55..93bbf341b5061a 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -30,6 +30,10 @@ namespace caffe2 { +// Data type for caffe2 Index/Size. We use size_t to be safe here as well as for +// large matrices that are common in sparse math. +typedef int64_t TIndex; + // Note(Yangqing): NVCC does not play well with unordered_map on some platforms, // forcing us to use std::map instead of unordered_map. 
This may affect speed // in some cases, but in most of the computation code we do not access map very From b7c302da1ad1f2051a931cb8cb4fb583ae34c685 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 24 Sep 2018 15:46:48 -0700 Subject: [PATCH 10/51] Make gen_jit_dispatch runnable (#12018) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12018 Tried to use the file and ran into a small bug, this fixes it Differential Revision: D10013231 fbshipit-source-id: 4cf8c29cf9e2cedd7a28fa0cc0196e5144a54bf2 --- tools/jit/gen_jit_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index b7326e526baa86..f6fdc7505d9966 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -351,7 +351,7 @@ def main(): help='path to Declarations.yaml') parser.add_argument('out', metavar='OUT', help='path to output directory') - parser.add_argument('template-path', metavar='TEMPLATE_PATH', + parser.add_argument('template_path', metavar='TEMPLATE_PATH', help='path to templates directory') args = parser.parse_args() gen_jit_dispatch(args.declarations, args.out, args.template_path) From 3ae6ee4ebded136da30aa53fd3873d84acfbc9f0 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 24 Sep 2018 16:49:27 -0700 Subject: [PATCH 11/51] Move CreateContext to global registry (#11688) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11688 As a first step to remove static context(merge with allocator), we'll create a global registries for context constructors, and remove CreateContext function from tensor. Reviewed By: ezyang, dzhulgakov Differential Revision: D9779821 fbshipit-source-id: 8b239ea50af7a0556fde2382f58f79194f0e3dc1 --- aten/src/ATen/core/context_base.cpp | 11 ++++++++++ aten/src/ATen/core/context_base.h | 26 ++++++++++++++++------ caffe2/core/blob_serialization.cc | 5 ++--- caffe2/core/context.cc | 4 ++++ caffe2/core/context.h | 11 ++-------- caffe2/core/context_base.cc | 1 + caffe2/core/context_base.h | 2 ++ caffe2/core/context_gpu.cu | 5 +++++ caffe2/core/context_gpu.h | 15 ++----------- caffe2/core/hip/context_hip.cc | 18 ++++++++++------ caffe2/core/hip/context_hip.h | 15 ++----------- caffe2/core/registry.h | 2 +- caffe2/core/tensor.h | 8 +++---- caffe2/core/tensor_impl.cc | 2 +- caffe2/core/tensor_impl.h | 28 +++++++++++------------- caffe2/ideep/utils/ideep_context.h | 11 ++-------- caffe2/ideep/utils/ideep_register.cc | 3 +++ caffe2/mkl/utils/mkl_context.cc | 4 ++++ caffe2/mkl/utils/mkl_context.h | 11 ++-------- caffe2/proto/caffe2_pb.h | 32 +++++++++++++++++++++++++++- caffe2/python/pybind_state.h | 2 +- 21 files changed, 122 insertions(+), 94 deletions(-) diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp index e34c6880c0210a..5fa747180d1214 100644 --- a/aten/src/ATen/core/context_base.cpp +++ b/aten/src/ATen/core/context_base.cpp @@ -1,5 +1,16 @@ #include +namespace at { + +AT_DEFINE_TYPED_REGISTRY( + ContextRegistry, + DeviceType, + BaseContext, + std::unique_ptr, + at::Device); + +} // namespace at + namespace caffe2 { // TODO: rename context.h -> context_cpu.h & context_base.h -> context.h diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 45b38387b46ca6..be9c36bfd60796 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -6,11 +6,12 @@ #include #include -#include +#include +#include #include +#include #include #include -#include namespace 
caffe2 { class Event; @@ -31,11 +32,6 @@ class AT_CORE_API BaseStaticContext { virtual std::pair New(size_t nbytes) const = 0; - virtual std::unique_ptr CreateContext() = 0; - - virtual std::unique_ptr CreateContext( - const caffe2::DeviceOption&) = 0; - virtual DeviceType GetDeviceType() = 0; /* @@ -184,6 +180,22 @@ class AT_CORE_API BaseContext { } }; +// Context constructor registry +AT_DECLARE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + BaseContext, + std::unique_ptr, + at::Device); + +#define REGISTER_CONTEXT(type, ...) \ + AT_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__) + +inline std::unique_ptr CreateContext( + const at::Device& device) { + return ContextRegistry()->Create(device.type(), device); +} + } // namespace at namespace caffe2 { diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 7ff5a2b25eacc1..38125b242def2f 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -196,7 +196,7 @@ void TensorSerializer::Serialize( const TensorProto::DataType data_type = TypeMetaToDataType(input.meta()); proto.set_data_type(data_type); StoreDeviceDetail(input, &proto); - auto uniq_ptr = input.GetStaticContext()->CreateContext(); + auto uniq_ptr = CreateContext(input.GetDevice()); // A lot of copypaste is error prone. Should we create a macro for this? switch (data_type) { case TensorProto_DataType_FLOAT: @@ -370,8 +370,7 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) { // We create a local context for deserializing. Since Caffe2 contexts are // usually lightweight, this should not involve too much overhead. - auto uniq_ptr = - tensor->GetStaticContext()->CreateContext(proto.device_detail()); + auto uniq_ptr = CreateContext(OptionToDevice(proto.device_detail())); auto context = uniq_ptr.get(); context->SwitchToDevice(0); vector dims; diff --git a/caffe2/core/context.cc b/caffe2/core/context.cc index 30819afdc4ce3f..94047eb71ee0b6 100644 --- a/caffe2/core/context.cc +++ b/caffe2/core/context.cc @@ -5,6 +5,10 @@ #include #endif +namespace at { + +REGISTER_CONTEXT(DeviceType::CPU, caffe2::CPUContext); +} // namespace at namespace caffe2 { uint32_t RandomNumberSeed() { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index aff66534d22198..af66396af72c44 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -50,6 +50,8 @@ class CAFFE2_API CPUContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_CPU); } + explicit CPUContext(const at::Device& device) + : CPUContext(DeviceToOption(device)) {} ~CPUContext() noexcept override {} @@ -192,15 +194,6 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { return data_and_deleter; } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return CPU; } diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc index b61b73cbad1cb5..99996d9e165b9b 100644 --- a/caffe2/core/context_base.cc +++ b/caffe2/core/context_base.cc @@ -1,4 +1,5 @@ #include "context_base.h" namespace caffe2 { + } // namespace caffe2 diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h index 3a6dfad5b95cc3..50b9252a3cf3dd 100644 --- a/caffe2/core/context_base.h +++ 
b/caffe2/core/context_base.h @@ -5,3 +5,5 @@ #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/proto/caffe2_pb.h" + +namespace caffe2 {} // namespace caffe2 diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 1eaa579ee0cdbe..0d9e2686212a1e 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -57,6 +57,11 @@ CAFFE2_DEFINE_int( 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + +REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext); +} // namespace at + namespace caffe2 { ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 5fcdb98b100794..ce73f5f942828b 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -142,6 +142,8 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { // The default cuda context constructor. explicit CUDAContext(const int gpu_id = -1); explicit CUDAContext(const DeviceOption& option); + explicit CUDAContext(const at::Device& device) + : CUDAContext(DeviceToOption(device)) {} ~CUDAContext() override { if (curand_generator_) { @@ -385,19 +387,6 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return CUDA; } diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc index 0fabb20a642c94..3eadaf0e71b118 100644 --- a/caffe2/core/hip/context_hip.cc +++ b/caffe2/core/hip/context_hip.cc @@ -50,6 +50,11 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb, 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + +REGISTER_CONTEXT(DeviceType::HIP, caffe2::HIPContext); +} // namespace at + namespace caffe2 { thread_local ThreadLocalHIPObjects HIPContext::hip_objects_; @@ -408,13 +413,12 @@ void HIPStaticContext::Delete(void* ptr) { g_hip_device_affiliation.erase(it); break; } - case HipMemoryPoolType::THC: - { - HIP_ENFORCE(g_thc_allocator->Free(ptr)); - if (FLAGS_caffe2_gpu_memory_tracking) { - g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); - } - break; + case HipMemoryPoolType::THC: { + HIP_ENFORCE(g_thc_allocator->Free(ptr)); + if (FLAGS_caffe2_gpu_memory_tracking) { + g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); + } + break; } } } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index 5a7613cf934fd0..fb04336354e704 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -127,6 +127,8 @@ class HIPContext final : public BaseContext { // The default HIP context constructor. 
explicit HIPContext(const int gpu_id = -1); explicit HIPContext(const DeviceOption& option); + explicit HIPContext(const at::Device& device) + : HIPContext(DeviceToOption(device)) {} ~HIPContext() override { if (hiprand_generator_) { @@ -374,19 +376,6 @@ class HIPStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return HIP; } diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index 7db975077ea8b9..634323af1eb4d0 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -172,7 +172,7 @@ class Registerer { key, \ RegistryName(), \ Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - at::demangle_type<__VA_ARGS__>()); \ + at::demangle_type<__VA_ARGS__>()); \ } // CAFFE_DECLARE_REGISTRY and CAFFE_DEFINE_REGISTRY are hard-wired to use string diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 1e4cac2788b560..7e563e37d3418e 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -130,14 +130,14 @@ class CAFFE2_API Tensor final { return impl_.get()->GetStaticContext(); } - std::unique_ptr CreateContext() const { - return impl_.get()->CreateContext(); - } - DeviceType GetDeviceType() const { return impl_.get()->GetDeviceType(); } + at::Device GetDevice() const { + return impl_.get()->GetDevice(); + } + void CopyFrom(const Tensor& src, BaseContext* context = nullptr) const { impl_.get()->CopyFrom(*src.impl_.get(), context); } diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc index cff98c6101ea5d..74aa5385ed0199 100644 --- a/caffe2/core/tensor_impl.cc +++ b/caffe2/core/tensor_impl.cc @@ -1,5 +1,5 @@ #include "caffe2/core/tensor_impl.h" - +#include "caffe2/core/context_base.h" #include "caffe2/core/flags.h" CAFFE2_DEFINE_bool( diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 53c812f55e297b..eb59291689cbc5 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -3,7 +3,6 @@ #include #include #include -#include #include "caffe2/core/allocator.h" #include "caffe2/core/common.h" @@ -112,21 +111,14 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return get_static_context(device_type); } - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - at::DeviceType GetDeviceType() const { return storage_.device_type(); } + at::Device GetDevice() const { + return storage_.device(); + } + /** * @brief Copies the data from a source tensor, with a contex provided to * carry out the underlying memcpy operation. 
@@ -167,8 +159,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // knows how to copy between CPU and that context if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + CreateContext(src.GetDevice()) + ->CopyBytesToDevice( + nbytes(), + src.raw_data(), + raw_mutable_data(), + GetDeviceType()); } else { CAFFE_ENFORCE( context->device_type() == src.GetDeviceType(), @@ -180,8 +176,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // In case source context is CPU, and target context is non-CPU // We'll have to create a Context from target and perform the // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); + CreateContext(GetDevice()) + ->CopyBytesFromCPU(nbytes(), src.raw_data(), raw_mutable_data()); } } } diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index f50a4f34c66789..087078c507d164 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -20,6 +20,8 @@ class IDEEPContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_IDEEP); } + explicit IDEEPContext(const at::Device& device) + : IDEEPContext(DeviceToOption(device)) {} ~IDEEPContext() noexcept override {} @@ -178,15 +180,6 @@ class IDEEPStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return IDEEP; } diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc index 020e22fa6143ed..53b8bcbf072c5a 100644 --- a/caffe2/ideep/utils/ideep_register.cc +++ b/caffe2/ideep/utils/ideep_register.cc @@ -4,6 +4,9 @@ #include #include "ideep_context.h" +namespace at { +REGISTER_CONTEXT(DeviceType::IDEEP, caffe2::IDEEPContext); +} // namespace at namespace caffe2 { CAFFE_KNOWN_TYPE(ideep::tensor); diff --git a/caffe2/mkl/utils/mkl_context.cc b/caffe2/mkl/utils/mkl_context.cc index 6e9075df43475f..8c66bc111282ac 100644 --- a/caffe2/mkl/utils/mkl_context.cc +++ b/caffe2/mkl/utils/mkl_context.cc @@ -3,6 +3,10 @@ #include "mkl_context.h" #include "caffe2/core/event_cpu.h" +namespace at { + +REGISTER_CONTEXT(DeviceType::MKLDNN, caffe2::MKLContext); +} // namespace at namespace caffe2 { // MKL events are the same as CPU events diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 0a7b5808a446be..8364026d91c651 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -29,6 +29,8 @@ class MKLContext : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_MKLDNN); } + explicit MKLContext(const at::Device& device) + : MKLContext(DeviceToOption(device)) {} ~MKLContext() override {} @@ -155,15 +157,6 @@ class MKLStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return MKLDNN; } diff --git a/caffe2/proto/caffe2_pb.h 
b/caffe2/proto/caffe2_pb.h index 0a08c8db241e98..a0d455d4519d74 100644 --- a/caffe2/proto/caffe2_pb.h +++ b/caffe2/proto/caffe2_pb.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -47,6 +47,10 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) { } } +inline CAFFE2_API DeviceType ProtoToType(int p) { + return ProtoToType(static_cast(p)); +} + inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { switch (t) { case DeviceType::CPU: @@ -77,4 +81,30 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { } } +inline CAFFE2_API caffe2::DeviceOption DeviceToOption( + const at::Device& device) { + caffe2::DeviceOption option; + auto type = device.type(); + option.set_device_type(TypeToProto(type)); + // sets the gpu_id to -1 means we'll use the current gpu id when the function + // is being called, see context_gpu.cu for more info. + if (type == at::DeviceType::CUDA) { + option.set_cuda_gpu_id(device.index()); + } else if (type == at::DeviceType::HIP) { + option.set_hip_gpu_id(device.index()); + } + return option; +} + +inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) { + at::Device device(ProtoToType(option.device_type())); + auto type = device.type(); + if (type == at::DeviceType::CUDA) { + device.set_index(option.cuda_gpu_id()); + } else if (type == at::DeviceType::HIP) { + device.set_index(option.hip_gpu_id()); + } + return device; +} + } // namespace caffe2 diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 59f39dd313032c..97ec6628fe3f27 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -148,7 +148,7 @@ class TensorFetcher : public BlobFetcherBase { } if (result.copied) { - auto context = tensor.GetStaticContext()->CreateContext(); + auto context = CreateContext(tensor.GetDeviceType()); context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr); context->FinishDeviceComputation(); } From a8309640072a5851f61e6d7ebd698dff0239f811 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Mon, 24 Sep 2018 17:39:32 -0700 Subject: [PATCH 12/51] Eliminate no-op adds and muls in peephole pass (#11801) Summary: Because we emit a lot of them in our symbolic AD. This brings down the backward time of an LSTM I'm testing from 14.2ms to 12.5ms (a 15% improvement). 
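For context, here is a minimal sketch of the kind of identity op this removes, written in the style of the existing test_jit.py pass tests (not part of the patch; the 'peephole' pass name passed to run_pass is assumed here):

```python
# Inside a JitTestCase-style test method:
@torch.jit.script
def f(x):
    return x * 1 + 0  # lowered to aten::mul(x, 1) and aten::add(..., 0) with alpha 1

graph = f.graph.copy()
self.run_pass('peephole', graph)
# After the pass, the multiply-by-one and add-of-zero should be folded away,
# leaving a graph that effectively just returns its input.
self.assertTrue(all(node.kind() not in ('aten::mul', 'aten::add')
                    for node in graph.nodes()))
```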
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11801 Differential Revision: D9916815 Pulled By: apaszke fbshipit-source-id: 2d9cb886c424ccd43b9f996aad89950d3bddf494 --- ...ript.test_lstm_fusion_cuda-backward.expect | 78 ++++++------ ...pt.test_milstm_fusion_cuda-backward.expect | 114 +++++++++--------- .../TestScript.test_scalar_fusion.expect | 11 +- test/test_jit.py | 2 +- torch/csrc/jit/passes/peephole.cpp | 20 ++- 5 files changed, 116 insertions(+), 109 deletions(-) diff --git a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect index efb3d272bb4c2f..cbdbc744b5e85d 100644 --- a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect @@ -17,20 +17,18 @@ graph(%0 : Float(*, *) %cellgate : Float(*, *) %outgate : Float(*, *) %18 : Float(*, *)) { - %19 : int = prim::Constant[value=1]() - %20 : Float(*, *), %21 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %11, %1, %18, %0) - %22 : Float(*, *) = aten::mul(%20, %19) - %23 : Float(*, *) = aten::t(%13) - %24 : Float(*, *) = aten::mm(%22, %23) - %25 : Float(*, *) = aten::t(%10) - %26 : Float(*, *) = aten::mm(%25, %22) - %27 : Float(*, *) = aten::t(%26) - %28 : Float(*, *) = aten::t(%12) - %29 : Float(*, *) = aten::mm(%20, %28) - %30 : Float(*, *) = aten::t(%9) - %31 : Float(*, *) = aten::mm(%30, %20) - %32 : Float(*, *) = aten::t(%31) - return (%32, %29, %27, %24, %22, %22, %21); + %19 : Float(*, *), %20 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %11, %1, %18, %0) + %21 : Float(*, *) = aten::t(%13) + %22 : Float(*, *) = aten::mm(%19, %21) + %23 : Float(*, *) = aten::t(%10) + %24 : Float(*, *) = aten::mm(%23, %19) + %25 : Float(*, *) = aten::t(%24) + %26 : Float(*, *) = aten::t(%12) + %27 : Float(*, *) = aten::mm(%19, %26) + %28 : Float(*, *) = aten::t(%9) + %29 : Float(*, *) = aten::mm(%28, %19) + %30 : Float(*, *) = aten::t(%29) + return (%30, %27, %25, %22, %19, %19, %20); } with prim::FusionGroup_0 = graph(%0 : Float(*, *) %1 : Float(*, *) @@ -52,31 +50,29 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %17 : Float(*, *) = aten::add(%7, %14, %16) %18 : Float(*, *) = aten::mul(%17, %1) %19 : Float(*, *) = aten::mul(%5, %6) - %20 : int = prim::Constant[value=1]() - %21 : Float(*, *) = aten::mul(%17, %20) - %22 : Float(*, *) = aten::mul(%21, %2) - %23 : Float(*, *) = aten::mul(%21, %0) - %24 : Float(*, *) = aten::mul(%17, %4) - %25 : Float(*, *) = aten::neg(%3) - %26 : int = prim::Constant[value=1]() - %27 : Float(*, *) = aten::add(%25, %26, %26) - %28 : Float(*, *) = aten::mul(%19, %3) - %29 : Float(*, *) = aten::mul(%28, %27) - %30 : Float(*, *) = aten::mul(%2, %2) - %31 : Float(*, *) = aten::neg(%30) - %32 : int = prim::Constant[value=1]() - %33 : Float(*, *) = aten::add(%31, %32, %32) - %34 : Float(*, *) = aten::mul(%23, %33) - %35 : Float(*, *) = aten::neg(%1) - %36 : int = prim::Constant[value=1]() - %37 : Float(*, *) = aten::add(%35, %36, %36) - %38 : Float(*, *) = aten::mul(%24, %1) - %39 : Float(*, *) = aten::mul(%38, %37) - %40 : Float(*, *) = aten::neg(%0) - %41 : int = prim::Constant[value=1]() - %42 : Float(*, *) = aten::add(%40, %41, %41) - %43 : Float(*, *) = aten::mul(%22, %0) - %44 : Float(*, *) = aten::mul(%43, %42) - %45 : Float(*, *) = prim::FusedConcat[dim=1](%44, %39, %34, %29) - return (%45, %18); + %20 : Float(*, *) = aten::mul(%17, %2) + %21 : Float(*, *) = aten::mul(%17, 
%0) + %22 : Float(*, *) = aten::mul(%17, %4) + %23 : Float(*, *) = aten::neg(%3) + %24 : int = prim::Constant[value=1]() + %25 : Float(*, *) = aten::add(%23, %24, %24) + %26 : Float(*, *) = aten::mul(%19, %3) + %27 : Float(*, *) = aten::mul(%26, %25) + %28 : Float(*, *) = aten::mul(%2, %2) + %29 : Float(*, *) = aten::neg(%28) + %30 : int = prim::Constant[value=1]() + %31 : Float(*, *) = aten::add(%29, %30, %30) + %32 : Float(*, *) = aten::mul(%21, %31) + %33 : Float(*, *) = aten::neg(%1) + %34 : int = prim::Constant[value=1]() + %35 : Float(*, *) = aten::add(%33, %34, %34) + %36 : Float(*, *) = aten::mul(%22, %1) + %37 : Float(*, *) = aten::mul(%36, %35) + %38 : Float(*, *) = aten::neg(%0) + %39 : int = prim::Constant[value=1]() + %40 : Float(*, *) = aten::add(%38, %39, %39) + %41 : Float(*, *) = aten::mul(%20, %0) + %42 : Float(*, *) = aten::mul(%41, %40) + %43 : Float(*, *) = prim::FusedConcat[dim=1](%42, %37, %32, %27) + return (%43, %18); } diff --git a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect index 1221d05e519257..b0dc85644751d8 100644 --- a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect @@ -27,14 +27,17 @@ graph(%0 : Float(*, *) %outgate : Float(*, *) %27 : Float(*, *)) { %28 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %17, %0, %27, %1) - %29 : Float(*, *), %30 : Float(*, *), %31 : Float(*, *), %32 : Float(*, *), %33 : Float(*, *), %34 : Float(*, *) = prim::FusionGroup_1[device=0](%14, %15, %Wx, %28, %Uz, %22, %16) - %35 : Float(*, *) = aten::t(%13) - %36 : Float(*, *) = aten::mm(%35, %31) - %37 : Float(*, *) = aten::t(%36) - %38 : Float(*, *) = aten::t(%12) - %39 : Float(*, *) = aten::mm(%38, %29) - %40 : Float(*, *) = aten::t(%39) - return (%40, %37, %30, %32, %33, %34); + %29 : Float(*, *) = aten::mul(%28, %Uz) + %30 : Float(*, *) = aten::mul(%28, %Wx) + %31 : Float(*, *) = prim::FusionGroup_1[device=0](%28, %22, %16) + %32 : Float(*, *), %33 : Float(*, *) = prim::FusionGroup_2[device=0](%14, %28, %15, %Wx, %Uz) + %34 : Float(*, *) = aten::t(%13) + %35 : Float(*, *) = aten::mm(%34, %31) + %36 : Float(*, *) = aten::t(%35) + %37 : Float(*, *) = aten::t(%12) + %38 : Float(*, *) = aten::mm(%37, %32) + %39 : Float(*, *) = aten::t(%38) + return (%39, %36, %33, %30, %29, %28); } with prim::FusionGroup_0 = graph(%0 : Float(*, *) %1 : Float(*, *) @@ -53,58 +56,51 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %14 : Float(*, *) = aten::mul(%9, %13) %15 : int = prim::Constant[value=1]() %16 : Float(*, *) = aten::add(%5, %14, %15) - %17 : int = prim::Constant[value=1]() - %18 : Float(*, *) = aten::mul(%16, %17) - %19 : Float(*, *) = aten::mul(%18, %2) - %20 : Float(*, *) = aten::mul(%18, %0) - %21 : Float(*, *) = aten::mul(%16, %4) - %22 : Float(*, *) = aten::neg(%3) - %23 : int = prim::Constant[value=1]() - %24 : Float(*, *) = aten::add(%22, %23, %23) - %25 : Float(*, *) = aten::mul(%8, %3) - %26 : Float(*, *) = aten::mul(%25, %24) - %27 : Float(*, *) = aten::mul(%2, %2) - %28 : Float(*, *) = aten::neg(%27) - %29 : int = prim::Constant[value=1]() - %30 : Float(*, *) = aten::add(%28, %29, %29) - %31 : Float(*, *) = aten::mul(%20, %30) - %32 : Float(*, *) = aten::neg(%1) - %33 : int = prim::Constant[value=1]() - %34 : Float(*, *) = aten::add(%32, %33, %33) - %35 : Float(*, *) = aten::mul(%21, %1) - %36 : Float(*, *) = aten::mul(%35, %34) - %37 : Float(*, *) = aten::neg(%0) - 
%38 : int = prim::Constant[value=1]() - %39 : Float(*, *) = aten::add(%37, %38, %38) - %40 : Float(*, *) = aten::mul(%19, %0) - %41 : Float(*, *) = aten::mul(%40, %39) - %42 : Float(*, *) = prim::FusedConcat[dim=1](%41, %36, %31, %26) - return (%42); + %17 : Float(*, *) = aten::mul(%16, %2) + %18 : Float(*, *) = aten::mul(%16, %0) + %19 : Float(*, *) = aten::mul(%16, %4) + %20 : Float(*, *) = aten::neg(%3) + %21 : int = prim::Constant[value=1]() + %22 : Float(*, *) = aten::add(%20, %21, %21) + %23 : Float(*, *) = aten::mul(%8, %3) + %24 : Float(*, *) = aten::mul(%23, %22) + %25 : Float(*, *) = aten::mul(%2, %2) + %26 : Float(*, *) = aten::neg(%25) + %27 : int = prim::Constant[value=1]() + %28 : Float(*, *) = aten::add(%26, %27, %27) + %29 : Float(*, *) = aten::mul(%18, %28) + %30 : Float(*, *) = aten::neg(%1) + %31 : int = prim::Constant[value=1]() + %32 : Float(*, *) = aten::add(%30, %31, %31) + %33 : Float(*, *) = aten::mul(%19, %1) + %34 : Float(*, *) = aten::mul(%33, %32) + %35 : Float(*, *) = aten::neg(%0) + %36 : int = prim::Constant[value=1]() + %37 : Float(*, *) = aten::add(%35, %36, %36) + %38 : Float(*, *) = aten::mul(%17, %0) + %39 : Float(*, *) = aten::mul(%38, %37) + %40 : Float(*, *) = prim::FusedConcat[dim=1](%39, %34, %29, %24) + return (%40); } -with prim::FusionGroup_1 = graph(%0 : Float(*) - %1 : Float(*) - %2 : Float(*, *) +with prim::FusionGroup_1 = graph(%0 : Float(*, *) + %1 : Float(*, *) + %2 : Float(*)) { + %3 : Float(*, *) = aten::mul(%0, %2) + %4 : Float(*, *) = aten::mul(%0, %1) + %5 : int = prim::Constant[value=1]() + %6 : Float(*, *) = aten::add(%3, %4, %5) + return (%6); +} +with prim::FusionGroup_2 = graph(%0 : Float(*) + %1 : Float(*, *) + %2 : Float(*) %3 : Float(*, *) - %4 : Float(*, *) - %5 : Float(*, *) - %6 : Float(*)) { - %7 : int = prim::Constant[value=1]() - %8 : int = prim::Constant[value=1]() + %4 : Float(*, *)) { + %5 : Float(*, *) = aten::mul(%1, %4) + %6 : Float(*, *) = aten::mul(%5, %3) + %7 : Float(*, *) = aten::mul(%1, %2) + %8 : Float(*, *) = aten::mul(%5, %0) %9 : int = prim::Constant[value=1]() - %10 : int = prim::Constant[value=1]() - %11 : Float(*, *) = aten::mul(%3, %10) - %12 : Float(*, *) = aten::mul(%11, %4) - %13 : Float(*, *) = aten::mul(%11, %2) - %14 : Float(*, *) = aten::mul(%11, %6) - %15 : Float(*, *) = aten::mul(%3, %5) - %16 : int = prim::Constant[value=1]() - %17 : int = prim::Constant[value=1]() - %18 : Float(*, *) = aten::add(%14, %15, %17) - %19 : Float(*, *) = aten::mul(%3, %4) - %20 : Float(*, *) = aten::mul(%19, %2) - %21 : Float(*, *) = aten::mul(%11, %1) - %22 : Float(*, *) = aten::mul(%19, %0) - %23 : int = prim::Constant[value=1]() - %24 : Float(*, *) = aten::add(%21, %22, %23) - return (%24, %20, %18, %13, %12, %11); + %10 : Float(*, *) = aten::add(%7, %8, %9) + return (%10, %6); } diff --git a/test/expect/TestScript.test_scalar_fusion.expect b/test/expect/TestScript.test_scalar_fusion.expect index 9d45a9f765d632..e2fd92a0f5739c 100644 --- a/test/expect/TestScript.test_scalar_fusion.expect +++ b/test/expect/TestScript.test_scalar_fusion.expect @@ -1,12 +1,13 @@ graph(%x : Float() %y : Float()) { - %2 : Float() = prim::FusionGroup_0[device=-1](%x, %y) + %2 : Float() = prim::FusionGroup_0[device=-1](%y, %x) return (%2); } with prim::FusionGroup_0 = graph(%0 : Float() %1 : Float()) { - %2 : Float() = aten::type_as(%1, %0) - %3 : int = prim::Constant[value=1]() - %4 : Float() = aten::add(%0, %2, %3) - return (%4); + %2 : int = prim::Constant[value=2]() + %3 : Float() = aten::mul(%2, %1) + %4 : int = 
prim::Constant[value=1]() + %5 : Float() = aten::add(%3, %0, %4) + return (%5); } diff --git a/test/test_jit.py b/test/test_jit.py index c8268f03fa737c..708533ae737760 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3516,7 +3516,7 @@ def test_fuser_multiple_blocks(this, that, theother, meme): @enable_cpu_fuser def test_scalar_fusion(self): def fn(x, y): - return x + y.type_as(x) + return 2 * x + y x = torch.tensor(0.1, dtype=torch.float, device='cpu') y = torch.tensor(1, dtype=torch.float, device='cpu') diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index ab4a75375081ae..176166218fc01a 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ b/torch/csrc/jit/passes/peephole.cpp @@ -19,7 +19,7 @@ void PeepholeOptimize(Block * block) { auto* node = *it; for (Block * sub_block : node->blocks()) { - PeepholeOptimize(sub_block); + PeepholeOptimize(sub_block); } // XXX: remember that if you want to simplify an expression by combining multiple nodes @@ -41,8 +41,8 @@ void PeepholeOptimize(Block * block) { } } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { // x.type_as(y) == x iff x.type() == y.type() - auto self_type = node->input(0)->type()->cast(); - auto other_type = node->input(1)->type()->cast(); + auto self_type = node->input(0)->type()->cast(); + auto other_type = node->input(1)->type()->cast(); if (self_type && other_type && self_type->scalarType() == other_type->scalarType() && self_type->device() == other_type->device()) { @@ -100,6 +100,20 @@ void PeepholeOptimize(Block * block) { } } } + // TODO: this doesn't work with Scalar-Tensor ops! We should canonicalize those + } else if (node->matches("aten::mul(Tensor self, Scalar other) -> Tensor", /*with_const=*/attr::other) || + node->matches("aten::div(Tensor self, Scalar other) -> Tensor", /*with_const=*/attr::other)) { + // x * 1 == x / 1 == x + if (node->get(attr::other)->toDouble() == 1) { + node->output()->replaceAllUsesWith(node->input(0)); + } + } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*with_const=*/{attr::alpha, attr::other}) || + node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*with_const=*/{attr::alpha, attr::other})) { + // x + 0 == x - 0 == x + if (node->get(attr::alpha)->toDouble() == 1 && + node->get(attr::other)->toDouble() == 0) { + node->output()->replaceAllUsesWith(node->input(0)); + } } else if(node->kind() == prim::TensorToNum || node->kind() == prim::ImplicitTensorToNum) { Node* input_node = node->input()->node(); if (input_node->kind() == prim::NumToTensor) { From 9068a46dba7090df80079dc0a88bcda4b472e8cd Mon Sep 17 00:00:00 2001 From: Spandan Tiwari Date: Mon, 24 Sep 2018 19:54:49 -0700 Subject: [PATCH 13/51] Fix deprecated function warning in ONNX model test. (#11827) Summary: When running /test/onnx/test_models.py, we see deprecation warnings in the test points for `super_resolution` and `squeezenet` models. This change updates those models to use the recommended methods, instead of the deprecated ones. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11827 Reviewed By: houseroad Differential Revision: D10023998 Pulled By: ezyang fbshipit-source-id: ee4e14304678c532ebd574e7bd143e3b311995ab --- test/onnx/model_defs/squeezenet.py | 4 ++-- test/onnx/model_defs/super_resolution.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/onnx/model_defs/squeezenet.py b/test/onnx/model_defs/squeezenet.py index e4ace18194ab71..2ee956b605cd13 100644 --- a/test/onnx/model_defs/squeezenet.py +++ b/test/onnx/model_defs/squeezenet.py @@ -79,9 +79,9 @@ def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): for m in self.modules(): if isinstance(m, nn.Conv2d): if m is final_conv: - init.normal(m.weight.data, mean=0.0, std=0.01) + init.normal_(m.weight.data, mean=0.0, std=0.01) else: - init.kaiming_uniform(m.weight.data) + init.kaiming_uniform_(m.weight.data) if m.bias is not None: m.bias.data.zero_() diff --git a/test/onnx/model_defs/super_resolution.py b/test/onnx/model_defs/super_resolution.py index d0ba46a22d05af..619d5f4a5b581a 100644 --- a/test/onnx/model_defs/super_resolution.py +++ b/test/onnx/model_defs/super_resolution.py @@ -24,7 +24,7 @@ def forward(self, x): return x def _initialize_weights(self): - init.orthogonal(self.conv1.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv2.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv3.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv4.weight) + init.orthogonal_(self.conv1.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv2.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv3.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv4.weight) From 5d4624a1d9170fcd05fb0c14844298fb3010d909 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Mon, 24 Sep 2018 19:58:28 -0700 Subject: [PATCH 14/51] Fix return temporary as reference in MPI backend (#11947) Summary: The MPI async work class returned a temporary as reference, which is invalid (hat tip to colesbury for noticing it). This change fixes that and uses a std::exception_ptr to hold on to the exception if applicable, and then returns the reference by throwing it and returning it, like the existing code path. 
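A minimal self-contained sketch (a hypothetical Work class, not the actual c10d API) of the pattern the fix relies on: keep a std::exception_ptr as a member and materialize the reference by rethrowing the stored exception and returning the caught object, so no temporary escapes.

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

class Work {
 public:
  void fail(const std::string& msg) {
    exception_ = std::make_exception_ptr(std::runtime_error(msg));
  }
  bool isSuccess() const {
    return !exception_;
  }
  // Precondition (same as in the real code): only call this after
  // isSuccess() returned false, so exception_ is non-null. The reference
  // obtained in the handler refers to the exception object owned by
  // exception_, which outlives this call.
  const std::exception& exception() const {
    try {
      std::rethrow_exception(exception_);
    } catch (const std::exception& e) {
      return e;
    }
  }

 private:
  std::exception_ptr exception_;
};

int main() {
  Work w;
  w.fail("simulated MPI failure");
  if (!w.isSuccess()) {
    std::cout << w.exception().what() << "\n";
  }
}

Holding the exception in a std::exception_ptr is what makes this safe: the pointer owns the exception object, so the returned reference stays valid for as long as the Work object does, unlike the temporary std::runtime_error the old code returned.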
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11947 Differential Revision: D10019928 Pulled By: pietern fbshipit-source-id: 5a8ed0e894615a09224ca5e48c8b3104275a3019 --- torch/lib/c10d/ProcessGroupMPI.cpp | 32 ++++++++++++++++++++++-------- torch/lib/c10d/ProcessGroupMPI.hpp | 6 ++++-- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 63846b443ea072..033d5d24cb26dc 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -98,7 +98,7 @@ bool ProcessGroupMPI::WorkMPI::isCompleted() { } bool ProcessGroupMPI::WorkMPI::isSuccess() const { - return !workException_; + return !exception_; } void ProcessGroupMPI::WorkMPI::synchronize() {} @@ -124,14 +124,14 @@ void ProcessGroupMPI::WorkMPI::finishWithException( { std::unique_lock lock(workMutex_); completed_ = true; - workException_ = caughtWorkException; + exception_ = caughtWorkException; } workCV_.notify_all(); } const std::exception& ProcessGroupMPI::WorkMPI::exception() const { try { - std::rethrow_exception(workException_); + std::rethrow_exception(exception_); } catch (const std::exception& e) { return e; } @@ -169,6 +169,11 @@ bool ProcessGroupMPI::AsyncWork::isCompleted() { *srcRank_ = status_.MPI_SOURCE; } + // Populate exception if request was not successful + if (status_.MPI_ERROR != MPI_SUCCESS) { + populateException(); + } + return true; } @@ -194,19 +199,30 @@ bool ProcessGroupMPI::AsyncWork::wait() { *srcRank_ = status_.MPI_SOURCE; } - return status_.MPI_ERROR == MPI_SUCCESS; + auto ok = (status_.MPI_ERROR == MPI_SUCCESS); + + // Populate exception if request was not successful + if (!ok) { + populateException(); + } + + return ok; } const std::exception& ProcessGroupMPI::AsyncWork::exception() const { - if (request_ != MPI_REQUEST_NULL) { - throw std::runtime_error( - "Invalid call to AsyncWork::exception before work has completed"); + try { + std::rethrow_exception(exception_); + } catch (const std::exception& e) { + return e; } +} +void ProcessGroupMPI::AsyncWork::populateException() { std::array buf; int len = buf.size(); MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len)); - return std::runtime_error(std::string(buf.data(), len)); + exception_ = + std::make_exception_ptr(std::runtime_error(std::string(buf.data(), len))); } // Static global states diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index 5bd2b303c1a4e1..8d3018be903253 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -101,8 +101,7 @@ class ProcessGroupMPI : public ProcessGroup { std::mutex workMutex_; std::condition_variable workCV_; std::atomic completed_; - - std::exception_ptr workException_; + std::exception_ptr exception_; friend class ProcessGroupMPI; }; @@ -123,10 +122,13 @@ class ProcessGroupMPI : public ProcessGroup { const std::exception& exception() const override; protected: + void populateException(); + at::Tensor tensor_; MPI_Request request_; int* const srcRank_; MPI_Status status_; + std::exception_ptr exception_; }; // Constructor will spawn up the worker thread loop From 86e025fca213ec0642dd4e49c969e0bbbe86a9ea Mon Sep 17 00:00:00 2001 From: John Date: Mon, 24 Sep 2018 20:17:55 -0700 Subject: [PATCH 15/51] magma-cuda should reference updated versions (#12000) Summary: Source build doc section **LAPACK GPU** only lists magma-cuda80 The magma-cuda version should reflect the installed version of cuda. 
- Verified on ubuntu with magma-cuda92 with build and test - Verified 91 is available Pull Request resolved: https://github.com/pytorch/pytorch/pull/12000 Differential Revision: D10024158 Pulled By: soumith fbshipit-source-id: a34c85a5e87b52657f1e6f7b21d235306ab7b2aa --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e0aa68bf8b3e29..918aac0627cf2d 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing conda install -c mingfeima mkldnn # Add LAPACK support for the GPU -conda install -c pytorch magma-cuda80 # or magma-cuda90 if CUDA 9 +conda install -c pytorch magma-cuda92 # or [magma-cuda80 | magma-cuda91] depending on your cuda version ``` On macOS From dfa03e94ebf24b12e889f749c481ed687441cf75 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 24 Sep 2018 20:44:38 -0700 Subject: [PATCH 16/51] Fix mispelling of AVAILABLE. (#12016) Summary: Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/12016 Reviewed By: pietern Differential Revision: D10010808 Pulled By: ezyang fbshipit-source-id: ff6394ae9a53f7fdad2cadb4e019e09ac63bba96 --- torch/distributed/distributed_c10d.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 0568e4261f4480..0da3f31b22b130 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -9,19 +9,19 @@ from . import ProcessGroupGloo -_MPI_AVAILBLE = True -_NCCL_AVAILBLE = True +_MPI_AVAILABLE = True +_NCCL_AVAILABLE = True try: from. import ProcessGroupMPI except ImportError: - _MPI_AVAILBLE = False + _MPI_AVAILABLE = False try: from. import ProcessGroupNCCL except ImportError: - _NCCL_AVAILBLE = False + _NCCL_AVAILABLE = False class DistBackend(object): @@ -166,7 +166,7 @@ def is_mpi_available(): Checks if MPI is available """ - return _MPI_AVAILBLE + return _MPI_AVAILABLE def is_nccl_available(): @@ -174,7 +174,7 @@ def is_nccl_available(): Checks if NCCL is available """ - return _NCCL_AVAILBLE + return _NCCL_AVAILABLE def is_initialized(): From 17a65bf9b680ae04522c11462f5fc243e525f07c Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 24 Sep 2018 22:52:14 -0700 Subject: [PATCH 17/51] Removing some dependency edges from Blob to other caffe2 (#11923) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11923 This is pre-work to allow moving Blob to ATen/core, which cannot depend on caffe2 anymore. (1) Removing the Blob -> Tensor dependency allows us to move Blob to ATen/core and use it inside IValue without having to wait for the Tensor merge to be complete. (2) In the final Blob design, we want it to be a very small class that doesn't have any special treatment for Tensor (or to be more correct, doesn't allow storing Tensor anymore), so this is anyhow the direction we want to go. This changes call sites that will have to be moved to IValue later, but they cannot be moved to IValue directly, because for that, IValue first needs to be able to store Blob, which in turn first needs this diff and some other changes coming up in future diffs. 
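Before the codemod commands listed next, a minimal standalone sketch (toy names, not the caffe2 classes) of the shape of the change: the container keeps only the generic IsType<T>()/GetMutable<T>() accessors, and everything tensor-specific becomes a free function, so the container's header never needs to include the tensor header.

#include <iostream>
#include <memory>
#include <typeinfo>

// Toy type-erased container: knows nothing about any tensor type.
class ToyBlob {
 public:
  template <class T>
  bool IsType() const {
    return holder_ && holder_->type() == typeid(T);
  }

  template <class T>
  T* GetMutable() {
    if (!IsType<T>()) {
      holder_.reset(new Holder<T>());
    }
    return &static_cast<Holder<T>*>(holder_.get())->value;
  }

 private:
  struct HolderBase {
    virtual ~HolderBase() = default;
    virtual const std::type_info& type() const = 0;
  };
  template <class T>
  struct Holder : HolderBase {
    T value{};
    const std::type_info& type() const override { return typeid(T); }
  };
  std::unique_ptr<HolderBase> holder_;
};

// Tensor-specific helpers live next to the container as free functions.
struct ToyTensor { int device = -1; };

bool BlobIsTensorType(ToyBlob& blob, int device) {
  return blob.IsType<ToyTensor>() &&
         blob.GetMutable<ToyTensor>()->device == device;
}

ToyTensor* BlobGetMutableTensor(ToyBlob* blob, int device) {
  ToyTensor* t = blob->GetMutable<ToyTensor>();
  t->device = device;
  return t;
}

int main() {
  ToyBlob blob;
  BlobGetMutableTensor(&blob, /*device=*/0);
  std::cout << BlobIsTensorType(blob, 0) << "\n";  // prints 1
}

Call sites change accordingly: blob->GetMutableTensor(CPU) becomes BlobGetMutableTensor(blob, CPU), and blob.IsTensorType(CPU) becomes BlobIsTensorType(blob, CPU), which is exactly what the regexes below rewrite.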
Codemods: $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)\\.IsTensorType\\(" "BlobIsTensorType(\\1, " $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)->IsTensorType\\(" "BlobIsTensorType(*\\1, " $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)\\.GetMutableTensor\\(" "BlobGetMutableTensor(\\1, " $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)->GetMutableTensor\\(" "BlobGetMutableTensor(*\\1, " It is, however, not only these codemods because regex based refactoring was only able to match a small amount of the call sites. To catch more, I wouldn've needed a AST aware tool like clangr, which I didn't figure out how to use. Reviewed By: ezyang Differential Revision: D9979976 fbshipit-source-id: 2ea17724e223b5b73b44f99362727759ca689e61 --- binaries/benchmark_helper.cc | 6 +- binaries/speed_benchmark.cc | 2 +- caffe2/contrib/gloo/common.cc | 2 +- .../contrib/nervana/nervana_fc_op_gpu_test.cc | 2 +- .../contrib/tensorrt/tensorrt_tranformer.cc | 4 +- caffe2/core/blob.h | 54 +++--- caffe2/core/blob_gpu_test.cc | 8 +- caffe2/core/blob_serialization.cc | 3 +- caffe2/core/blob_test.cc | 30 ++-- caffe2/core/operator.h | 6 +- caffe2/core/plan_executor.cc | 3 +- caffe2/core/workspace.h | 2 +- caffe2/ideep/operators/concat_split_op.cc | 5 +- .../ideep/operators/operator_fallback_ideep.h | 6 +- caffe2/ideep/operators/utility_ops.cc | 2 +- caffe2/mkl/operators/operator_fallback_mkl.h | 6 +- .../contrib/arm-compute/operators/copy_op.cc | 4 +- .../arm-compute/test/gl_operator_test.h | 2 +- caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm | 4 +- .../mobile/contrib/ios/mpscnn/mpscnn_test.mm | 169 +++++++++--------- caffe2/mobile/contrib/ios/pool_test.cc | 2 +- caffe2/mobile/contrib/ios/resize_test.cc | 2 +- caffe2/mobile/contrib/nnapi/nnapi.cc | 2 +- .../mobile/contrib/nnapi/nnapi_benchmark.cc | 24 +-- caffe2/mobile/contrib/nnapi/nnapi_test.cc | 28 +-- .../mobile/contrib/opengl/test/opengl_test.cc | 94 +++++----- .../mobile/contrib/snpe/snpe_op_benchmark.cc | 8 +- caffe2/mobile/contrib/ulp2/ulp_test.cc | 6 +- caffe2/operators/batch_matmul_op_gpu_test.cc | 2 +- caffe2/operators/batch_matmul_op_test.cc | 2 +- caffe2/operators/boolean_unmask_ops_test.cc | 2 +- caffe2/operators/conv_op_shared.cc | 4 +- caffe2/operators/conv_op_shared_gpu.cc | 4 +- .../conv_transpose_op_mobile_test.cc | 4 +- caffe2/operators/dataset_ops.cc | 2 +- caffe2/operators/dropout_op_cudnn.cc | 2 +- caffe2/operators/elementwise_op_test.h | 2 +- .../operators/generate_proposals_op_test.cc | 6 +- caffe2/operators/index_ops.cc | 2 +- caffe2/operators/onnx_while_op.h | 12 +- caffe2/operators/onnxifi_op.cc | 2 +- caffe2/operators/operator_fallback_gpu.h | 6 +- .../operators/operator_fallback_gpu_test.cc | 4 +- caffe2/operators/reshape_op_gpu_test.cc | 2 +- .../rnn/recurrent_network_blob_fetcher_op.h | 5 +- .../rnn/recurrent_network_executor.h | 4 +- caffe2/operators/rnn/recurrent_network_op.h | 22 +-- caffe2/operators/roi_align_op_gpu_test.cc | 6 +- caffe2/operators/string_ops_test.cc | 14 +- caffe2/operators/stylizer_ops.cc | 4 +- caffe2/operators/tensor_protos_db_input.h | 6 +- caffe2/operators/tt_linear_op.h | 2 +- caffe2/operators/utility_ops_gpu_test.cc | 2 +- caffe2/operators/utility_ops_test.cc | 2 +- caffe2/opt/fusion.cc | 10 +- caffe2/opt/onnxifi_transformer.cc | 2 +- caffe2/predictor/predictor.cc | 6 +- caffe2/predictor/predictor_test.cc | 6 +- caffe2/python/pybind_state.cc | 2 +- caffe2/python/pybind_state.h | 15 +- caffe2/python/pybind_state_ideep.cc | 4 +- .../depthwise/depthwise3x3_conv_op_test.cc | 2 +- 
caffe2/share/contrib/nnpack/conv_op.cc | 11 +- caffe2/share/contrib/nnpack/nnpack_test.cc | 2 +- caffe2/utils/hip/math_blas_hip_test.cc | 32 ++-- caffe2/utils/math_gpu_test.cc | 48 ++--- 66 files changed, 380 insertions(+), 371 deletions(-) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 001c8e965f6a6e..f481a6292c7f56 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -163,7 +163,7 @@ void loadInput( CAFFE_THROW("Not support GPU on mobile."); #endif } else { - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { @@ -200,7 +200,7 @@ void fillInputBlob( int protos_size = tensor_kv.second.protos_size(); caffe2::TensorProto* tensor_proto = tensor_kv.second.mutable_protos(iteration % protos_size); - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); if (tensor_proto->data_type() == caffe2::TensorProto::STRING) { int total_size = tensor_proto->string_data_size(); for (size_t i = 0; i < total_size; i++) { @@ -298,7 +298,7 @@ void writeOutput( #endif } else { writeTextOutput( - workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU), + BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), output_prefix, name); } diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc index 5914e3f58b44b2..fd502cf3c078ab 100644 --- a/binaries/speed_benchmark.cc +++ b/binaries/speed_benchmark.cc @@ -137,7 +137,7 @@ int main(int argc, char** argv) { if (blob == nullptr) { blob = workspace->CreateBlob(input_names[i]); } - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { diff --git a/caffe2/contrib/gloo/common.cc b/caffe2/contrib/gloo/common.cc index 21ce0343d81819..d4929938f19174 100644 --- a/caffe2/contrib/gloo/common.cc +++ b/caffe2/contrib/gloo/common.cc @@ -12,7 +12,7 @@ namespace caffe2 { namespace gloo { void signalFailure(Blob* status_blob, std::exception& /* unused */) { - auto* res = status_blob->GetMutableTensor(CPU); + auto* res = BlobGetMutableTensor(status_blob, CPU); res->Resize(1); res->template mutable_data()[0] = 1; } diff --git a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc index 972d9231dcf9c6..9eee8973142ed7 100644 --- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc +++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc @@ -22,7 +22,7 @@ static void AddConstInput(const std::vector& shape, const float value, option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 3612d8b46f1f8d..2dd17e00169902 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (blob->IsTensorType(CPU)) { + if (BlobIsTensorType(*blob, CPU)) { const auto& cpu_tensor = blob->template Get(); 
CPUTensorToTensorProto(cpu_tensor, t); - } else if (blob->IsTensorType(CUDA)) { + } else if (BlobIsTensorType(*blob, CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index 870fc88322b158..80470cea443331 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -6,16 +6,16 @@ #include #include #include - -#include "caffe2/core/blob_serializer_base.h" #include "caffe2/core/common.h" + +#include #include "caffe2/core/logging.h" #include "caffe2/core/tensor.h" -#include "caffe2/core/typeid.h" -#include "caffe2/proto/caffe2_pb.h" namespace caffe2 { +class Tensor; + /** * @brief Blob is a general container that hosts a typed pointer. * @@ -50,15 +50,6 @@ class CAFFE2_API Blob final { return meta_.Match(); } - bool IsTensorType(DeviceType device_type) const { - bool is_match = meta_.Match(); - auto* tensor = static_cast(pointer_); - if (is_match && tensor && tensor->GetDeviceType() == device_type) { - return true; - } - return false; - } - /** * Returns the meta info of the blob. */ @@ -109,9 +100,6 @@ class CAFFE2_API Blob final { std::is_default_constructible::value, "GetMutable can't be called with non-default-constructible types. " "Try using specialized methods"); - static_assert( - !std::is_same::value, - "Use GetMutableTensor(DeviceType) instead"); if (IsType()) { return static_cast(pointer_); } else { @@ -129,16 +117,6 @@ class CAFFE2_API Blob final { } } - inline Tensor* GetMutableTensor(DeviceType device_type) { - if (IsTensorType(device_type)) { - return static_cast(pointer_); - } else { - VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() - << " DeviceType:" << device_type; - return Reset(new Tensor(device_type)); - } - } - /** * Sets the underlying object to the allocated one. The Blob then takes over * the ownership of the passed in pointer. If there is already an object in @@ -248,5 +226,29 @@ inline void swap(Blob& lhs, Blob& rhs) { lhs.swap(rhs); } +inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) { + bool is_match = blob.meta().Match(); + if (!is_match) { + return false; + } + const Tensor* tensor = &blob.Get(); + return tensor && tensor->GetDeviceType() == device_type; +} + +inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) { + if (blob->IsType()) { + Tensor* tensor = blob->GetMutable(); + if (tensor->GetDeviceType() == device_type) { + return tensor; + } + } + + // if we're here, then either Blob didn't hold a Tensor + // or that Tensor had the wrong DeviceType. 
+ VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() + << " DeviceType:" << device_type; + return blob->Reset(new Tensor(device_type)); +} + } // namespace caffe2 #endif // CAFFE2_CORE_BLOB_H_ diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index e8fdf47f69ddb0..55eafdede7269a 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -132,7 +132,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { for (int i = 0; i < 6; ++i) { \ cpu_tensor.mutable_data()[i] = static_cast(i); \ } \ - blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor); \ + BlobGetMutableTensor(&blob, CUDA)->CopyFrom(cpu_tensor); \ string serialized = SerializeBlob(blob, "test"); \ BlobProto proto; \ CAFFE_ENFORCE(proto.ParseFromString(serialized)); \ @@ -149,7 +149,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -199,7 +199,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); - EXPECT_TRUE(blob.IsTensorType(CUDA)); + EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -207,7 +207,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); - EXPECT_TRUE(blob.IsTensorType(CUDA)); + EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 38125b242def2f..501ec1d89bf70a 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -363,7 +363,8 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { auto tensor_proto = blob_proto.tensor(); Deserialize( tensor_proto, - blob->GetMutableTensor( + BlobGetMutableTensor( + blob, static_cast(tensor_proto.device_detail().device_type()))); } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 24b2a2d0593d3a..bb2f4ba6a91818 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsTensorType(CPU)); + EXPECT_FALSE(BlobIsTensorType(blob, CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsTensorType(CPU)); + EXPECT_FALSE(BlobIsTensorType(blob, CPU)); - Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); - EXPECT_TRUE(blob.IsTensorType(CPU)); + Tensor* tensor_unused CAFFE2_UNUSED = BlobGetMutableTensor(&blob, CPU); + EXPECT_TRUE(BlobIsTensorType(blob, CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -600,7 +600,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name) \ TEST(TensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - Tensor* tensor = 
blob.GetMutableTensor(CPU); \ + Tensor* tensor = BlobGetMutableTensor(&blob, CPU); \ tensor->Resize(2, 3); \ for (int i = 0; i < 6; ++i) { \ tensor->mutable_data()[i] = static_cast(i); \ @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -634,7 +634,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { \ TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - TensorCPU* tensor = blob.GetMutableTensor(CPU); \ + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); \ tensor->Resize(0, 3); \ tensor->mutable_data(); \ string serialized = SerializeBlob(blob, "test"); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ -669,7 +669,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data) TEST(TensorTest, TensorSerialization_CustomType) { Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(2, 3); for (int i = 0; i < 6; ++i) { tensor->mutable_data()[i].val = i; @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(new_blob.IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); EXPECT_EQ(new_tensor.dim(0), 2); @@ -696,7 +696,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { TEST(TensorTest, Half) { const int64_t kSize = 3000000; Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(kSize); for (int i = 0; i < tensor->size(); ++i) { tensor->mutable_data()[i].x = i % 10000; @@ -724,7 +724,7 @@ TEST(TensorTest, Half) { } Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(new_blob.IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -860,7 +860,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { { VLOG(1) << "Test begin"; Blob blob; - Tensor* tensor = blob.GetMutableTensor(CPU); + Tensor* tensor = BlobGetMutableTensor(&blob, CPU); VLOG(1) << "Allocating blob"; tensor->Resize(d1, d2); auto mutableData = tensor->mutable_data(); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(new_blob->IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(*new_blob, CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); @@ -1030,7 +1030,7 @@ TEST(CustomChunkSize, BigTensorSerialization) { int64_t size = d1 * d2; Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(d1, 
d2); tensor->mutable_data(); std::mutex mutex; diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 25aa801d265dba..f5683d1497377e 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -122,7 +122,7 @@ class CAFFE2_API OperatorBase : public Observable { static_assert( std::is_same::value, "Output(int, DeviceType) is only available for Tensor"); - return outputs_.at(idx)->GetMutableTensor(type); + return BlobGetMutableTensor(outputs_.at(idx), type); } template @@ -149,7 +149,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool InputIsTensorType(int idx, DeviceType device_type) { - return inputs_.at(idx)->IsTensorType(device_type); + return BlobIsTensorType(*inputs_.at(idx), device_type); } template @@ -162,7 +162,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool OutputIsTensorType(int idx, DeviceType type) { - return outputs_.at(idx)->IsTensorType(type); + return BlobIsTensorType(*outputs_.at(idx), type); } inline int InputSize() const { diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 2c0ad9e7a8127b..8e48b6b7beabca 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -131,7 +131,8 @@ struct WorkspaceIdInjector { "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob"); int32_t global_ws_id = (seq_++) + (static_cast(node_id) << 16); Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID); - TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU); + TensorCPU* global_ws_id_tensor = + BlobGetMutableTensor(global_ws_id_blob, CPU); global_ws_id_tensor->Resize(); global_ws_id_tensor->template mutable_data()[0] = global_ws_id; VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id; diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index 11bf9c413c5966..cbc58f742c2398 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -151,7 +151,7 @@ class CAFFE2_API Workspace { auto* to_blob = CreateBlob(blob); CAFFE_ENFORCE(to_blob); const auto& from_tensor = from_blob->template Get(); - auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType()); + auto* to_tensor = BlobGetMutableTensor(to_blob, Context::GetDeviceType()); to_tensor->CopyFrom(from_tensor); } } diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 8d011cd3be8bfa..38ffdc99426452 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,8 +33,9 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), - "Expect cpu tensor if not itensor"); + CAFFE_ENFORCE( + BlobIsTensorType(OperatorBase::InputBlob(i), CPU), + "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || tensor_cpu.size_from_dim(0) == 0, diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 08e6de2ae3f0dc..3226a08c4af9cf 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -89,7 +89,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_[i]->Reset(); } input_share_[i] = false; - auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); + auto dtensor = 
BlobGetMutableTensor(local_input_blobs_[i], CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); @@ -153,7 +153,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); dst->Reset(new Tensor(CPU)); - auto dtensor = dst->GetMutableTensor(CPU); + auto dtensor = BlobGetMutableTensor(dst, CPU); dtensor->Resize(src_dims); dtensor->ShareData(src); } diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 626568a989b939..468a42df1a9239 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (input_blob.IsTensorType(CPU)) { + if (BlobIsTensorType(input_blob, CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h index 6d9713b74612d8..a3135758813ecf 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator { for (int i = 0; i < InputSize(); ++i) { if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - local_input_blobs_[i]->GetMutableTensor(CPU)); + BlobGetMutableTensor(local_input_blobs_[i], CPU)); } else if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - local_input_blobs_[i]->GetMutableTensor(CPU)); + BlobGetMutableTensor(local_input_blobs_[i], CPU)); } else { VLOG(1) << "Input " << i << " is not MKLMemory. 
Skipping copy."; // Note(jiayq): This removes a const but conceptually @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc index 111af03f8602b9..06ec2b50acc178 100644 --- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc +++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc @@ -43,7 +43,7 @@ bool CopyFromGLOp::RunOnDevice() { if (first_run_) { first_run_ = false; for (int i = 0; i < Inputs().size(); ++i) { - auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); + auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); Y->Resize(inputs_[i]->dims()); Y->template mutable_data(); } @@ -54,7 +54,7 @@ bool CopyFromGLOp::RunOnDevice() { // GLTensor auto* X = inputs_[i].get(); X->lazy_allocate(Xblob, second_run_, true); - auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); + auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); Timer timer; timer.Start(); getTensorCPU(*X, *Y); diff --git a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h index daa7ef008fc7b3..68f79e84a89f87 100644 --- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h +++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h @@ -27,7 +27,7 @@ template void PopulateCPUBlob(Workspace *ws, bool random, std::string name, std::vector dims, int val = 1, int dist_shift = 0, float variance = 1) { Blob *blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(dims); T *t_data = tensor->mutable_data(); std::random_device rd; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 52f746f63f317b..742f8e48f4e9e1 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,13 +489,13 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsTensorType(CPU) || + if (!BlobIsTensorType(*noiseBlob, CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; // Initialize random noise on first use. // Cache it to maintain temporal consistency. 
- auto* t = noiseBlob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(noiseBlob, CPU); t->Resize(noiseSize); math::RandGaussian( t->size(), diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 7216b16611aa2a..7ac629019c58c0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("stddev"), CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. @@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("pw"), CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(shared ? 
channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(name), CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. @@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -682,8 +682,8 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPool Test: " << pool; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,7 +1072,8 @@ void testMPSCNN() { LOG(INFO) << 
"MPSCNNConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = + BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1080,7 +1081,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1092,7 +1093,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1188,7 +1189,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1196,7 +1197,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1204,7 +1205,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1275,7 +1276,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1283,7 +1284,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1291,7 +1292,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1385,7 +1386,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1393,7 +1394,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1401,7 +1402,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1493,7 +1494,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1501,7 +1502,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1509,7 +1510,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(M); CPUContext ctx; 
math::RandGaussian( @@ -1607,7 +1608,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1615,7 +1616,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1623,7 +1624,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1726,7 +1727,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1734,7 +1735,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1791,7 +1792,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1799,7 +1800,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1856,7 +1857,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1864,7 +1865,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1921,7 +1922,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1929,7 +1930,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2011,7 +2012,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2065,7 +2066,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2136,7 +2137,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, channels, 40, 40); CPUContext 
ctx; math::RandGaussian( @@ -2144,7 +2145,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2250,14 +2251,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 8, 40, 40); CPUContext ctx; math::RandGaussian( t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2362,7 +2363,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2497,7 +2498,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2505,7 +2506,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("bbox_delta_cpu"), CPU); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2513,7 +2514,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("im_info"), CPU); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2521,7 +2522,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("anchors"), CPU); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2587,7 +2588,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); // Only works for spatial dimension of (1, 1) - weird. 
t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2661,8 +2662,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2675,7 +2676,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutableTensor(CPU); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize( inputChannels, outputChannels, @@ -2692,7 +2693,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutableTensor(CPU); + BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2809,7 +2810,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(batchSize, array ? (i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2891,7 +2892,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2964,7 +2965,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3336,8 +3337,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace cws; cws.RunNetOnce(initNet); { - auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + cws.CreateBlob(predictNet.external_input(0)), CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3348,8 +3349,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace mws; mws.RunNetOnce(initNet); { - auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + mws.CreateBlob(predictNet.external_input(0)), CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3397,16 +3398,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = BlobGetMutableTensor( \ + ws.CreateBlob(predictNet.external_input(0)), CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/mobile/contrib/ios/pool_test.cc b/caffe2/mobile/contrib/ios/pool_test.cc index 47fd405eef01e4..3f78c5d1fcd6ae 100644 --- a/caffe2/mobile/contrib/ios/pool_test.cc +++ b/caffe2/mobile/contrib/ios/pool_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - 
auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ios/resize_test.cc b/caffe2/mobile/contrib/ios/resize_test.cc index 1c08df0f32a1c0..428c395fe442d4 100644 --- a/caffe2/mobile/contrib/ios/resize_test.cc +++ b/caffe2/mobile/contrib/ios/resize_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/nnapi/nnapi.cc b/caffe2/mobile/contrib/nnapi/nnapi.cc index 45ea26c44cc964..56f1fc28986a7c 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi.cc @@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) { output_dims.push_back(dim); } - auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(ws_.CreateBlob(blob), CPU); tensor->Resize(output_dims); outputs->push_back(tensor); diff --git a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc index 359e7767746b69..c14e9ed26376e1 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + #include "caffe2/core/init.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" @@ -43,14 +43,14 @@ static double benchmark_conv_caffe2( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group == 1) { t->Resize(K, C, kernel, kernel); } else { @@ -61,7 +61,7 @@ static double benchmark_conv_caffe2( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -129,14 +129,14 @@ static double benchmark_conv_nnapi( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -148,7 +148,7 @@ static double benchmark_conv_nnapi( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -190,7 +190,7 @@ static double benchmark_conv_nnapi( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < 
warmup; i++) { @@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; } } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8( // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and // bias_scale == input_scale * filter_scale. { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; @@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < warmup; i++) { diff --git a/caffe2/mobile/contrib/nnapi/nnapi_test.cc b/caffe2/mobile/contrib/nnapi/nnapi_test.cc index deab1ca7b43f76..9b4608dc07aee1 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc @@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) { // CPU reference Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -103,21 +103,21 @@ static void test_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, kernel, kernel, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -189,7 +189,7 @@ static void test_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + 
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(1, kernel, kernel, D); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(D); CPUContext ctx; math::RandGaussian( @@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -428,7 +428,7 @@ static void test_pooling( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -496,7 +496,7 @@ static void test_pooling( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -506,7 +506,7 @@ static void test_pooling( static void test_softmax(int N, int C, int H = 1, int W = 1) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); if (H == 1 && W == 1) { t->Resize(N, C); } else { @@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc index 9da266c4e85051..690a33cb854f16 100644 --- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc +++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc @@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1, LOG(INFO) << "OPENGLCopyFrom/To Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -275,7 +275,7 @@ void testOpenGLConv(int N, << " Op: " << glPoolOperationName[poolOp]; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -301,7 +301,7 @@ void testOpenGLConv(int N, } if (poolOp != AveragePool && poolOp != MaxPool) { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) { t->Resize(K, C, kernel_h, kernel_w); } else { @@ -343,7 +343,7 @@ void testOpenGLConv(int N, // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -367,7 +367,7 @@ void testOpenGLConv(int N, } if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) { - 
auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -532,7 +532,7 @@ void testOpenGLPRelu( << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -541,7 +541,7 @@ void testOpenGLPRelu( // prelu scale { - auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(prelu_size); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); + auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); + auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) { Workspace ws; { - auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); + auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); + auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -814,8 +814,8 @@ void testOpenGLConcat(int N, std::vector Cs, int H, int W, bool tiling = fa << "H: " << H << ", W: " << W; Workspace ws; for (int i = 0; i < Cs.size(); i++) { - auto* t = - ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu" + caffe2::to_string(i)), CPU); t->Resize(N, Cs[i], H, W); CPUContext ctx0; // Too noisy. @@ -891,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
@@ -942,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 2, t->mutable_data(), &ctx); @@ -992,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(1); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); @@ -1060,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) { LOG(INFO) << "OpenGL Softmax Test " << "N: " << N << " D: " << D << " Tiled:" << tiled; Workspace ws; - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); { t->Resize(N, D); CPUContext ctx; @@ -1151,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -1163,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { // scale { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1172,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { } // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1254,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
@@ -1266,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { // scale { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1275,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1284,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // prelu scale { - auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(C); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1385,7 +1385,7 @@ void OpenGL_speedtest(int N, << " C: " << C << " H: " << H << " W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1399,7 +1399,7 @@ void OpenGL_speedtest(int N, } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1413,7 +1413,7 @@ void OpenGL_speedtest(int N, } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1479,7 +1479,7 @@ void testOpenGLPadImage( { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1593,7 +1593,7 @@ void testOpenGLResize(int N, { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1675,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGL Preprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1684,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -1748,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGLDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1757,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1800,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, 
float error) { LOG(INFO) << "OpenGLNormPlanarYUV Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, 3, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1809,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1818,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("stdev"), CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 6; @@ -1879,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N, LOG(INFO) << "OpenGL CopyOps Speed Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1893,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1907,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1990,8 +1990,8 @@ void compareModelsForOpenGL(std::string name, Workspace cws; cws.RunNetOnce(initNet); - auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0)) - ->GetMutableTensor(CPU); + auto* t_cpu = BlobGetMutableTensor( + cws.CreateBlob(truncatedPredictNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2032,8 +2032,8 @@ void compareModelsForOpenGL(std::string name, Workspace mws; mws.RunNetOnce(initNet); - auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)) - ->GetMutableTensor(CPU); + auto* t_gl = BlobGetMutableTensor( + mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2116,7 +2116,7 @@ void compareBatchedToTiledModels(std::string name, tws.RunNetOnce(initNet); auto* t_batch = - tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU); + BlobGetMutableTensor(tws.CreateBlob(bachedNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2143,7 +2143,7 @@ void compareBatchedToTiledModels(std::string name, bws.RunNetOnce(initNet); auto* t_tiling = - bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU); + BlobGetMutableTensor(bws.CreateBlob(tiledNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); diff --git a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc index deced719644963..cfeed00e8b9730 100644 --- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc +++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc @@ -14,7 +14,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* 
_tensor = _blob->GetMutableTensor(CPU); \ + auto* _tensor = BlobGetMutableTensor(_blob, CPU); \ _tensor->Resize((_s)); \ memcpy(_tensor->mutable_data(), data_##_l, _tensor->nbytes()); \ } while (0) @@ -23,7 +23,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* _tensor = _blob->GetMutableTensor(CPU); \ + auto* _tensor = BlobGetMutableTensor(_blob, CPU); \ _tensor->Resize((_s)); \ memset(_tensor->mutable_data(), 1, _tensor->nbytes()); \ } while (0) @@ -43,7 +43,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), @@ -56,7 +56,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ulp2/ulp_test.cc b/caffe2/mobile/contrib/ulp2/ulp_test.cc index a1c1af0f6dfb8d..6316b05284fba9 100644 --- a/caffe2/mobile/contrib/ulp2/ulp_test.cc +++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc @@ -289,13 +289,13 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r)); def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t)); def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b)); - auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU); + auto* Xws = BlobGetMutableTensor(ws.CreateBlob("X"), CPU); Xws->ResizeLike(X); Xws->ShareExternalPointer(X.mutable_data(), X.size()); - auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* Wws = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); Wws->ResizeLike(W_); Wws->ShareExternalPointer(W_.mutable_data(), W_.size()); - auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* bws = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); bws->ResizeLike(bias); bws->ShareExternalPointer(bias.mutable_data(), bias.size()); ws.RunOperatorOnce(def); diff --git a/caffe2/operators/batch_matmul_op_gpu_test.cc b/caffe2/operators/batch_matmul_op_gpu_test.cc index 804296307d6ef8..31e179b3e41f82 100644 --- a/caffe2/operators/batch_matmul_op_gpu_test.cc +++ b/caffe2/operators/batch_matmul_op_gpu_test.cc @@ -30,7 +30,7 @@ class BatchMatMulOpGPUTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/batch_matmul_op_test.cc b/caffe2/operators/batch_matmul_op_test.cc index 45db7dd5b8484a..c74829b4f8f9c5 100644 --- a/caffe2/operators/batch_matmul_op_test.cc +++ b/caffe2/operators/batch_matmul_op_test.cc @@ -24,7 +24,7 @@ class BatchMatMulOpTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/boolean_unmask_ops_test.cc b/caffe2/operators/boolean_unmask_ops_test.cc index 8814be17153d44..b0c5f7dcdfff0b 100644 --- a/caffe2/operators/boolean_unmask_ops_test.cc +++ b/caffe2/operators/boolean_unmask_ops_test.cc @@ -16,7 +16,7 @@ 
static void AddScalarInput( Workspace* ws, bool isEmpty = false) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); if (!isEmpty) { tensor->Resize(vector{1}); *(tensor->template mutable_data()) = value; diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc index b9f54b6d55be7c..155b6f0cd24561 100644 --- a/caffe2/operators/conv_op_shared.cc +++ b/caffe2/operators/conv_op_shared.cc @@ -27,8 +27,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* buffer = - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutableTensor(CPU); + auto* buffer = BlobGetMutableTensor( + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__"), CPU); f(buffer); } } diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc index f80d15a5d9054b..c1f37c7f1362f2 100644 --- a/caffe2/operators/conv_op_shared_gpu.cc +++ b/caffe2/operators/conv_op_shared_gpu.cc @@ -20,8 +20,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* buffer = - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")->GetMutableTensor(CUDA); + auto* buffer = BlobGetMutableTensor( + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__"), CUDA); f(buffer); } } diff --git a/caffe2/operators/conv_transpose_op_mobile_test.cc b/caffe2/operators/conv_transpose_op_mobile_test.cc index 6eb45eb5f8d17c..3bc2951664353b 100644 --- a/caffe2/operators/conv_transpose_op_mobile_test.cc +++ b/caffe2/operators/conv_transpose_op_mobile_test.cc @@ -17,7 +17,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -29,7 +29,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc index 83294224280831..e3c0abe83d8b4e 100644 --- a/caffe2/operators/dataset_ops.cc +++ b/caffe2/operators/dataset_ops.cc @@ -1428,7 +1428,7 @@ class TreeCursorSerializer : public BlobSerializerBase { // serialize offsets as a tensor if (cursor->offsets.size() > 0) { Blob offsets_blob; - auto* offsets = offsets_blob.GetMutableTensor(CPU); + auto* offsets = BlobGetMutableTensor(&offsets_blob, CPU); offsets->Resize(cursor->offsets.size()); std::copy( cursor->offsets.begin(), diff --git a/caffe2/operators/dropout_op_cudnn.cc b/caffe2/operators/dropout_op_cudnn.cc index a68a1263f6f451..8a40c731143f44 100644 --- a/caffe2/operators/dropout_op_cudnn.cc +++ b/caffe2/operators/dropout_op_cudnn.cc @@ -150,7 +150,7 @@ bool CuDNNDropoutOp::DoRunWithType() { // Reshape tensor descriptors if necessary if (X.dims() != cudnn_input_dims_ && !is_test_) { CAFFE_ENFORCE(scratch_blob_); - Tensor* states = scratch_blob_->GetMutableTensor(CUDA); + Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA); cudnn_input_dims_ = X.dims(); CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( data_desc_, diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index bcd547e28f0989..b785d040c8f1a7 
100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -19,7 +19,7 @@ void FillTensor( const std::vector& shape, const std::vector& values) { auto* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); + auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); tensor->Resize(shape); auto* mutable_data = tensor->template mutable_data(); const O_Type* data = reinterpret_cast(values.data()); diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index 2b3a033a665df7..da7fdc650879c3 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -18,7 +18,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -34,7 +34,7 @@ static void AddLinSpacedInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -51,7 +51,7 @@ static void AddInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc index 241b0ff97c6070..2fb8f3b338dc64 100644 --- a/caffe2/operators/index_ops.cc +++ b/caffe2/operators/index_ops.cc @@ -353,7 +353,7 @@ class IndexSerializer : public BlobSerializerBase { SerializationAcceptor acceptor) override { auto& base = blob.template Get>(); Blob tensor_blob; - auto* tensor_out = tensor_blob.GetMutableTensor(CPU); + auto* tensor_out = BlobGetMutableTensor(&tensor_blob, CPU); if (base->Type().Match()) { doStore(base, tensor_out); diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h index dbd5103952469c..7a3c34cfbf7cce 100644 --- a/caffe2/operators/onnx_while_op.h +++ b/caffe2/operators/onnx_while_op.h @@ -213,23 +213,23 @@ class ONNXWhileOp final : public Operator { lcd_tensors_.clear(); for (int i = 2; i < body_net_def.external_input_size(); ++i) { Blob* b = loop_ws_->CreateBlob(body_net_def.external_input(i)); - Tensor* t = b->GetMutableTensor(Context::GetDeviceType()); + Tensor* t = BlobGetMutableTensor(b, Context::GetDeviceType()); lcd_tensors_.push_back(t); } // First output is the iteration variable auto* iteration_var_blob = loop_ws_->CreateBlob( body_net_def.external_input(0)); iteration_var_ = - iteration_var_blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(iteration_var_blob, Context::GetDeviceType()); - input_condition_var_ = - loop_ws_->CreateBlob(body_net_def.external_input(1)) - ->GetMutableTensor(Context::GetDeviceType()); + input_condition_var_ = BlobGetMutableTensor( + loop_ws_->CreateBlob(body_net_def.external_input(1)), + Context::GetDeviceType()); auto* condition_var_blob = loop_ws_->CreateBlob(body_net_def.external_output(0)); condition_var_ = - condition_var_blob->GetMutableTensor(Context::GetDeviceType()); + 
BlobGetMutableTensor(condition_var_blob, Context::GetDeviceType()); condition_var_->Resize(1); condition_var_->template mutable_data(); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index d1b0824f1b3191..767a37d5fc7924 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - blob->IsTensorType(CPU), + BlobIsTensorType(*blob, CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index 8ef39e7c0e78d1..5b3a38dbfbd13d 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -65,8 +65,8 @@ class GPUFallbackOpEx final : public Operator { bool need_sync = false; for (int i = 0; i < InputSize(); ++i) { if (this->InputIsTensorType(i, CUDA)) { - local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( - Input(i), &context_); + BlobGetMutableTensor(local_input_blobs_[i], CPU) + ->CopyFrom(Input(i), &context_); need_sync = true; } else { VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy."; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/operator_fallback_gpu_test.cc b/caffe2/operators/operator_fallback_gpu_test.cc index 964708bc10906f..0870a4be2dd7bd 100644 --- a/caffe2/operators/operator_fallback_gpu_test.cc +++ b/caffe2/operators/operator_fallback_gpu_test.cc @@ -40,7 +40,7 @@ TEST(OperatorFallbackTest, IncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - ws.CreateBlob("X")->GetMutableTensor(CPU)->CopyFrom(source_tensor); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); @@ -64,7 +64,7 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - ws.CreateBlob("X")->GetMutableTensor(CUDA)->CopyFrom(source_tensor); + BlobGetMutableTensor(ws.CreateBlob("X"), CUDA)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); diff --git a/caffe2/operators/reshape_op_gpu_test.cc b/caffe2/operators/reshape_op_gpu_test.cc index 3537ab69d058f0..d4ac325a78b80a 100644 --- a/caffe2/operators/reshape_op_gpu_test.cc +++ b/caffe2/operators/reshape_op_gpu_test.cc @@ -20,7 +20,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h index 98675cea858d54..63d58f3ccd8f6d 100644 --- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h +++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h @@ -43,11 +43,10 @@ class RecurrentNetworkBlobFetcherOp final : public 
Operator { prefix_ + std::string("_") + blob_name + caffe2::to_string(i); blob_names_vector.push_back(newBlobName); - ws_->CreateBlob(newBlobName) - ->GetMutableTensor(CPU) + BlobGetMutableTensor(ws_->CreateBlob(newBlobName), CPU) ->ResizeLike(currentTensor); auto type = Context::GetDeviceType(); - auto* newTensor = ws_->GetBlob(newBlobName)->GetMutableTensor(type); + auto* newTensor = BlobGetMutableTensor(ws_->GetBlob(newBlobName), type); newTensor->CopyFrom(currentTensor); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h index 7e37e562e77a50..4cb53a6d7d330a 100644 --- a/caffe2/operators/rnn/recurrent_network_executor.h +++ b/caffe2/operators/rnn/recurrent_network_executor.h @@ -111,10 +111,10 @@ class RecurrentNetworkExecutorBase { // the forward-only mode. std::string this_timestep_blob = timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t); - ws->CreateBlob(this_timestep_blob)->GetMutableTensor(CPU)->Resize(1); + BlobGetMutableTensor(ws->CreateBlob(this_timestep_blob), CPU)->Resize(1); auto b = ws->GetBlob(this_timestep_blob); CAFFE_ENFORCE(b); - b->GetMutableTensor(CPU)->template mutable_data()[0] = t; + BlobGetMutableTensor(b, CPU)->template mutable_data()[0] = t; // Copy the operators from template for (auto& template_rnn_op : timestep_ops_template_) { diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h index 2421bc44263afd..21b3064a6fac3c 100644 --- a/caffe2/operators/rnn/recurrent_network_op.h +++ b/caffe2/operators/rnn/recurrent_network_op.h @@ -52,10 +52,11 @@ struct CAFFE2_API ScratchWorkspaces { }; inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) { - ws->CreateBlob(blob_name)->GetMutableTensor(CPU)->Resize(1); + BlobGetMutableTensor(ws->CreateBlob(blob_name), CPU)->Resize(1); auto timestepBlob = ws->GetBlob(blob_name); CAFFE_ENFORCE(timestepBlob); - timestepBlob->GetMutableTensor(CPU)->template mutable_data()[0] = t; + BlobGetMutableTensor(timestepBlob, CPU)->template mutable_data()[0] = + t; } CAFFE2_API std::map GetRecurrentMapping( @@ -71,8 +72,9 @@ void applyOffsetAlias( << " at offset: " << oc.offset; auto srcBlob = ws->GetBlob(oc.src); CAFFE_ENFORCE(srcBlob); - auto* src = srcBlob->GetMutableTensor(Context::GetDeviceType()); - auto* dst = ws->GetBlob(oc.dst)->GetMutableTensor(Context::GetDeviceType()); + auto* src = BlobGetMutableTensor(srcBlob, Context::GetDeviceType()); + auto* dst = + BlobGetMutableTensor(ws->GetBlob(oc.dst), Context::GetDeviceType()); auto timestep = src->size() / src->dim(0); auto dims = src->dims(); const int32_t startDstTimestep = @@ -113,7 +115,7 @@ void initializeRecurrentInput( Context* context) { auto stateBlob = ws->GetBlob(rc.state); CAFFE_ENFORCE(stateBlob); - auto* state = stateBlob->GetMutableTensor(Context::GetDeviceType()); + auto* state = BlobGetMutableTensor(stateBlob, Context::GetDeviceType()); auto inputBlob = ws->GetBlob(rc.input); CAFFE_ENFORCE(inputBlob); @@ -660,7 +662,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->GetBlob(param.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); g->ResizeLike(p); math::Set( g->size(), @@ -676,7 +678,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->CreateBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = 
BlobGetMutableTensor(gBlob, Context::GetDeviceType()); g->ResizeLike(p); CAFFE_ENFORCE_EQ(g->ndim(), 3); const auto timestep = g->size() / g->dim(0); @@ -703,7 +705,7 @@ class RecurrentNetworkGradientOp final : public Operator { << ". Size: " << Input(gradientInputIndex).size(); auto pGradientBlob = sharedWs_->GetBlob(gradientName); CAFFE_ENFORCE(pGradientBlob); - auto* g = pGradientBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(pGradientBlob, Context::GetDeviceType()); g->ResizeLike(Input(gradientInputIndex)); g->template mutable_data(); } @@ -717,7 +719,7 @@ class RecurrentNetworkGradientOp final : public Operator { << rg.lastExternalGrad << " for final time step (sep. blob)"; auto gBlob = sharedWs_->GetBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); auto oglastBlob = sharedWs_->GetBlob(rg.lastExternalGrad); CAFFE_ENFORCE(oglastBlob); @@ -779,7 +781,7 @@ class RecurrentNetworkGradientOp final : public Operator { T* output_data = Output(outputIdx)->template mutable_data(); auto pBlob = sharedWs_->GetBlob(recurrentGradients_[i].grad); CAFFE_ENFORCE(pBlob); - auto* p = pBlob->GetMutableTensor(Context::GetDeviceType()); + auto* p = BlobGetMutableTensor(pBlob, Context::GetDeviceType()); if (Input(inputId).ndim() >= 2) { // Gradient states blob should live. And if it gets changed by the diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc index 2647a97d6f0b90..7257ec44c25984 100644 --- a/caffe2/operators/roi_align_op_gpu_test.cc +++ b/caffe2/operators/roi_align_op_gpu_test.cc @@ -18,7 +18,7 @@ void AddConstInput( Context* context, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); + auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), context); @@ -39,7 +39,7 @@ void AddInput( const string& name, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -57,7 +57,7 @@ void AddInput( tmp_vec.array() = utils::AsEArrXt(values); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->CopyFrom(tmp); } diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index c9ba13efb50258..2092ae804f2c3b 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -9,7 +9,7 @@ class StringJoinOpTest : public testing::Test { public: bool runOp(const TensorCPU& input) { auto* blob = ws_.CreateBlob("X"); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->ResizeLike(input); tensor->ShareData(input); @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(output->IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(*output, CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); @@ -42,7 +42,7 @@ TEST_F(StringJoinOpTest, testString1DJoin) { 
std::vector input = {"a", "xx", "c"}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -62,7 +62,7 @@ TEST_F(StringJoinOpTest, testString2DJoin) { {"dd", "ee", "ff"}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -82,7 +82,7 @@ TEST_F(StringJoinOpTest, testFloat1DJoin) { std::vector input = {3.90f, 5.234f, 8.12f}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -102,7 +102,7 @@ TEST_F(StringJoinOpTest, testFloat2DJoin) { {4.67f, 5.90f, 6.32f}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -122,7 +122,7 @@ TEST_F(StringJoinOpTest, testLong2DJoin) { std::vector> input = {{100, 200}, {1000, 2000}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index a6d395fe9ba647..bfc41a462999b5 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,10 +82,10 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!noiseBlob->IsTensorType(CPU)) { + if (!BlobIsTensorType(*noiseBlob, CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. 
- auto* t = noiseBlob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(noiseBlob, CPU); #if defined(__ARM_NEON__) || defined(__ARM_NEON) // Noise space is larger for vectorized code due to the diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h index cd081bf959e399..e9f5b1a8f8455f 100644 --- a/caffe2/operators/tensor_protos_db_input.h +++ b/caffe2/operators/tensor_protos_db_input.h @@ -56,7 +56,7 @@ bool TensorProtosDBInput::Prefetch() { protos.mutable_protos(i)->clear_device_detail(); } deserializer.Deserialize( - protos.protos(i), prefetched_blobs_[i].GetMutableTensor(CPU)); + protos.protos(i), BlobGetMutableTensor(&prefetched_blobs_[i], CPU)); } } else { vector temp_tensors; @@ -74,11 +74,11 @@ bool TensorProtosDBInput::Prefetch() { vector dims( protos.protos(i).dims().begin(), protos.protos(i).dims().end()); dims.insert(dims.begin(), batch_size_); - prefetched_blobs_[i].GetMutableTensor(CPU)->Resize(dims); + BlobGetMutableTensor(&prefetched_blobs_[i], CPU)->Resize(dims); } } for (int i = 0; i < protos.protos_size(); ++i) { - TensorCPU* dst = prefetched_blobs_[i].GetMutableTensor(CPU); + TensorCPU* dst = BlobGetMutableTensor(&prefetched_blobs_[i], CPU); TensorCPU& src = temp_tensors[i]; if (protos.protos(i).has_device_detail()) { protos.mutable_protos(i)->clear_device_detail(); diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h index 421c26e318b6e9..1a5cdc344ce4a8 100644 --- a/caffe2/operators/tt_linear_op.h +++ b/caffe2/operators/tt_linear_op.h @@ -52,7 +52,7 @@ class TTLinearOp final : public Operator { int cores_idx = 0; // Temporary buffer to facilitate multiplication of TT-cores with input - auto Y_buf = Y_temp_->GetMutableTensor(Context::GetDeviceType()); + auto Y_buf = BlobGetMutableTensor(Y_temp_.get(), Context::GetDeviceType()); Y_buf->ResizeLike(X); Y_buf->CopyFrom(X); diff --git a/caffe2/operators/utility_ops_gpu_test.cc b/caffe2/operators/utility_ops_gpu_test.cc index f500afaf9ed24f..1099d900cbefdc 100644 --- a/caffe2/operators/utility_ops_gpu_test.cc +++ b/caffe2/operators/utility_ops_gpu_test.cc @@ -19,7 +19,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/utility_ops_test.cc b/caffe2/operators/utility_ops_test.cc index 379dd52655c4f4..a3a2a409674edd 100644 --- a/caffe2/operators/utility_ops_test.cc +++ b/caffe2/operators/utility_ops_test.cc @@ -16,7 +16,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index fdf5fdc31e1046..8c324a97c50934 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -44,10 +44,10 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { CAFFE_ENFORCE( bnInputs.size() >= 5, "Invalid batch normalization input size"); -#define EXPOSE_TENSOR_DATA(name, index, inputs) \ - auto name = repr::nn::get(inputs[index]); \ - assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ - auto name##Tensor = 
ws->GetBlob(name->getName())->GetMutableTensor(CPU); \ +#define EXPOSE_TENSOR_DATA(name, index, inputs) \ + auto name = repr::nn::get(inputs[index]); \ + assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ + auto name##Tensor = BlobGetMutableTensor(ws->GetBlob(name->getName()), CPU); \ auto name##Data = name##Tensor->mutable_data(); EXPOSE_TENSOR_DATA(filter, 1, convInputs); @@ -76,7 +76,7 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { nn->dataFlow.createEdge(convBiasNode, convNode); auto* blob = ws->CreateBlob(convBiasName); - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); // Get output channel size_t c = filterTensor->dim32(0); diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index ce79df56ecb728..a048503fea99c7 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -173,7 +173,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp( // Feed into workspace as CPU Tensors auto* blob = ws->CreateBlob(t.name()); - auto* cpu_tensor = blob->GetMutableTensor(CPU); + auto* cpu_tensor = BlobGetMutableTensor(blob, CPU); std::vector dims; for(const auto& d : t.dims()) { dims.push_back(d); diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 84dac93753d37a..7775e69776450c 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,14 +10,14 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); + BlobIsTensorType(*blob, CPU), "Blob is not a CPU Tensor: ", name); } TensorCPU* getTensor(Workspace* ws, const std::string& name) { enforceIsTensor(ws, name); auto* blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist"); - return blob->GetMutableTensor(CPU); + return BlobGetMutableTensor(blob, CPU); } void shareInputTensor( @@ -60,7 +60,7 @@ Predictor::Predictor(PredictorConfig config) : config_(std::move(config)) { for (const auto& name : config_.predict_net->external_input()) { if (!initialized.count(name)) { auto* blob = config_.ws->CreateBlob(name); - blob->GetMutableTensor(CPU); + BlobGetMutableTensor(blob, CPU); } } CAFFE_ENFORCE(config_.ws->CreateNet(config_.predict_net)); diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc index ae4f73e9da0ad7..a0245cd7a86d66 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -135,7 +135,7 @@ std::unique_ptr randomTensor( const std::vector& dims, CPUContext* ctx) { auto blob = make_unique(); - auto* t = blob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(blob.get(), CPU); t->Resize(dims); math::RandUniform( t->size(), -1.0, 1.0, t->template mutable_data(), ctx); @@ -180,7 +180,7 @@ TEST_F(PredictorTest, SimpleBatchSized) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorList input; input.emplace_back(CPU); - auto tensor = inputData->GetMutableTensor(CPU); + auto tensor = BlobGetMutableTensor(inputData.get(), CPU); input.back().ResizeLike(*tensor); input.back().ShareData(*tensor); Predictor::TensorList output; @@ -196,7 +196,7 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorMap input; auto iter = input.emplace("data", Tensor(CPU)); - 
auto tensor = inputData->GetMutableTensor(CPU); + auto tensor = BlobGetMutableTensor(inputData.get(), CPU); iter.first->second.ResizeLike(*tensor); iter.first->second.ShareData(*tensor); diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 81197047102ffb..9a1d715bfdf225 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -328,7 +328,7 @@ void addObjectMethods(py::module& m) { }) .def( "tensor", - [](Blob* blob) { return py::cast(blob->GetMutableTensor(CPU)); }, + [](Blob* blob) { return py::cast(BlobGetMutableTensor(blob, CPU)); }, py::return_value_policy::reference_internal) .def( "_feed", diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 97ec6628fe3f27..50c64f6c9e44c8 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -234,7 +234,7 @@ class TensorFeeder : public BlobFeederBase { FeedTensor( option, original_array, - blob->GetMutableTensor(Context::GetDeviceType())); + BlobGetMutableTensor(blob, Context::GetDeviceType())); } }; @@ -366,31 +366,32 @@ class PythonOpBase : public Operator { // make sure output blob is initialized before creating the binding if (forced_cpu_outputs_.count(i)) { - blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(blob, Context::GetDeviceType()); } else { - blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(blob, Context::GetDeviceType()); } py::object py_obj; if (blob->template IsType()) { if (use_dlpack) { DLPackWrapper wrapper( - blob->GetMutableTensor(Context::GetDeviceType()), cpu_option); + BlobGetMutableTensor(blob, Context::GetDeviceType()), + cpu_option); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), py::return_value_policy::reference); } } else { if (use_dlpack) { DLPackWrapper wrapper( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), this->device_option()); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), py::return_value_policy::reference); } } diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index ebad6cf8d96839..f0307f7b6485d2 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -163,8 +163,8 @@ class IDeepFeeder : public BlobFeederBase { DeviceOption cpu_option(option); cpu_option.set_device_type(DeviceTypeProto::PROTO_CPU); TensorFeeder cpu_tensor_feeder; - cpu_tensor_feeder.FeedTensor(cpu_option, original_array, - blob->GetMutableTensor(CPU)); + cpu_tensor_feeder.FeedTensor( + cpu_option, original_array, BlobGetMutableTensor(blob, CPU)); } } catch (ideep::error &e) { LOG(ERROR) << "IDEEP error: " << e.message; diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc index 4ac3524d49d8a6..d102985e2fd7aa 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); 
math::RandGaussian( diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc index 05c945106c52da..f11e05b67392c9 100644 --- a/caffe2/share/contrib/nnpack/conv_op.cc +++ b/caffe2/share/contrib/nnpack/conv_op.cc @@ -231,11 +231,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { (transformedFilterSize + sizeof(float) - 1) / sizeof(float); for (auto g = 0; g < group_; g++) { - transformedFilters_[g] = ws_->CreateBlob( - "__transformed_kernel_" + - to_string(__sync_fetch_and_add( - &precomputed_transform_id, 1))) - ->GetMutableTensor(CPU); + transformedFilters_[g] = BlobGetMutableTensor( + ws_->CreateBlob( + "__transformed_kernel_" + + to_string( + __sync_fetch_and_add(&precomputed_transform_id, 1))), + CPU); transformedFilters_[g]->Resize(transformedFilterElements); status = nnp_convolution_inference( diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc index 2f892118982da2..10eb6348becc06 100644 --- a/caffe2/share/contrib/nnpack/nnpack_test.cc +++ b/caffe2/share/contrib/nnpack/nnpack_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc index 911c2b09868fc3..a5df5900ee23a2 100644 --- a/caffe2/utils/hip/math_blas_hip_test.cc +++ b/caffe2/utils/hip/math_blas_hip_test.cc @@ -26,13 +26,13 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) { vector shapeX{5, 10}; vector shapeW{10, 6}; vector shapeY{5, 6}; - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorW = blobW->GetMutableTensor(HIP); + auto* tensorW = BlobGetMutableTensor(blobW, HIP); tensorW->Resize(shapeW); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -126,13 +126,13 @@ TEST(MathROCBLASTest, GemmNoTransTrans) { vector shapeX{5, 10}; vector shapeW{6, 10}; vector shapeY{5, 6}; - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorW = blobW->GetMutableTensor(HIP); + auto* tensorW = BlobGetMutableTensor(blobW, HIP); tensorW->Resize(shapeW); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -225,13 +225,13 @@ TEST(MathROCBLASTest, GemvNoTrans) { vector shapeA{5, 10}; vector shapeX{10}; vector shapeY{5}; - auto* tensorA = blobA->GetMutableTensor(HIP); + auto* tensorA = BlobGetMutableTensor(blobA, HIP); tensorA->Resize(shapeA); - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = 
BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 50); @@ -315,13 +315,13 @@ TEST(MathROCBLASTest, GemvTrans) { vector shapeA{6, 10}; vector shapeX{6}; vector shapeY{10}; - auto* tensorA = blobA->GetMutableTensor(HIP); + auto* tensorA = BlobGetMutableTensor(blobA, HIP); tensorA->Resize(shapeA); - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 60); diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc index 9be1c3db6c1d01..4b0247a0786fcc 100644 --- a/caffe2/utils/math_gpu_test.cc +++ b/caffe2/utils/math_gpu_test.cc @@ -41,9 +41,9 @@ void executeGpuBinaryOpTest( Blob* bloby = ws.CreateBlob("Y"); Blob* bloby_host = ws.CreateBlob("Y_host"); - auto* tensorx0 = blobx0->GetMutableTensor(CUDA); - auto* tensorx1 = blobx1->GetMutableTensor(CUDA); - auto* tensory = bloby->GetMutableTensor(CUDA); + auto* tensorx0 = BlobGetMutableTensor(blobx0, CUDA); + auto* tensorx1 = BlobGetMutableTensor(blobx1, CUDA); + auto* tensory = BlobGetMutableTensor(bloby, CUDA); vector shapex0_vector{shapex0}; vector shapex1_vector{shapex1}; @@ -71,7 +71,7 @@ void executeGpuBinaryOpTest( context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = bloby_host->GetMutableTensor(CPU); + auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -94,7 +94,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { vector shapex{33 * 9, 25}; vector shapey{33, 25}; - auto* tensorx = blobx->GetMutableTensor(CUDA); + auto* tensorx = BlobGetMutableTensor(blobx, CUDA); tensorx->Resize(shapex); int stripe = 33 * 25; vector tot(33, 0.0); @@ -110,7 +110,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { } } - auto* tensory = bloby->GetMutableTensor(CUDA); + auto* tensory = BlobGetMutableTensor(bloby, CUDA); tensory->Resize(shapey); math::Set( stripe, 0.0, tensory->mutable_data(), &context); @@ -125,7 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = bloby_host->GetMutableTensor(CPU); + auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -258,9 +258,9 @@ class GemmBatchedGPUTest Blob* X_blob = ws_.CreateBlob("X"); Blob* W_blob = ws_.CreateBlob("W"); Blob* Y_blob = ws_.CreateBlob("Y"); - X_ = X_blob->GetMutableTensor(CUDA); - W_ = W_blob->GetMutableTensor(CUDA); - Y_ = Y_blob->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(X_blob, CUDA); + W_ = BlobGetMutableTensor(W_blob, CUDA); + Y_ = BlobGetMutableTensor(Y_blob, CUDA); X_->Resize(std::vector{3, 5, 10}); W_->Resize(std::vector{3, 6, 10}); Y_->Resize(std::vector{3, 5, 6}); @@ -381,8 +381,8 @@ class ReduceTensorGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -402,7 
+402,7 @@ class ReduceTensorGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -664,8 +664,8 @@ class BroadcastGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -681,7 +681,7 @@ class BroadcastGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -741,9 +741,9 @@ class MomentsGPUTest : public testing::Test { Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_mean = ws_.CreateBlob("mean"); Blob* blob_variance = ws_.CreateBlob("variance"); - X_ = blob_x->GetMutableTensor(CUDA); - mean_ = blob_mean->GetMutableTensor(CUDA); - variance_ = blob_variance->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + mean_ = BlobGetMutableTensor(blob_mean, CUDA); + variance_ = BlobGetMutableTensor(blob_variance, CUDA); } void SetUpData( @@ -766,10 +766,10 @@ class MomentsGPUTest : public testing::Test { const std::vector& mean_data, const std::vector& variance_data) { Blob* blob_mean_host = ws_.CreateBlob("mean_host"); - auto* mean_host = blob_mean_host->GetMutableTensor(CPU); + auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU); mean_host->CopyFrom(*mean_, cuda_context_.get()); Blob* blob_variance_host = ws_.CreateBlob("variance_host"); - auto* variance_host = blob_variance_host->GetMutableTensor(CPU); + auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU); variance_host->CopyFrom(*variance_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); @@ -868,8 +868,8 @@ class TransposeGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -890,7 +890,7 @@ class TransposeGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); From 3417a1e7e4693d76b1a695af22e04c7e5ef89682 Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Mon, 24 Sep 2018 23:08:56 -0700 Subject: [PATCH 18/51] Prepend a "const" to a for loop in printPyObject. (#11857) Summary: As pytuple should be a constant type (since obj is constant), potential errors would occur without this const decorator, e.g., when compiling against PyPy. 
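For illustration, a minimal standalone sketch (a hypothetical proxy-style range, not pybind11's actual tuple iterator) of the underlying C++ rule: when a range's operator* yields a temporary by value, a non-const lvalue reference cannot bind to it, while a const reference can.

#include <cstddef>

// Hypothetical proxy-style iterator: operator* returns a value, not a reference.
struct ProxyIter {
  std::size_t i;
  int operator*() const { return static_cast<int>(i); }   // yields a temporary
  ProxyIter& operator++() { ++i; return *this; }
  bool operator!=(const ProxyIter& rhs) const { return i != rhs.i; }
};

// Hypothetical range standing in for a tuple-like container.
struct ProxyRange {
  std::size_t n;
  ProxyIter begin() const { return ProxyIter{0}; }
  ProxyIter end() const { return ProxyIter{n}; }
};

int main() {
  ProxyRange pytuple_like{3};
  // for (auto& o : pytuple_like) {}    // ill-formed: non-const lvalue reference to a temporary
  for (const auto& o : pytuple_like) {  // OK: const reference binds to the temporary
    (void)o;                            // element is only read, matching the const use
  }
  return 0;
}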
Although PyPy is not supported yet, it would still be useful to remove this compilation issue (one of only a few such issues) so that hackers can experiment with it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/11857

Differential Revision: D10024149

Pulled By: soumith

fbshipit-source-id: aa7e08e58f6369233a11477113351dccd3854ba8
---
 torch/csrc/jit/python_ir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp
index 53e16cc0a09f9d..5aa053f626faa1 100644
--- a/torch/csrc/jit/python_ir.cpp
+++ b/torch/csrc/jit/python_ir.cpp
@@ -45,7 +45,7 @@ std::ostream& printPyObject(std::ostream & out, const THPObjectPtr& obj) {
     auto pytuple = pyobj.cast<py::tuple>();
     out << "(";
     size_t i = 0;
-    for (auto& o : pytuple) {
+    for (const auto& o : pytuple) {
       if (i > 0) {
         out << ", ";
       }

From 2cdf98a74df3afa094a7af465004aad128067cf9 Mon Sep 17 00:00:00 2001
From: Maciej Bargiel
Date: Tue, 25 Sep 2018 01:06:19 -0700
Subject: [PATCH 19/51] Back out "Removing some dependency edges from Blob to other caffe2"

Summary: The controller you requested could not be found. Original commit changeset: 2ea17724e223

Differential Revision: D10026321

Ninja: stable broken

fbshipit-source-id: faf87cb7cc0f78c2c10d4aa6fceea279cd27acd6
---
 binaries/benchmark_helper.cc | 6 +-
 binaries/speed_benchmark.cc | 2 +-
 caffe2/contrib/gloo/common.cc | 2 +-
 .../contrib/nervana/nervana_fc_op_gpu_test.cc | 2 +-
 .../contrib/tensorrt/tensorrt_tranformer.cc | 4 +-
 caffe2/core/blob.h | 54 +++---
 caffe2/core/blob_gpu_test.cc | 8 +-
 caffe2/core/blob_serialization.cc | 3 +-
 caffe2/core/blob_test.cc | 30 ++--
 caffe2/core/operator.h | 6 +-
 caffe2/core/plan_executor.cc | 3 +-
 caffe2/core/workspace.h | 2 +-
 caffe2/ideep/operators/concat_split_op.cc | 5 +-
 .../ideep/operators/operator_fallback_ideep.h | 6 +-
 caffe2/ideep/operators/utility_ops.cc | 2 +-
 caffe2/mkl/operators/operator_fallback_mkl.h | 6 +-
 .../contrib/arm-compute/operators/copy_op.cc | 4 +-
 .../arm-compute/test/gl_operator_test.h | 2 +-
 caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm | 4 +-
 .../mobile/contrib/ios/mpscnn/mpscnn_test.mm | 169 +++++++++---------
 caffe2/mobile/contrib/ios/pool_test.cc | 2 +-
 caffe2/mobile/contrib/ios/resize_test.cc | 2 +-
 caffe2/mobile/contrib/nnapi/nnapi.cc | 2 +-
 .../mobile/contrib/nnapi/nnapi_benchmark.cc | 24 +--
 caffe2/mobile/contrib/nnapi/nnapi_test.cc | 28 +--
 .../mobile/contrib/opengl/test/opengl_test.cc | 94 +++++-----
 .../mobile/contrib/snpe/snpe_op_benchmark.cc | 8 +-
 caffe2/mobile/contrib/ulp2/ulp_test.cc | 6 +-
 caffe2/operators/batch_matmul_op_gpu_test.cc | 2 +-
 caffe2/operators/batch_matmul_op_test.cc | 2 +-
 caffe2/operators/boolean_unmask_ops_test.cc | 2 +-
 caffe2/operators/conv_op_shared.cc | 4 +-
 caffe2/operators/conv_op_shared_gpu.cc | 4 +-
 .../conv_transpose_op_mobile_test.cc | 4 +-
 caffe2/operators/dataset_ops.cc | 2 +-
 caffe2/operators/dropout_op_cudnn.cc | 2 +-
 caffe2/operators/elementwise_op_test.h | 2 +-
 .../operators/generate_proposals_op_test.cc | 6 +-
 caffe2/operators/index_ops.cc | 2 +-
 caffe2/operators/onnx_while_op.h | 12 +-
 caffe2/operators/onnxifi_op.cc | 2 +-
 caffe2/operators/operator_fallback_gpu.h | 6 +-
 .../operators/operator_fallback_gpu_test.cc | 4 +-
 caffe2/operators/reshape_op_gpu_test.cc | 2 +-
 .../rnn/recurrent_network_blob_fetcher_op.h | 5 +-
 .../rnn/recurrent_network_executor.h | 4 +-
 caffe2/operators/rnn/recurrent_network_op.h | 22 ++-
 caffe2/operators/roi_align_op_gpu_test.cc | 6 +-
 caffe2/operators/string_ops_test.cc | 14 +-
caffe2/operators/stylizer_ops.cc | 4 +- caffe2/operators/tensor_protos_db_input.h | 6 +- caffe2/operators/tt_linear_op.h | 2 +- caffe2/operators/utility_ops_gpu_test.cc | 2 +- caffe2/operators/utility_ops_test.cc | 2 +- caffe2/opt/fusion.cc | 10 +- caffe2/opt/onnxifi_transformer.cc | 2 +- caffe2/predictor/predictor.cc | 6 +- caffe2/predictor/predictor_test.cc | 6 +- caffe2/python/pybind_state.cc | 2 +- caffe2/python/pybind_state.h | 15 +- caffe2/python/pybind_state_ideep.cc | 4 +- .../depthwise/depthwise3x3_conv_op_test.cc | 2 +- caffe2/share/contrib/nnpack/conv_op.cc | 11 +- caffe2/share/contrib/nnpack/nnpack_test.cc | 2 +- caffe2/utils/hip/math_blas_hip_test.cc | 32 ++-- caffe2/utils/math_gpu_test.cc | 48 ++--- 66 files changed, 371 insertions(+), 380 deletions(-) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index f481a6292c7f56..001c8e965f6a6e 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -163,7 +163,7 @@ void loadInput( CAFFE_THROW("Not support GPU on mobile."); #endif } else { - caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); + caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { @@ -200,7 +200,7 @@ void fillInputBlob( int protos_size = tensor_kv.second.protos_size(); caffe2::TensorProto* tensor_proto = tensor_kv.second.mutable_protos(iteration % protos_size); - caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); + caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); if (tensor_proto->data_type() == caffe2::TensorProto::STRING) { int total_size = tensor_proto->string_data_size(); for (size_t i = 0; i < total_size; i++) { @@ -298,7 +298,7 @@ void writeOutput( #endif } else { writeTextOutput( - BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), + workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU), output_prefix, name); } diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc index fd502cf3c078ab..5914e3f58b44b2 100644 --- a/binaries/speed_benchmark.cc +++ b/binaries/speed_benchmark.cc @@ -137,7 +137,7 @@ int main(int argc, char** argv) { if (blob == nullptr) { blob = workspace->CreateBlob(input_names[i]); } - caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); + caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { diff --git a/caffe2/contrib/gloo/common.cc b/caffe2/contrib/gloo/common.cc index d4929938f19174..21ce0343d81819 100644 --- a/caffe2/contrib/gloo/common.cc +++ b/caffe2/contrib/gloo/common.cc @@ -12,7 +12,7 @@ namespace caffe2 { namespace gloo { void signalFailure(Blob* status_blob, std::exception& /* unused */) { - auto* res = BlobGetMutableTensor(status_blob, CPU); + auto* res = status_blob->GetMutableTensor(CPU); res->Resize(1); res->template mutable_data()[0] = 1; } diff --git a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc index 9eee8973142ed7..972d9231dcf9c6 100644 --- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc +++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc @@ -22,7 +22,7 @@ static void AddConstInput(const std::vector& shape, const float value, option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CUDA); + auto* tensor = blob->GetMutableTensor(CUDA); 
tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 2dd17e00169902..3612d8b46f1f8d 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (BlobIsTensorType(*blob, CPU)) { + if (blob->IsTensorType(CPU)) { const auto& cpu_tensor = blob->template Get(); CPUTensorToTensorProto(cpu_tensor, t); - } else if (BlobIsTensorType(*blob, CUDA)) { + } else if (blob->IsTensorType(CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index 80470cea443331..870fc88322b158 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -6,16 +6,16 @@ #include #include #include -#include "caffe2/core/common.h" -#include +#include "caffe2/core/blob_serializer_base.h" +#include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/tensor.h" +#include "caffe2/core/typeid.h" +#include "caffe2/proto/caffe2_pb.h" namespace caffe2 { -class Tensor; - /** * @brief Blob is a general container that hosts a typed pointer. * @@ -50,6 +50,15 @@ class CAFFE2_API Blob final { return meta_.Match(); } + bool IsTensorType(DeviceType device_type) const { + bool is_match = meta_.Match(); + auto* tensor = static_cast(pointer_); + if (is_match && tensor && tensor->GetDeviceType() == device_type) { + return true; + } + return false; + } + /** * Returns the meta info of the blob. */ @@ -100,6 +109,9 @@ class CAFFE2_API Blob final { std::is_default_constructible::value, "GetMutable can't be called with non-default-constructible types. " "Try using specialized methods"); + static_assert( + !std::is_same::value, + "Use GetMutableTensor(DeviceType) instead"); if (IsType()) { return static_cast(pointer_); } else { @@ -117,6 +129,16 @@ class CAFFE2_API Blob final { } } + inline Tensor* GetMutableTensor(DeviceType device_type) { + if (IsTensorType(device_type)) { + return static_cast(pointer_); + } else { + VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() + << " DeviceType:" << device_type; + return Reset(new Tensor(device_type)); + } + } + /** * Sets the underlying object to the allocated one. The Blob then takes over * the ownership of the passed in pointer. If there is already an object in @@ -226,29 +248,5 @@ inline void swap(Blob& lhs, Blob& rhs) { lhs.swap(rhs); } -inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) { - bool is_match = blob.meta().Match(); - if (!is_match) { - return false; - } - const Tensor* tensor = &blob.Get(); - return tensor && tensor->GetDeviceType() == device_type; -} - -inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) { - if (blob->IsType()) { - Tensor* tensor = blob->GetMutable(); - if (tensor->GetDeviceType() == device_type) { - return tensor; - } - } - - // if we're here, then either Blob didn't hold a Tensor - // or that Tensor had the wrong DeviceType. 
- VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() - << " DeviceType:" << device_type; - return blob->Reset(new Tensor(device_type)); -} - } // namespace caffe2 #endif // CAFFE2_CORE_BLOB_H_ diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 55eafdede7269a..e8fdf47f69ddb0 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -132,7 +132,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { for (int i = 0; i < 6; ++i) { \ cpu_tensor.mutable_data()[i] = static_cast(i); \ } \ - BlobGetMutableTensor(&blob, CUDA)->CopyFrom(cpu_tensor); \ + blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor); \ string serialized = SerializeBlob(blob, "test"); \ BlobProto proto; \ CAFFE_ENFORCE(proto.ParseFromString(serialized)); \ @@ -149,7 +149,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(BlobIsTensorType(new_blob, CUDA)); \ + EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -199,7 +199,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); - EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -207,7 +207,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); - EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 501ec1d89bf70a..38125b242def2f 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -363,8 +363,7 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { auto tensor_proto = blob_proto.tensor(); Deserialize( tensor_proto, - BlobGetMutableTensor( - blob, + blob->GetMutableTensor( static_cast(tensor_proto.device_detail().device_type()))); } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index bb2f4ba6a91818..24b2a2d0593d3a 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(BlobIsTensorType(blob, CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(BlobIsTensorType(blob, CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); - Tensor* tensor_unused CAFFE2_UNUSED = BlobGetMutableTensor(&blob, CPU); - EXPECT_TRUE(BlobIsTensorType(blob, CPU)); + Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); + EXPECT_TRUE(blob.IsTensorType(CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -600,7 +600,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name) \ TEST(TensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - Tensor* tensor = 
BlobGetMutableTensor(&blob, CPU); \ + Tensor* tensor = blob.GetMutableTensor(CPU); \ tensor->Resize(2, 3); \ for (int i = 0; i < 6; ++i) { \ tensor->mutable_data()[i] = static_cast(i); \ @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -634,7 +634,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { \ TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); \ + TensorCPU* tensor = blob.GetMutableTensor(CPU); \ tensor->Resize(0, 3); \ tensor->mutable_data(); \ string serialized = SerializeBlob(blob, "test"); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ -669,7 +669,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data) TEST(TensorTest, TensorSerialization_CustomType) { Blob blob; - TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); + TensorCPU* tensor = blob.GetMutableTensor(CPU); tensor->Resize(2, 3); for (int i = 0; i < 6; ++i) { tensor->mutable_data()[i].val = i; @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); EXPECT_EQ(new_tensor.dim(0), 2); @@ -696,7 +696,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { TEST(TensorTest, Half) { const int64_t kSize = 3000000; Blob blob; - TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); + TensorCPU* tensor = blob.GetMutableTensor(CPU); tensor->Resize(kSize); for (int i = 0; i < tensor->size(); ++i) { tensor->mutable_data()[i].x = i % 10000; @@ -724,7 +724,7 @@ TEST(TensorTest, Half) { } Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -860,7 +860,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { { VLOG(1) << "Test begin"; Blob blob; - Tensor* tensor = BlobGetMutableTensor(&blob, CPU); + Tensor* tensor = blob.GetMutableTensor(CPU); VLOG(1) << "Allocating blob"; tensor->Resize(d1, d2); auto mutableData = tensor->mutable_data(); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(BlobIsTensorType(*new_blob, CPU)); + EXPECT_TRUE(new_blob->IsTensorType(CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); @@ -1030,7 +1030,7 @@ TEST(CustomChunkSize, BigTensorSerialization) { int64_t size = d1 * d2; Blob blob; - TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); + TensorCPU* tensor = blob.GetMutableTensor(CPU); tensor->Resize(d1, 
d2); tensor->mutable_data(); std::mutex mutex; diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index f5683d1497377e..25aa801d265dba 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -122,7 +122,7 @@ class CAFFE2_API OperatorBase : public Observable { static_assert( std::is_same::value, "Output(int, DeviceType) is only available for Tensor"); - return BlobGetMutableTensor(outputs_.at(idx), type); + return outputs_.at(idx)->GetMutableTensor(type); } template @@ -149,7 +149,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool InputIsTensorType(int idx, DeviceType device_type) { - return BlobIsTensorType(*inputs_.at(idx), device_type); + return inputs_.at(idx)->IsTensorType(device_type); } template @@ -162,7 +162,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool OutputIsTensorType(int idx, DeviceType type) { - return BlobIsTensorType(*outputs_.at(idx), type); + return outputs_.at(idx)->IsTensorType(type); } inline int InputSize() const { diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 8e48b6b7beabca..2c0ad9e7a8127b 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -131,8 +131,7 @@ struct WorkspaceIdInjector { "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob"); int32_t global_ws_id = (seq_++) + (static_cast(node_id) << 16); Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID); - TensorCPU* global_ws_id_tensor = - BlobGetMutableTensor(global_ws_id_blob, CPU); + TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU); global_ws_id_tensor->Resize(); global_ws_id_tensor->template mutable_data()[0] = global_ws_id; VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id; diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index cbc58f742c2398..11bf9c413c5966 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -151,7 +151,7 @@ class CAFFE2_API Workspace { auto* to_blob = CreateBlob(blob); CAFFE_ENFORCE(to_blob); const auto& from_tensor = from_blob->template Get(); - auto* to_tensor = BlobGetMutableTensor(to_blob, Context::GetDeviceType()); + auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType()); to_tensor->CopyFrom(from_tensor); } } diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 38ffdc99426452..8d011cd3be8bfa 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,9 +33,8 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE( - BlobIsTensorType(OperatorBase::InputBlob(i), CPU), - "Expect cpu tensor if not itensor"); + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), + "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || tensor_cpu.size_from_dim(0) == 0, diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 3226a08c4af9cf..08e6de2ae3f0dc 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -89,7 +89,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_[i]->Reset(); } input_share_[i] = false; - auto dtensor = BlobGetMutableTensor(local_input_blobs_[i], CPU); + auto dtensor = 
local_input_blobs_[i]->GetMutableTensor(CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - BlobIsTensorType(*local_output_blobs_[i], CPU), + local_output_blobs_[i]->IsTensorType(CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); @@ -153,7 +153,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); dst->Reset(new Tensor(CPU)); - auto dtensor = BlobGetMutableTensor(dst, CPU); + auto dtensor = dst->GetMutableTensor(CPU); dtensor->Resize(src_dims); dtensor->ShareData(src); } diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 468a42df1a9239..626568a989b939 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (BlobIsTensorType(input_blob, CPU)) { + if (input_blob.IsTensorType(CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h index a3135758813ecf..6d9713b74612d8 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator { for (int i = 0; i < InputSize(); ++i) { if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - BlobGetMutableTensor(local_input_blobs_[i], CPU)); + local_input_blobs_[i]->GetMutableTensor(CPU)); } else if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - BlobGetMutableTensor(local_input_blobs_[i], CPU)); + local_input_blobs_[i]->GetMutableTensor(CPU)); } else { VLOG(1) << "Input " << i << " is not MKLMemory. 
Skipping copy."; // Note(jiayq): This removes a const but conceptually @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - BlobIsTensorType(*local_output_blobs_[i], CPU), + local_output_blobs_[i]->IsTensorType(CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc index 06ec2b50acc178..111af03f8602b9 100644 --- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc +++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc @@ -43,7 +43,7 @@ bool CopyFromGLOp::RunOnDevice() { if (first_run_) { first_run_ = false; for (int i = 0; i < Inputs().size(); ++i) { - auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); + auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); Y->Resize(inputs_[i]->dims()); Y->template mutable_data(); } @@ -54,7 +54,7 @@ bool CopyFromGLOp::RunOnDevice() { // GLTensor auto* X = inputs_[i].get(); X->lazy_allocate(Xblob, second_run_, true); - auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); + auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); Timer timer; timer.Start(); getTensorCPU(*X, *Y); diff --git a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h index 68f79e84a89f87..daa7ef008fc7b3 100644 --- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h +++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h @@ -27,7 +27,7 @@ template void PopulateCPUBlob(Workspace *ws, bool random, std::string name, std::vector dims, int val = 1, int dist_shift = 0, float variance = 1) { Blob *blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(dims); T *t_data = tensor->mutable_data(); std::random_device rd; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 742f8e48f4e9e1..52f746f63f317b 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,13 +489,13 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!BlobIsTensorType(*noiseBlob, CPU) || + if (!noiseBlob->IsTensorType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; // Initialize random noise on first use. // Cache it to maintain temporal consistency. 
- auto* t = BlobGetMutableTensor(noiseBlob, CPU); + auto* t = noiseBlob->GetMutableTensor(CPU); t->Resize(noiseSize); math::RandGaussian( t->size(), diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 7ac629019c58c0..7216b16611aa2a 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("stddev"), CPU); + auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. @@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("pw"), CPU); + auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(shared ? 
channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = BlobGetMutableTensor(ws.CreateBlob(name), CPU); + auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. @@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -682,8 +682,8 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPool Test: " << pool; Workspace ws; { - auto* t = BlobGetMutableTensor( - ws.CreateBlob("X_cpu"), CPU); + auto* t = + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,8 +1072,7 @@ void testMPSCNN() { LOG(INFO) << 
"MPSCNNConv Test"; Workspace ws; { - auto* t = - BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1081,7 +1080,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1093,7 +1092,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1189,7 +1188,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1197,7 +1196,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1205,7 +1204,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1276,7 +1275,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1284,7 +1283,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1292,7 +1291,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1386,7 +1385,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1394,7 +1393,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1402,7 +1401,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1494,7 +1493,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1502,7 +1501,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1510,7 +1509,7 @@ void testMPSCNN() { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; 
       math::RandGaussian(
@@ -1608,7 +1607,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNConv Test - group";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(batchSize, C, 12, 16);
       CPUContext ctx;
       math::RandGaussian(
@@ -1616,7 +1615,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
+      auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
       t->Resize(M, C / group, K, K);
       CPUContext ctx;
       math::RandGaussian(
@@ -1624,7 +1623,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
+      auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
       t->Resize(M);
       CPUContext ctx;
       math::RandGaussian(
@@ -1727,7 +1726,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNMul Test";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
+      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 12, 57, 72);
       CPUContext ctx;
       math::RandGaussian(
@@ -1735,7 +1734,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
+      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
       t->Resize(72);
       CPUContext ctx;
       math::RandGaussian(
@@ -1792,7 +1791,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNSub Test";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
+      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 12, 57, 72);
       CPUContext ctx;
       math::RandGaussian(
@@ -1800,7 +1799,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
+      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
       t->Resize(72);
       CPUContext ctx;
       math::RandGaussian(
@@ -1857,7 +1856,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSAdd Test";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
+      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 12, 57, 72);
       CPUContext ctx;
       math::RandGaussian(
@@ -1865,7 +1864,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
+      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 12, 57, 72);
       CPUContext ctx;
       math::RandGaussian(
@@ -1922,7 +1921,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSAdd Test";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
+      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 12, 57, 72);
       CPUContext ctx;
       math::RandGaussian(
@@ -1930,7 +1929,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
+      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 12, 57, 72);
       CPUContext ctx;
       math::RandGaussian(
@@ -2012,7 +2011,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNNeuron Test: " << n;
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 4, 12, 12);
       CPUContext ctx;
       math::RandGaussian(
@@ -2066,7 +2065,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNDropout Test";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 12, 57, 72);
       CPUContext ctx;
       math::RandGaussian(
@@ -2137,7 +2136,7 @@ void testMPSCNN() {
               << " - scale: " << scale;
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
      t->Resize(1, channels, 40, 40);
      CPUContext ctx;
      math::RandGaussian(
@@ -2145,7 +2144,7 @@ void testMPSCNN() {
     }
     {
      // Use the batch-first encoding (n, [bbox])
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU);
+      auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU);
      t->Resize(6, 5);
      for (auto i = 0; i < t->dim32(0); ++i) {
        t->mutable_data()[5 * i + 0] = 0; // batch
@@ -2251,14 +2250,14 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNRoIWarp Test 2";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(1, 8, 40, 40);
       CPUContext ctx;
       math::RandGaussian(
           t->size(), 4, 2, t->mutable_data(), &ctx);
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU);
+      auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU);
       t->Resize(6, 4);
      for (auto i = 0; i < t->dim32(0); ++i) {
        t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale;
@@ -2363,7 +2362,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNResizeNearestOp Test";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(N, C, 37, 89);
       CPUContext ctx;
       math::RandGaussian(
@@ -2498,7 +2497,7 @@ void testMPSCNN() {
     vector im_info{60, 80, 0.166667};
     vector anchors{-38, -16, 53, 31, -120, -120, 135, 135};
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(num_images, A, H, W);
       for (auto i = 0; i < t->size(); ++i) {
         t->mutable_data()[i] = scores[i];
@@ -2506,7 +2505,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("bbox_delta_cpu"), CPU);
+      auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU);
       t->Resize(num_images, 4 * A, H, W);
       for (auto i = 0; i < t->size(); ++i) {
         t->mutable_data()[i] = bbx[i];
@@ -2514,7 +2513,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("im_info"), CPU);
+      auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU);
       t->Resize(num_images, 3);
       for (auto i = 0; i < t->size(); ++i) {
         t->mutable_data()[i] = im_info[i];
@@ -2522,7 +2521,7 @@ void testMPSCNN() {
     }
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("anchors"), CPU);
+      auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU);
       t->Resize(A, 4);
       for (auto i = 0; i < t->size(); ++i) {
         t->mutable_data()[i] = anchors[i];
@@ -2588,7 +2587,7 @@ void testMPSCNN() {
     LOG(INFO) << "MPSCNNSoftmax Test";
     Workspace ws;
     {
-      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
      // Only works for spatial dimension of (1, 1) - weird.
t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2662,8 +2661,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = BlobGetMutableTensor( - ws.CreateBlob("X_cpu"), CPU); + auto* t = + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2676,7 +2675,7 @@ void testMPSCNN() { { auto* t = - BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize( inputChannels, outputChannels, @@ -2693,7 +2692,7 @@ void testMPSCNN() { { auto* t = - BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2810,7 +2809,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, array ? (i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2892,7 +2891,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2965,7 +2964,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3337,8 +3336,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace cws; cws.RunNetOnce(initNet); { - auto* t = BlobGetMutableTensor( - cws.CreateBlob(predictNet.external_input(0)), CPU); + auto* t = + cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3349,8 +3348,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace mws; mws.RunNetOnce(initNet); { - auto* t = BlobGetMutableTensor( - mws.CreateBlob(predictNet.external_input(0)), CPU); + auto* t = + mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3398,16 +3397,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = BlobGetMutableTensor( \ - ws.CreateBlob(predictNet.external_input(0)), CPU); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = \ + ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/mobile/contrib/ios/pool_test.cc b/caffe2/mobile/contrib/ios/pool_test.cc index 3f78c5d1fcd6ae..47fd405eef01e4 100644 --- a/caffe2/mobile/contrib/ios/pool_test.cc +++ b/caffe2/mobile/contrib/ios/pool_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - 
auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ios/resize_test.cc b/caffe2/mobile/contrib/ios/resize_test.cc index 428c395fe442d4..1c08df0f32a1c0 100644 --- a/caffe2/mobile/contrib/ios/resize_test.cc +++ b/caffe2/mobile/contrib/ios/resize_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/nnapi/nnapi.cc b/caffe2/mobile/contrib/nnapi/nnapi.cc index 56f1fc28986a7c..45ea26c44cc964 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi.cc @@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) { output_dims.push_back(dim); } - auto* tensor = BlobGetMutableTensor(ws_.CreateBlob(blob), CPU); + auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU); tensor->Resize(output_dims); outputs->push_back(tensor); diff --git a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc index c14e9ed26376e1..359e7767746b69 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + #include "caffe2/core/init.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" @@ -43,14 +43,14 @@ static double benchmark_conv_caffe2( ws = &localWs; } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); + auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); + auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); if (group == 1) { t->Resize(K, C, kernel, kernel); } else { @@ -61,7 +61,7 @@ static double benchmark_conv_caffe2( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); + auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -129,14 +129,14 @@ static double benchmark_conv_nnapi( ws = &localWs; } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); + auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); + auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -148,7 +148,7 @@ static double benchmark_conv_nnapi( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); + auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -190,7 +190,7 @@ static double benchmark_conv_nnapi( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); + inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < 
warmup; i++) { @@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8( ws = &localWs; } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); + auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, H, W, C); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; } } { - auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); + auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8( // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and // bias_scale == input_scale * filter_scale. { - auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); + auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); t->Resize(K); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; @@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); + inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < warmup; i++) { diff --git a/caffe2/mobile/contrib/nnapi/nnapi_test.cc b/caffe2/mobile/contrib/nnapi/nnapi_test.cc index 9b4608dc07aee1..deab1ca7b43f76 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc @@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) { // CPU reference Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); + inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -103,21 +103,21 @@ static void test_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(K, kernel, kernel, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); + auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -189,7 +189,7 @@ static void test_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); + inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + 
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(1, kernel, kernel, D); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); + auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); t->Resize(D); CPUContext ctx; math::RandGaussian( @@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); + inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -428,7 +428,7 @@ static void test_pooling( int stride_w) { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -496,7 +496,7 @@ static void test_pooling( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); + inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -506,7 +506,7 @@ static void test_pooling( static void test_softmax(int N, int C, int H = 1, int W = 1) { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); if (H == 1 && W == 1) { t->Resize(N, C); } else { @@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); + inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc index 690a33cb854f16..9da266c4e85051 100644 --- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc +++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc @@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1, LOG(INFO) << "OPENGLCopyFrom/To Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -275,7 +275,7 @@ void testOpenGLConv(int N, << " Op: " << glPoolOperationName[poolOp]; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -301,7 +301,7 @@ void testOpenGLConv(int N, } if (poolOp != AveragePool && poolOp != MaxPool) { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) { t->Resize(K, C, kernel_h, kernel_w); } else { @@ -343,7 +343,7 @@ void testOpenGLConv(int N, // bias { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -367,7 +367,7 @@ void testOpenGLConv(int N, } if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) { - 
auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); + auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -532,7 +532,7 @@ void testOpenGLPRelu( << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -541,7 +541,7 @@ void testOpenGLPRelu( // prelu scale { - auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); + auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); t->Resize(prelu_size); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); + auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); + auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) { Workspace ws; { - auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); + auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); + auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -814,8 +814,8 @@ void testOpenGLConcat(int N, std::vector Cs, int H, int W, bool tiling = fa << "H: " << H << ", W: " << W; Workspace ws; for (int i = 0; i < Cs.size(); i++) { - auto* t = BlobGetMutableTensor( - ws.CreateBlob("X_cpu" + caffe2::to_string(i)), CPU); + auto* t = + ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU); t->Resize(N, Cs[i], H, W); CPUContext ctx0; // Too noisy. @@ -891,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
@@ -942,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 2, t->mutable_data(), &ctx); @@ -992,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); + auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); t->Resize(1); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); @@ -1060,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) { LOG(INFO) << "OpenGL Softmax Test " << "N: " << N << " D: " << D << " Tiled:" << tiled; Workspace ws; - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); { t->Resize(N, D); CPUContext ctx; @@ -1151,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -1163,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { // scale { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1172,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { } // bias { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1254,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
@@ -1266,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { // scale { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1275,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // bias { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1284,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // prelu scale { - auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); + auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); t->Resize(C); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1385,7 +1385,7 @@ void OpenGL_speedtest(int N, << " C: " << C << " H: " << H << " W: " << W; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1399,7 +1399,7 @@ void OpenGL_speedtest(int N, } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1413,7 +1413,7 @@ void OpenGL_speedtest(int N, } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1479,7 +1479,7 @@ void testOpenGLPadImage( { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1593,7 +1593,7 @@ void testOpenGLResize(int N, { Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1675,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGL Preprocess Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, H, W, C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1684,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -1748,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGLDeprocess Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1757,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1800,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, 
float error) { LOG(INFO) << "OpenGLNormPlanarYUV Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, 3, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1809,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1818,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("stdev"), CPU); + auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 6; @@ -1879,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N, LOG(INFO) << "OpenGL CopyOps Speed Test"; Workspace ws; { - auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1893,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1907,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1990,8 +1990,8 @@ void compareModelsForOpenGL(std::string name, Workspace cws; cws.RunNetOnce(initNet); - auto* t_cpu = BlobGetMutableTensor( - cws.CreateBlob(truncatedPredictNet.external_input(0)), CPU); + auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0)) + ->GetMutableTensor(CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2032,8 +2032,8 @@ void compareModelsForOpenGL(std::string name, Workspace mws; mws.RunNetOnce(initNet); - auto* t_gl = BlobGetMutableTensor( - mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)), CPU); + auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)) + ->GetMutableTensor(CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2116,7 +2116,7 @@ void compareBatchedToTiledModels(std::string name, tws.RunNetOnce(initNet); auto* t_batch = - BlobGetMutableTensor(tws.CreateBlob(bachedNet.external_input(0)), CPU); + tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2143,7 +2143,7 @@ void compareBatchedToTiledModels(std::string name, bws.RunNetOnce(initNet); auto* t_tiling = - BlobGetMutableTensor(bws.CreateBlob(tiledNet.external_input(0)), CPU); + bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); diff --git a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc index cfeed00e8b9730..deced719644963 100644 --- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc +++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc @@ -14,7 +14,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* 
_tensor = BlobGetMutableTensor(_blob, CPU); \ + auto* _tensor = _blob->GetMutableTensor(CPU); \ _tensor->Resize((_s)); \ memcpy(_tensor->mutable_data(), data_##_l, _tensor->nbytes()); \ } while (0) @@ -23,7 +23,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* _tensor = BlobGetMutableTensor(_blob, CPU); \ + auto* _tensor = _blob->GetMutableTensor(CPU); \ _tensor->Resize((_s)); \ memset(_tensor->mutable_data(), 1, _tensor->nbytes()); \ } while (0) @@ -43,7 +43,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), @@ -56,7 +56,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ulp2/ulp_test.cc b/caffe2/mobile/contrib/ulp2/ulp_test.cc index 6316b05284fba9..a1c1af0f6dfb8d 100644 --- a/caffe2/mobile/contrib/ulp2/ulp_test.cc +++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc @@ -289,13 +289,13 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r)); def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t)); def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b)); - auto* Xws = BlobGetMutableTensor(ws.CreateBlob("X"), CPU); + auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU); Xws->ResizeLike(X); Xws->ShareExternalPointer(X.mutable_data(), X.size()); - auto* Wws = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); + auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU); Wws->ResizeLike(W_); Wws->ShareExternalPointer(W_.mutable_data(), W_.size()); - auto* bws = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); + auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU); bws->ResizeLike(bias); bws->ShareExternalPointer(bias.mutable_data(), bias.size()); ws.RunOperatorOnce(def); diff --git a/caffe2/operators/batch_matmul_op_gpu_test.cc b/caffe2/operators/batch_matmul_op_gpu_test.cc index 31e179b3e41f82..804296307d6ef8 100644 --- a/caffe2/operators/batch_matmul_op_gpu_test.cc +++ b/caffe2/operators/batch_matmul_op_gpu_test.cc @@ -30,7 +30,7 @@ class BatchMatMulOpGPUTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CUDA); + auto* tensor = blob->GetMutableTensor(CUDA); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/batch_matmul_op_test.cc b/caffe2/operators/batch_matmul_op_test.cc index c74829b4f8f9c5..45db7dd5b8484a 100644 --- a/caffe2/operators/batch_matmul_op_test.cc +++ b/caffe2/operators/batch_matmul_op_test.cc @@ -24,7 +24,7 @@ class BatchMatMulOpTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/boolean_unmask_ops_test.cc b/caffe2/operators/boolean_unmask_ops_test.cc index b0c5f7dcdfff0b..8814be17153d44 100644 --- a/caffe2/operators/boolean_unmask_ops_test.cc +++ b/caffe2/operators/boolean_unmask_ops_test.cc @@ -16,7 +16,7 @@ 
static void AddScalarInput( Workspace* ws, bool isEmpty = false) { Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); if (!isEmpty) { tensor->Resize(vector{1}); *(tensor->template mutable_data()) = value; diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc index 155b6f0cd24561..b9f54b6d55be7c 100644 --- a/caffe2/operators/conv_op_shared.cc +++ b/caffe2/operators/conv_op_shared.cc @@ -27,8 +27,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* buffer = BlobGetMutableTensor( - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__"), CPU); + auto* buffer = + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutableTensor(CPU); f(buffer); } } diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc index c1f37c7f1362f2..f80d15a5d9054b 100644 --- a/caffe2/operators/conv_op_shared_gpu.cc +++ b/caffe2/operators/conv_op_shared_gpu.cc @@ -20,8 +20,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* buffer = BlobGetMutableTensor( - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__"), CUDA); + auto* buffer = + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")->GetMutableTensor(CUDA); f(buffer); } } diff --git a/caffe2/operators/conv_transpose_op_mobile_test.cc b/caffe2/operators/conv_transpose_op_mobile_test.cc index 3bc2951664353b..6eb45eb5f8d17c 100644 --- a/caffe2/operators/conv_transpose_op_mobile_test.cc +++ b/caffe2/operators/conv_transpose_op_mobile_test.cc @@ -17,7 +17,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -29,7 +29,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc index e3c0abe83d8b4e..83294224280831 100644 --- a/caffe2/operators/dataset_ops.cc +++ b/caffe2/operators/dataset_ops.cc @@ -1428,7 +1428,7 @@ class TreeCursorSerializer : public BlobSerializerBase { // serialize offsets as a tensor if (cursor->offsets.size() > 0) { Blob offsets_blob; - auto* offsets = BlobGetMutableTensor(&offsets_blob, CPU); + auto* offsets = offsets_blob.GetMutableTensor(CPU); offsets->Resize(cursor->offsets.size()); std::copy( cursor->offsets.begin(), diff --git a/caffe2/operators/dropout_op_cudnn.cc b/caffe2/operators/dropout_op_cudnn.cc index 8a40c731143f44..a68a1263f6f451 100644 --- a/caffe2/operators/dropout_op_cudnn.cc +++ b/caffe2/operators/dropout_op_cudnn.cc @@ -150,7 +150,7 @@ bool CuDNNDropoutOp::DoRunWithType() { // Reshape tensor descriptors if necessary if (X.dims() != cudnn_input_dims_ && !is_test_) { CAFFE_ENFORCE(scratch_blob_); - Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA); + Tensor* states = scratch_blob_->GetMutableTensor(CUDA); cudnn_input_dims_ = X.dims(); CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( data_desc_, diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index b785d040c8f1a7..bcd547e28f0989 
100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -19,7 +19,7 @@ void FillTensor( const std::vector& shape, const std::vector& values) { auto* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); + auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); tensor->Resize(shape); auto* mutable_data = tensor->template mutable_data(); const O_Type* data = reinterpret_cast(values.data()); diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index da7fdc650879c3..2b3a033a665df7 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -18,7 +18,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -34,7 +34,7 @@ static void AddLinSpacedInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -51,7 +51,7 @@ static void AddInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc index 2fb8f3b338dc64..241b0ff97c6070 100644 --- a/caffe2/operators/index_ops.cc +++ b/caffe2/operators/index_ops.cc @@ -353,7 +353,7 @@ class IndexSerializer : public BlobSerializerBase { SerializationAcceptor acceptor) override { auto& base = blob.template Get>(); Blob tensor_blob; - auto* tensor_out = BlobGetMutableTensor(&tensor_blob, CPU); + auto* tensor_out = tensor_blob.GetMutableTensor(CPU); if (base->Type().Match()) { doStore(base, tensor_out); diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h index 7a3c34cfbf7cce..dbd5103952469c 100644 --- a/caffe2/operators/onnx_while_op.h +++ b/caffe2/operators/onnx_while_op.h @@ -213,23 +213,23 @@ class ONNXWhileOp final : public Operator { lcd_tensors_.clear(); for (int i = 2; i < body_net_def.external_input_size(); ++i) { Blob* b = loop_ws_->CreateBlob(body_net_def.external_input(i)); - Tensor* t = BlobGetMutableTensor(b, Context::GetDeviceType()); + Tensor* t = b->GetMutableTensor(Context::GetDeviceType()); lcd_tensors_.push_back(t); } // First output is the iteration variable auto* iteration_var_blob = loop_ws_->CreateBlob( body_net_def.external_input(0)); iteration_var_ = - BlobGetMutableTensor(iteration_var_blob, Context::GetDeviceType()); + iteration_var_blob->GetMutableTensor(Context::GetDeviceType()); - input_condition_var_ = BlobGetMutableTensor( - loop_ws_->CreateBlob(body_net_def.external_input(1)), - Context::GetDeviceType()); + input_condition_var_ = + loop_ws_->CreateBlob(body_net_def.external_input(1)) + ->GetMutableTensor(Context::GetDeviceType()); auto* condition_var_blob = loop_ws_->CreateBlob(body_net_def.external_output(0)); condition_var_ = - BlobGetMutableTensor(condition_var_blob, Context::GetDeviceType()); + 
condition_var_blob->GetMutableTensor(Context::GetDeviceType()); condition_var_->Resize(1); condition_var_->template mutable_data(); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 767a37d5fc7924..d1b0824f1b3191 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - BlobIsTensorType(*blob, CPU), + blob->IsTensorType(CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index 5b3a38dbfbd13d..8ef39e7c0e78d1 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -65,8 +65,8 @@ class GPUFallbackOpEx final : public Operator { bool need_sync = false; for (int i = 0; i < InputSize(); ++i) { if (this->InputIsTensorType(i, CUDA)) { - BlobGetMutableTensor(local_input_blobs_[i], CPU) - ->CopyFrom(Input(i), &context_); + local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( + Input(i), &context_); need_sync = true; } else { VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy."; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - BlobIsTensorType(*local_output_blobs_[i], CPU), + local_output_blobs_[i]->IsTensorType(CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/operator_fallback_gpu_test.cc b/caffe2/operators/operator_fallback_gpu_test.cc index 0870a4be2dd7bd..964708bc10906f 100644 --- a/caffe2/operators/operator_fallback_gpu_test.cc +++ b/caffe2/operators/operator_fallback_gpu_test.cc @@ -40,7 +40,7 @@ TEST(OperatorFallbackTest, IncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(source_tensor); + ws.CreateBlob("X")->GetMutableTensor(CPU)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); @@ -64,7 +64,7 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - BlobGetMutableTensor(ws.CreateBlob("X"), CUDA)->CopyFrom(source_tensor); + ws.CreateBlob("X")->GetMutableTensor(CUDA)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); diff --git a/caffe2/operators/reshape_op_gpu_test.cc b/caffe2/operators/reshape_op_gpu_test.cc index d4ac325a78b80a..3537ab69d058f0 100644 --- a/caffe2/operators/reshape_op_gpu_test.cc +++ b/caffe2/operators/reshape_op_gpu_test.cc @@ -20,7 +20,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CUDA); + auto* tensor = blob->GetMutableTensor(CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h index 63d58f3ccd8f6d..98675cea858d54 100644 --- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h +++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h @@ -43,10 +43,11 @@ class RecurrentNetworkBlobFetcherOp final : public 
Operator { prefix_ + std::string("_") + blob_name + caffe2::to_string(i); blob_names_vector.push_back(newBlobName); - BlobGetMutableTensor(ws_->CreateBlob(newBlobName), CPU) + ws_->CreateBlob(newBlobName) + ->GetMutableTensor(CPU) ->ResizeLike(currentTensor); auto type = Context::GetDeviceType(); - auto* newTensor = BlobGetMutableTensor(ws_->GetBlob(newBlobName), type); + auto* newTensor = ws_->GetBlob(newBlobName)->GetMutableTensor(type); newTensor->CopyFrom(currentTensor); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h index 4cb53a6d7d330a..7e37e562e77a50 100644 --- a/caffe2/operators/rnn/recurrent_network_executor.h +++ b/caffe2/operators/rnn/recurrent_network_executor.h @@ -111,10 +111,10 @@ class RecurrentNetworkExecutorBase { // the forward-only mode. std::string this_timestep_blob = timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t); - BlobGetMutableTensor(ws->CreateBlob(this_timestep_blob), CPU)->Resize(1); + ws->CreateBlob(this_timestep_blob)->GetMutableTensor(CPU)->Resize(1); auto b = ws->GetBlob(this_timestep_blob); CAFFE_ENFORCE(b); - BlobGetMutableTensor(b, CPU)->template mutable_data()[0] = t; + b->GetMutableTensor(CPU)->template mutable_data()[0] = t; // Copy the operators from template for (auto& template_rnn_op : timestep_ops_template_) { diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h index 21b3064a6fac3c..2421bc44263afd 100644 --- a/caffe2/operators/rnn/recurrent_network_op.h +++ b/caffe2/operators/rnn/recurrent_network_op.h @@ -52,11 +52,10 @@ struct CAFFE2_API ScratchWorkspaces { }; inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) { - BlobGetMutableTensor(ws->CreateBlob(blob_name), CPU)->Resize(1); + ws->CreateBlob(blob_name)->GetMutableTensor(CPU)->Resize(1); auto timestepBlob = ws->GetBlob(blob_name); CAFFE_ENFORCE(timestepBlob); - BlobGetMutableTensor(timestepBlob, CPU)->template mutable_data()[0] = - t; + timestepBlob->GetMutableTensor(CPU)->template mutable_data()[0] = t; } CAFFE2_API std::map GetRecurrentMapping( @@ -72,9 +71,8 @@ void applyOffsetAlias( << " at offset: " << oc.offset; auto srcBlob = ws->GetBlob(oc.src); CAFFE_ENFORCE(srcBlob); - auto* src = BlobGetMutableTensor(srcBlob, Context::GetDeviceType()); - auto* dst = - BlobGetMutableTensor(ws->GetBlob(oc.dst), Context::GetDeviceType()); + auto* src = srcBlob->GetMutableTensor(Context::GetDeviceType()); + auto* dst = ws->GetBlob(oc.dst)->GetMutableTensor(Context::GetDeviceType()); auto timestep = src->size() / src->dim(0); auto dims = src->dims(); const int32_t startDstTimestep = @@ -115,7 +113,7 @@ void initializeRecurrentInput( Context* context) { auto stateBlob = ws->GetBlob(rc.state); CAFFE_ENFORCE(stateBlob); - auto* state = BlobGetMutableTensor(stateBlob, Context::GetDeviceType()); + auto* state = stateBlob->GetMutableTensor(Context::GetDeviceType()); auto inputBlob = ws->GetBlob(rc.input); CAFFE_ENFORCE(inputBlob); @@ -662,7 +660,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->GetBlob(param.grad); CAFFE_ENFORCE(gBlob); - auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); + auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); g->ResizeLike(p); math::Set( g->size(), @@ -678,7 +676,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->CreateBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); + auto* g 
= gBlob->GetMutableTensor(Context::GetDeviceType()); g->ResizeLike(p); CAFFE_ENFORCE_EQ(g->ndim(), 3); const auto timestep = g->size() / g->dim(0); @@ -705,7 +703,7 @@ class RecurrentNetworkGradientOp final : public Operator { << ". Size: " << Input(gradientInputIndex).size(); auto pGradientBlob = sharedWs_->GetBlob(gradientName); CAFFE_ENFORCE(pGradientBlob); - auto* g = BlobGetMutableTensor(pGradientBlob, Context::GetDeviceType()); + auto* g = pGradientBlob->GetMutableTensor(Context::GetDeviceType()); g->ResizeLike(Input(gradientInputIndex)); g->template mutable_data(); } @@ -719,7 +717,7 @@ class RecurrentNetworkGradientOp final : public Operator { << rg.lastExternalGrad << " for final time step (sep. blob)"; auto gBlob = sharedWs_->GetBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); + auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); auto oglastBlob = sharedWs_->GetBlob(rg.lastExternalGrad); CAFFE_ENFORCE(oglastBlob); @@ -781,7 +779,7 @@ class RecurrentNetworkGradientOp final : public Operator { T* output_data = Output(outputIdx)->template mutable_data(); auto pBlob = sharedWs_->GetBlob(recurrentGradients_[i].grad); CAFFE_ENFORCE(pBlob); - auto* p = BlobGetMutableTensor(pBlob, Context::GetDeviceType()); + auto* p = pBlob->GetMutableTensor(Context::GetDeviceType()); if (Input(inputId).ndim() >= 2) { // Gradient states blob should live. And if it gets changed by the diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc index 7257ec44c25984..2647a97d6f0b90 100644 --- a/caffe2/operators/roi_align_op_gpu_test.cc +++ b/caffe2/operators/roi_align_op_gpu_test.cc @@ -18,7 +18,7 @@ void AddConstInput( Context* context, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); + auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), context); @@ -39,7 +39,7 @@ void AddInput( const string& name, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -57,7 +57,7 @@ void AddInput( tmp_vec.array() = utils::AsEArrXt(values); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CUDA); + auto* tensor = blob->GetMutableTensor(CUDA); tensor->CopyFrom(tmp); } diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index 2092ae804f2c3b..c9ba13efb50258 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -9,7 +9,7 @@ class StringJoinOpTest : public testing::Test { public: bool runOp(const TensorCPU& input) { auto* blob = ws_.CreateBlob("X"); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->ResizeLike(input); tensor->ShareData(input); @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(BlobIsTensorType(*output, CPU)); + EXPECT_TRUE(output->IsTensorType(CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); @@ -42,7 +42,7 @@ TEST_F(StringJoinOpTest, testString1DJoin) { 
std::vector input = {"a", "xx", "c"}; auto blob = caffe2::make_unique(); - auto* tensor = BlobGetMutableTensor(blob.get(), CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -62,7 +62,7 @@ TEST_F(StringJoinOpTest, testString2DJoin) { {"dd", "ee", "ff"}}; auto blob = caffe2::make_unique(); - auto* tensor = BlobGetMutableTensor(blob.get(), CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -82,7 +82,7 @@ TEST_F(StringJoinOpTest, testFloat1DJoin) { std::vector input = {3.90f, 5.234f, 8.12f}; auto blob = caffe2::make_unique(); - auto* tensor = BlobGetMutableTensor(blob.get(), CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -102,7 +102,7 @@ TEST_F(StringJoinOpTest, testFloat2DJoin) { {4.67f, 5.90f, 6.32f}}; auto blob = caffe2::make_unique(); - auto* tensor = BlobGetMutableTensor(blob.get(), CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -122,7 +122,7 @@ TEST_F(StringJoinOpTest, testLong2DJoin) { std::vector> input = {{100, 200}, {1000, 2000}}; auto blob = caffe2::make_unique(); - auto* tensor = BlobGetMutableTensor(blob.get(), CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index bfc41a462999b5..a6d395fe9ba647 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,10 +82,10 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!BlobIsTensorType(*noiseBlob, CPU)) { + if (!noiseBlob->IsTensorType(CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. 
- auto* t = BlobGetMutableTensor(noiseBlob, CPU); + auto* t = noiseBlob->GetMutableTensor(CPU); #if defined(__ARM_NEON__) || defined(__ARM_NEON) // Noise space is larger for vectorized code due to the diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h index e9f5b1a8f8455f..cd081bf959e399 100644 --- a/caffe2/operators/tensor_protos_db_input.h +++ b/caffe2/operators/tensor_protos_db_input.h @@ -56,7 +56,7 @@ bool TensorProtosDBInput::Prefetch() { protos.mutable_protos(i)->clear_device_detail(); } deserializer.Deserialize( - protos.protos(i), BlobGetMutableTensor(&prefetched_blobs_[i], CPU)); + protos.protos(i), prefetched_blobs_[i].GetMutableTensor(CPU)); } } else { vector temp_tensors; @@ -74,11 +74,11 @@ bool TensorProtosDBInput::Prefetch() { vector dims( protos.protos(i).dims().begin(), protos.protos(i).dims().end()); dims.insert(dims.begin(), batch_size_); - BlobGetMutableTensor(&prefetched_blobs_[i], CPU)->Resize(dims); + prefetched_blobs_[i].GetMutableTensor(CPU)->Resize(dims); } } for (int i = 0; i < protos.protos_size(); ++i) { - TensorCPU* dst = BlobGetMutableTensor(&prefetched_blobs_[i], CPU); + TensorCPU* dst = prefetched_blobs_[i].GetMutableTensor(CPU); TensorCPU& src = temp_tensors[i]; if (protos.protos(i).has_device_detail()) { protos.mutable_protos(i)->clear_device_detail(); diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h index 1a5cdc344ce4a8..421c26e318b6e9 100644 --- a/caffe2/operators/tt_linear_op.h +++ b/caffe2/operators/tt_linear_op.h @@ -52,7 +52,7 @@ class TTLinearOp final : public Operator { int cores_idx = 0; // Temporary buffer to facilitate multiplication of TT-cores with input - auto Y_buf = BlobGetMutableTensor(Y_temp_.get(), Context::GetDeviceType()); + auto Y_buf = Y_temp_->GetMutableTensor(Context::GetDeviceType()); Y_buf->ResizeLike(X); Y_buf->CopyFrom(X); diff --git a/caffe2/operators/utility_ops_gpu_test.cc b/caffe2/operators/utility_ops_gpu_test.cc index 1099d900cbefdc..f500afaf9ed24f 100644 --- a/caffe2/operators/utility_ops_gpu_test.cc +++ b/caffe2/operators/utility_ops_gpu_test.cc @@ -19,7 +19,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CUDA); + auto* tensor = blob->GetMutableTensor(CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/utility_ops_test.cc b/caffe2/operators/utility_ops_test.cc index a3a2a409674edd..379dd52655c4f4 100644 --- a/caffe2/operators/utility_ops_test.cc +++ b/caffe2/operators/utility_ops_test.cc @@ -16,7 +16,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index 8c324a97c50934..fdf5fdc31e1046 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -44,10 +44,10 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { CAFFE_ENFORCE( bnInputs.size() >= 5, "Invalid batch normalization input size"); -#define EXPOSE_TENSOR_DATA(name, index, inputs) \ - auto name = repr::nn::get(inputs[index]); \ - assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ - auto name##Tensor = 
BlobGetMutableTensor(ws->GetBlob(name->getName()), CPU); \ +#define EXPOSE_TENSOR_DATA(name, index, inputs) \ + auto name = repr::nn::get(inputs[index]); \ + assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ + auto name##Tensor = ws->GetBlob(name->getName())->GetMutableTensor(CPU); \ auto name##Data = name##Tensor->mutable_data(); EXPOSE_TENSOR_DATA(filter, 1, convInputs); @@ -76,7 +76,7 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { nn->dataFlow.createEdge(convBiasNode, convNode); auto* blob = ws->CreateBlob(convBiasName); - caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); + caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); CHECK_NOTNULL(tensor); // Get output channel size_t c = filterTensor->dim32(0); diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index a048503fea99c7..ce79df56ecb728 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -173,7 +173,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp( // Feed into workspace as CPU Tensors auto* blob = ws->CreateBlob(t.name()); - auto* cpu_tensor = BlobGetMutableTensor(blob, CPU); + auto* cpu_tensor = blob->GetMutableTensor(CPU); std::vector dims; for(const auto& d : t.dims()) { dims.push_back(d); diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 7775e69776450c..84dac93753d37a 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,14 +10,14 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - BlobIsTensorType(*blob, CPU), "Blob is not a CPU Tensor: ", name); + blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); } TensorCPU* getTensor(Workspace* ws, const std::string& name) { enforceIsTensor(ws, name); auto* blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist"); - return BlobGetMutableTensor(blob, CPU); + return blob->GetMutableTensor(CPU); } void shareInputTensor( @@ -60,7 +60,7 @@ Predictor::Predictor(PredictorConfig config) : config_(std::move(config)) { for (const auto& name : config_.predict_net->external_input()) { if (!initialized.count(name)) { auto* blob = config_.ws->CreateBlob(name); - BlobGetMutableTensor(blob, CPU); + blob->GetMutableTensor(CPU); } } CAFFE_ENFORCE(config_.ws->CreateNet(config_.predict_net)); diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc index a0245cd7a86d66..ae4f73e9da0ad7 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -135,7 +135,7 @@ std::unique_ptr randomTensor( const std::vector& dims, CPUContext* ctx) { auto blob = make_unique(); - auto* t = BlobGetMutableTensor(blob.get(), CPU); + auto* t = blob->GetMutableTensor(CPU); t->Resize(dims); math::RandUniform( t->size(), -1.0, 1.0, t->template mutable_data(), ctx); @@ -180,7 +180,7 @@ TEST_F(PredictorTest, SimpleBatchSized) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorList input; input.emplace_back(CPU); - auto tensor = BlobGetMutableTensor(inputData.get(), CPU); + auto tensor = inputData->GetMutableTensor(CPU); input.back().ResizeLike(*tensor); input.back().ShareData(*tensor); Predictor::TensorList output; @@ -196,7 +196,7 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorMap input; auto iter = input.emplace("data", Tensor(CPU)); - 
auto tensor = BlobGetMutableTensor(inputData.get(), CPU); + auto tensor = inputData->GetMutableTensor(CPU); iter.first->second.ResizeLike(*tensor); iter.first->second.ShareData(*tensor); diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 9a1d715bfdf225..81197047102ffb 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -328,7 +328,7 @@ void addObjectMethods(py::module& m) { }) .def( "tensor", - [](Blob* blob) { return py::cast(BlobGetMutableTensor(blob, CPU)); }, + [](Blob* blob) { return py::cast(blob->GetMutableTensor(CPU)); }, py::return_value_policy::reference_internal) .def( "_feed", diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 50c64f6c9e44c8..97ec6628fe3f27 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -234,7 +234,7 @@ class TensorFeeder : public BlobFeederBase { FeedTensor( option, original_array, - BlobGetMutableTensor(blob, Context::GetDeviceType())); + blob->GetMutableTensor(Context::GetDeviceType())); } }; @@ -366,32 +366,31 @@ class PythonOpBase : public Operator { // make sure output blob is initialized before creating the binding if (forced_cpu_outputs_.count(i)) { - BlobGetMutableTensor(blob, Context::GetDeviceType()); + blob->GetMutableTensor(Context::GetDeviceType()); } else { - BlobGetMutableTensor(blob, Context::GetDeviceType()); + blob->GetMutableTensor(Context::GetDeviceType()); } py::object py_obj; if (blob->template IsType()) { if (use_dlpack) { DLPackWrapper wrapper( - BlobGetMutableTensor(blob, Context::GetDeviceType()), - cpu_option); + blob->GetMutableTensor(Context::GetDeviceType()), cpu_option); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - BlobGetMutableTensor(blob, Context::GetDeviceType()), + blob->GetMutableTensor(Context::GetDeviceType()), py::return_value_policy::reference); } } else { if (use_dlpack) { DLPackWrapper wrapper( - BlobGetMutableTensor(blob, Context::GetDeviceType()), + blob->GetMutableTensor(Context::GetDeviceType()), this->device_option()); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - BlobGetMutableTensor(blob, Context::GetDeviceType()), + blob->GetMutableTensor(Context::GetDeviceType()), py::return_value_policy::reference); } } diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index f0307f7b6485d2..ebad6cf8d96839 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -163,8 +163,8 @@ class IDeepFeeder : public BlobFeederBase { DeviceOption cpu_option(option); cpu_option.set_device_type(DeviceTypeProto::PROTO_CPU); TensorFeeder cpu_tensor_feeder; - cpu_tensor_feeder.FeedTensor( - cpu_option, original_array, BlobGetMutableTensor(blob, CPU)); + cpu_tensor_feeder.FeedTensor(cpu_option, original_array, + blob->GetMutableTensor(CPU)); } } catch (ideep::error &e) { LOG(ERROR) << "IDEEP error: " << e.message; diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc index d102985e2fd7aa..4ac3524d49d8a6 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); 
math::RandGaussian( diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc index f11e05b67392c9..05c945106c52da 100644 --- a/caffe2/share/contrib/nnpack/conv_op.cc +++ b/caffe2/share/contrib/nnpack/conv_op.cc @@ -231,12 +231,11 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { (transformedFilterSize + sizeof(float) - 1) / sizeof(float); for (auto g = 0; g < group_; g++) { - transformedFilters_[g] = BlobGetMutableTensor( - ws_->CreateBlob( - "__transformed_kernel_" + - to_string( - __sync_fetch_and_add(&precomputed_transform_id, 1))), - CPU); + transformedFilters_[g] = ws_->CreateBlob( + "__transformed_kernel_" + + to_string(__sync_fetch_and_add( + &precomputed_transform_id, 1))) + ->GetMutableTensor(CPU); transformedFilters_[g]->Resize(transformedFilterElements); status = nnp_convolution_inference( diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc index 10eb6348becc06..2f892118982da2 100644 --- a/caffe2/share/contrib/nnpack/nnpack_test.cc +++ b/caffe2/share/contrib/nnpack/nnpack_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = BlobGetMutableTensor(blob, CPU); + auto* tensor = blob->GetMutableTensor(CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc index a5df5900ee23a2..911c2b09868fc3 100644 --- a/caffe2/utils/hip/math_blas_hip_test.cc +++ b/caffe2/utils/hip/math_blas_hip_test.cc @@ -26,13 +26,13 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) { vector shapeX{5, 10}; vector shapeW{10, 6}; vector shapeY{5, 6}; - auto* tensorX = BlobGetMutableTensor(blobX, HIP); + auto* tensorX = blobX->GetMutableTensor(HIP); tensorX->Resize(shapeX); - auto* tensorW = BlobGetMutableTensor(blobW, HIP); + auto* tensorW = blobW->GetMutableTensor(HIP); tensorW->Resize(shapeW); - auto* tensorY = BlobGetMutableTensor(blobY, HIP); + auto* tensorY = blobY->GetMutableTensor(HIP); tensorY->Resize(shapeY); - auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); + auto* tensorY_host = blobY_host->GetMutableTensor(CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -126,13 +126,13 @@ TEST(MathROCBLASTest, GemmNoTransTrans) { vector shapeX{5, 10}; vector shapeW{6, 10}; vector shapeY{5, 6}; - auto* tensorX = BlobGetMutableTensor(blobX, HIP); + auto* tensorX = blobX->GetMutableTensor(HIP); tensorX->Resize(shapeX); - auto* tensorW = BlobGetMutableTensor(blobW, HIP); + auto* tensorW = blobW->GetMutableTensor(HIP); tensorW->Resize(shapeW); - auto* tensorY = BlobGetMutableTensor(blobY, HIP); + auto* tensorY = blobY->GetMutableTensor(HIP); tensorY->Resize(shapeY); - auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); + auto* tensorY_host = blobY_host->GetMutableTensor(CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -225,13 +225,13 @@ TEST(MathROCBLASTest, GemvNoTrans) { vector shapeA{5, 10}; vector shapeX{10}; vector shapeY{5}; - auto* tensorA = BlobGetMutableTensor(blobA, HIP); + auto* tensorA = blobA->GetMutableTensor(HIP); tensorA->Resize(shapeA); - auto* tensorX = BlobGetMutableTensor(blobX, HIP); + auto* tensorX = blobX->GetMutableTensor(HIP); tensorX->Resize(shapeX); - auto* tensorY = BlobGetMutableTensor(blobY, HIP); + auto* tensorY = blobY->GetMutableTensor(HIP); tensorY->Resize(shapeY); - auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); + auto* tensorY_host = 
blobY_host->GetMutableTensor(CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 50); @@ -315,13 +315,13 @@ TEST(MathROCBLASTest, GemvTrans) { vector shapeA{6, 10}; vector shapeX{6}; vector shapeY{10}; - auto* tensorA = BlobGetMutableTensor(blobA, HIP); + auto* tensorA = blobA->GetMutableTensor(HIP); tensorA->Resize(shapeA); - auto* tensorX = BlobGetMutableTensor(blobX, HIP); + auto* tensorX = blobX->GetMutableTensor(HIP); tensorX->Resize(shapeX); - auto* tensorY = BlobGetMutableTensor(blobY, HIP); + auto* tensorY = blobY->GetMutableTensor(HIP); tensorY->Resize(shapeY); - auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); + auto* tensorY_host = blobY_host->GetMutableTensor(CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 60); diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc index 4b0247a0786fcc..9be1c3db6c1d01 100644 --- a/caffe2/utils/math_gpu_test.cc +++ b/caffe2/utils/math_gpu_test.cc @@ -41,9 +41,9 @@ void executeGpuBinaryOpTest( Blob* bloby = ws.CreateBlob("Y"); Blob* bloby_host = ws.CreateBlob("Y_host"); - auto* tensorx0 = BlobGetMutableTensor(blobx0, CUDA); - auto* tensorx1 = BlobGetMutableTensor(blobx1, CUDA); - auto* tensory = BlobGetMutableTensor(bloby, CUDA); + auto* tensorx0 = blobx0->GetMutableTensor(CUDA); + auto* tensorx1 = blobx1->GetMutableTensor(CUDA); + auto* tensory = bloby->GetMutableTensor(CUDA); vector shapex0_vector{shapex0}; vector shapex1_vector{shapex1}; @@ -71,7 +71,7 @@ void executeGpuBinaryOpTest( context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); + auto* tensory_host = bloby_host->GetMutableTensor(CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -94,7 +94,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { vector shapex{33 * 9, 25}; vector shapey{33, 25}; - auto* tensorx = BlobGetMutableTensor(blobx, CUDA); + auto* tensorx = blobx->GetMutableTensor(CUDA); tensorx->Resize(shapex); int stripe = 33 * 25; vector tot(33, 0.0); @@ -110,7 +110,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { } } - auto* tensory = BlobGetMutableTensor(bloby, CUDA); + auto* tensory = bloby->GetMutableTensor(CUDA); tensory->Resize(shapey); math::Set( stripe, 0.0, tensory->mutable_data(), &context); @@ -125,7 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); + auto* tensory_host = bloby_host->GetMutableTensor(CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -258,9 +258,9 @@ class GemmBatchedGPUTest Blob* X_blob = ws_.CreateBlob("X"); Blob* W_blob = ws_.CreateBlob("W"); Blob* Y_blob = ws_.CreateBlob("Y"); - X_ = BlobGetMutableTensor(X_blob, CUDA); - W_ = BlobGetMutableTensor(W_blob, CUDA); - Y_ = BlobGetMutableTensor(Y_blob, CUDA); + X_ = X_blob->GetMutableTensor(CUDA); + W_ = W_blob->GetMutableTensor(CUDA); + Y_ = Y_blob->GetMutableTensor(CUDA); X_->Resize(std::vector{3, 5, 10}); W_->Resize(std::vector{3, 6, 10}); Y_->Resize(std::vector{3, 5, 6}); @@ -381,8 +381,8 @@ class ReduceTensorGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = BlobGetMutableTensor(blob_x, CUDA); - Y_ = BlobGetMutableTensor(blob_y, CUDA); + X_ = blob_x->GetMutableTensor(CUDA); + Y_ = blob_y->GetMutableTensor(CUDA); } void SetUpData( @@ -402,7 +402,7 @@ 
class ReduceTensorGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); + auto* Y_host = blob_y_host->GetMutableTensor(CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -664,8 +664,8 @@ class BroadcastGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = BlobGetMutableTensor(blob_x, CUDA); - Y_ = BlobGetMutableTensor(blob_y, CUDA); + X_ = blob_x->GetMutableTensor(CUDA); + Y_ = blob_y->GetMutableTensor(CUDA); } void SetUpData( @@ -681,7 +681,7 @@ class BroadcastGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); + auto* Y_host = blob_y_host->GetMutableTensor(CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -741,9 +741,9 @@ class MomentsGPUTest : public testing::Test { Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_mean = ws_.CreateBlob("mean"); Blob* blob_variance = ws_.CreateBlob("variance"); - X_ = BlobGetMutableTensor(blob_x, CUDA); - mean_ = BlobGetMutableTensor(blob_mean, CUDA); - variance_ = BlobGetMutableTensor(blob_variance, CUDA); + X_ = blob_x->GetMutableTensor(CUDA); + mean_ = blob_mean->GetMutableTensor(CUDA); + variance_ = blob_variance->GetMutableTensor(CUDA); } void SetUpData( @@ -766,10 +766,10 @@ class MomentsGPUTest : public testing::Test { const std::vector& mean_data, const std::vector& variance_data) { Blob* blob_mean_host = ws_.CreateBlob("mean_host"); - auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU); + auto* mean_host = blob_mean_host->GetMutableTensor(CPU); mean_host->CopyFrom(*mean_, cuda_context_.get()); Blob* blob_variance_host = ws_.CreateBlob("variance_host"); - auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU); + auto* variance_host = blob_variance_host->GetMutableTensor(CPU); variance_host->CopyFrom(*variance_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); @@ -868,8 +868,8 @@ class TransposeGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = BlobGetMutableTensor(blob_x, CUDA); - Y_ = BlobGetMutableTensor(blob_y, CUDA); + X_ = blob_x->GetMutableTensor(CUDA); + Y_ = blob_y->GetMutableTensor(CUDA); } void SetUpData( @@ -890,7 +890,7 @@ class TransposeGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); + auto* Y_host = blob_y_host->GetMutableTensor(CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); From 71b99f28bee12fbad440a9cd6697db2b69e2cdc9 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 08:56:47 -0700 Subject: [PATCH 20/51] Give default values to members of TensorImpl. (#12033) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12033 These are reasonable sensible default values. 
One key pick is -1 for numel: this is because in Caffe2, a tensor may be in "un-allocated" with no storage state; this is historically represented in Caffe2 with numel_ == -1 Reviewed By: mingzhe09088 Differential Revision: D10024439 fbshipit-source-id: a167d727a7665daac7e7a1e98c0c89d8f1da6fa6 --- aten/src/ATen/core/TensorImpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 0c257cecdd80fb..cee843458b9e5e 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -174,12 +174,12 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { bool is_variable() const { return is_variable_; }; private: - int64_t storage_offset_; + int64_t storage_offset_ = 0; std::vector sizes_; std::vector strides_; - bool is_contiguous_; - int64_t numel_; + bool is_contiguous_ = true; + int64_t numel_ = -1; int64_t compute_numel() const { int64_t n = 1; From d4ce41c4dea1369d0dfee52de04c975a8c65e38b Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 08:57:42 -0700 Subject: [PATCH 21/51] Rename tensor_impl_ to impl_ in Tensor (#12035) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12035 This brings it in line with Caffe2's naming Reviewed By: mingzhe09088 Differential Revision: D10024485 fbshipit-source-id: a6feef82a56b5eb3043b0821ea802ba746e542a0 --- aten/src/ATen/core/Tensor.h | 62 ++++++++++++++++---------------- aten/src/ATen/templates/Tensor.h | 62 ++++++++++++++++---------------- torch/csrc/autograd/variable.h | 2 +- 3 files changed, 63 insertions(+), 63 deletions(-) diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index fca7fe3189f019..a359dc13b41fea 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -40,8 +40,8 @@ namespace at { struct CAFFE2_API Tensor { Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) - : tensor_impl_(std::move(tensor_impl)) { - if (tensor_impl_.get() == nullptr) { + : impl_(std::move(tensor_impl)) { + if (impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } } @@ -50,25 +50,25 @@ struct CAFFE2_API Tensor { Tensor(Tensor&&) = default; int64_t dim() const { - return tensor_impl_->dim(); + return impl_->dim(); } TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); + return impl_.get(); } TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); + return impl_.release(); } const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; + return impl_; } bool defined() const { - return tensor_impl_; + return impl_; } void reset() { - tensor_impl_.reset(); + impl_.reset(); } // The following overloads are very intruiging. 
Consider the following @@ -102,11 +102,11 @@ struct CAFFE2_API Tensor { // Tensor& operator=(const Tensor&) & = default; // Tensor& operator=(Tensor&&) & = default; Tensor& operator=(const Tensor& x) & { - tensor_impl_ = x.tensor_impl_; + impl_ = x.impl_; return *this; } Tensor& operator=(Tensor&& x) & { - tensor_impl_ = std::move(x.tensor_impl_); + impl_ = std::move(x.impl_); return *this; } @@ -115,37 +115,37 @@ struct CAFFE2_API Tensor { Tensor& operator=(Tensor&&) &&; bool is_same(const Tensor& other) const noexcept { - return tensor_impl_ == other.tensor_impl_; + return impl_ == other.impl_; } size_t use_count() const noexcept { - return tensor_impl_.use_count(); + return impl_.use_count(); } size_t weak_use_count() const noexcept { - return tensor_impl_.weak_use_count(); + return impl_.weak_use_count(); } const char * toString() const; IntList sizes() const { - return tensor_impl_->sizes(); + return impl_->sizes(); } IntList strides() const { - return tensor_impl_->strides(); + return impl_->strides(); } int64_t ndimension() const { return dim(); } Type & type() const { - return tensor_impl_->type(); + return impl_->type(); } TensorTypeId type_id() const { - return tensor_impl_->type_id(); + return impl_->type_id(); } ScalarType scalar_type() const { - return dataTypeToScalarType(tensor_impl_->dtype().id()); + return dataTypeToScalarType(impl_->dtype().id()); } const Storage& storage() const { - return tensor_impl_->storage(); + return impl_->storage(); } Tensor toType(const Type & t, bool non_blocking=false) const; Tensor & copy_(const Tensor & src, bool non_blocking=false); @@ -222,18 +222,18 @@ struct CAFFE2_API Tensor { // ~~~~~ Autograd API ~~~~~ Tensor& set_requires_grad(bool requires_grad) { - tensor_impl_->set_requires_grad(requires_grad); + impl_->set_requires_grad(requires_grad); return *this; } bool requires_grad() const { - return tensor_impl_->requires_grad(); + return impl_->requires_grad(); } Tensor& grad() { - return tensor_impl_->grad(); + return impl_->grad(); } const Tensor& grad() const { - return tensor_impl_->grad(); + return impl_->grad(); } void set_data(Tensor new_data); @@ -645,35 +645,35 @@ struct CAFFE2_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr impl_; }; struct CAFFE2_API WeakTensor { - WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} + WeakTensor(const Tensor& t) : weak_impl_(t.impl_) {} // XXX: this can return undefined tensors // Ideally it would be at::optional, but MSVC is too cool for that Tensor lock() const { - return Tensor(weak_tensor_impl_.lock()); + return Tensor(weak_impl_.lock()); } bool is_same(const WeakTensor& other) const noexcept { - return weak_tensor_impl_ == other.weak_tensor_impl_; + return weak_impl_ == other.weak_impl_; } size_t use_count() const noexcept { - return weak_tensor_impl_.use_count(); + return weak_impl_.use_count(); } size_t weak_use_count() const noexcept { - return weak_tensor_impl_.weak_use_count(); + return weak_impl_.weak_use_count(); } TensorImpl* unsafeGetTensorImpl() const { - return weak_tensor_impl_._unsafe_get_target(); + return weak_impl_._unsafe_get_target(); } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_impl_; }; } // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 2e5fb25f597418..6f03690d9b9997 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -40,8 +40,8 @@ namespace at { struct CAFFE2_API 
Tensor { Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) - : tensor_impl_(std::move(tensor_impl)) { - if (tensor_impl_.get() == nullptr) { + : impl_(std::move(tensor_impl)) { + if (impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } } @@ -50,25 +50,25 @@ struct CAFFE2_API Tensor { Tensor(Tensor&&) = default; int64_t dim() const { - return tensor_impl_->dim(); + return impl_->dim(); } TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); + return impl_.get(); } TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); + return impl_.release(); } const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; + return impl_; } bool defined() const { - return tensor_impl_; + return impl_; } void reset() { - tensor_impl_.reset(); + impl_.reset(); } // The following overloads are very intruiging. Consider the following @@ -102,11 +102,11 @@ struct CAFFE2_API Tensor { // Tensor& operator=(const Tensor&) & = default; // Tensor& operator=(Tensor&&) & = default; Tensor& operator=(const Tensor& x) & { - tensor_impl_ = x.tensor_impl_; + impl_ = x.impl_; return *this; } Tensor& operator=(Tensor&& x) & { - tensor_impl_ = std::move(x.tensor_impl_); + impl_ = std::move(x.impl_); return *this; } @@ -115,37 +115,37 @@ struct CAFFE2_API Tensor { Tensor& operator=(Tensor&&) &&; bool is_same(const Tensor& other) const noexcept { - return tensor_impl_ == other.tensor_impl_; + return impl_ == other.impl_; } size_t use_count() const noexcept { - return tensor_impl_.use_count(); + return impl_.use_count(); } size_t weak_use_count() const noexcept { - return tensor_impl_.weak_use_count(); + return impl_.weak_use_count(); } const char * toString() const; IntList sizes() const { - return tensor_impl_->sizes(); + return impl_->sizes(); } IntList strides() const { - return tensor_impl_->strides(); + return impl_->strides(); } int64_t ndimension() const { return dim(); } Type & type() const { - return tensor_impl_->type(); + return impl_->type(); } TensorTypeId type_id() const { - return tensor_impl_->type_id(); + return impl_->type_id(); } ScalarType scalar_type() const { - return dataTypeToScalarType(tensor_impl_->dtype().id()); + return dataTypeToScalarType(impl_->dtype().id()); } const Storage& storage() const { - return tensor_impl_->storage(); + return impl_->storage(); } Tensor toType(const Type & t, bool non_blocking=false) const; Tensor & copy_(const Tensor & src, bool non_blocking=false); @@ -222,18 +222,18 @@ struct CAFFE2_API Tensor { // ~~~~~ Autograd API ~~~~~ Tensor& set_requires_grad(bool requires_grad) { - tensor_impl_->set_requires_grad(requires_grad); + impl_->set_requires_grad(requires_grad); return *this; } bool requires_grad() const { - return tensor_impl_->requires_grad(); + return impl_->requires_grad(); } Tensor& grad() { - return tensor_impl_->grad(); + return impl_->grad(); } const Tensor& grad() const { - return tensor_impl_->grad(); + return impl_->grad(); } void set_data(Tensor new_data); @@ -259,35 +259,35 @@ struct CAFFE2_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr impl_; }; struct CAFFE2_API WeakTensor { - WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} + WeakTensor(const Tensor& t) : weak_impl_(t.impl_) {} // XXX: this can return undefined tensors // Ideally it would be at::optional, but MSVC is too cool for that Tensor lock() const { - return Tensor(weak_tensor_impl_.lock()); + return Tensor(weak_impl_.lock()); 
} bool is_same(const WeakTensor& other) const noexcept { - return weak_tensor_impl_ == other.weak_tensor_impl_; + return weak_impl_ == other.weak_impl_; } size_t use_count() const noexcept { - return weak_tensor_impl_.use_count(); + return weak_impl_.use_count(); } size_t weak_use_count() const noexcept { - return weak_tensor_impl_.weak_use_count(); + return weak_impl_.weak_use_count(); } TensorImpl* unsafeGetTensorImpl() const { - return weak_tensor_impl_._unsafe_get_target(); + return weak_impl_._unsafe_get_target(); } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_impl_; }; } // namespace at diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index bd2e475645975a..9de77efeb79fae 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -599,6 +599,6 @@ inline Variable::Variable(c10::intrusive_ptr self) inline Variable::Impl* Variable::get() const { AT_CHECK(defined(), "Called Variable::get() on an undefined Variable"); - return static_cast(tensor_impl_.get()); + return static_cast(impl_.get()); } }} // namespace torch::autograd From 0947712e5d140a61ee790e6567dd7c88c28a11d1 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Tue, 25 Sep 2018 09:32:44 -0700 Subject: [PATCH 22/51] Move Factory functions from Type to TypeExtendedInterface. (#12025) Summary: This makes a few changes wrt Type, with the ultimate goal of removing Type from the public Methods/Functions. In particular: 1) Removes factory functions from Type, into TypeExtendedInterface. 2) sparse_coo_tensor is now a first class at:: namespace function, with TensorOptions overloads. 3) We move from Type-based sparse_coo_tensor dispatch to function-based. Note we still require a number of changes to get rid of tType in the public interface, in particular TensorOptions needs to support CUDA vs non-CUDA dispatch. That is coming in a future patch. 
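To make the new shape of the API concrete, here is a minimal sketch of the
call-site migration (illustrative only: the indices/values tensors, shapes, and
the helper function name are made up for this example, not taken from this
patch):

    #include <ATen/ATen.h>

    // Hypothetical example of the old vs. new sparse factory spelling.
    at::Tensor make_sparse_example() {
      at::Tensor indices = at::zeros({1, 1}, at::TensorOptions().dtype(at::kLong));
      at::Tensor values = at::ones({1});
      // Before: factory dispatched through a Type object, e.g.
      //   values.type().toBackend(at::Backend::SparseCPU)
      //       .sparse_coo_tensor(indices, values, {4});
      // After: first-class at:: namespace function taking TensorOptions; the
      // sparse layout is filled in by the implementation (see the
      // LegacyBridge.cpp change below), so plain dense options suffice here.
      return at::sparse_coo_tensor(indices, values, {4}, values.options());
    }
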
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12025 Reviewed By: ezyang Differential Revision: D10017205 Pulled By: gchanan fbshipit-source-id: 00807a37b09ed33f0656aaa165bb925abb026320 --- aten/src/ATen/core/Type.h | 13 ------------- aten/src/ATen/native/LegacyBridge.cpp | 19 +++++++++++++++---- .../ATen/native/miopen/BatchNorm_miopen.cpp | 12 ++++++------ aten/src/ATen/native/miopen/Conv_miopen.cpp | 12 ++++++------ aten/src/ATen/native/native_functions.yaml | 13 ++++++++----- aten/src/ATen/preprocess_declarations.py | 2 -- aten/src/ATen/test/apply_utils_test.cpp | 8 ++++---- aten/src/ATen/test/basic.cpp | 2 +- aten/src/ATen/test/scalar_tensor_test.cpp | 2 +- tools/autograd/templates/VariableType.cpp | 4 ++-- tools/autograd/templates/VariableType.h | 2 +- torch/csrc/jit/export.cpp | 2 +- torch/csrc/torch.cpp | 6 +++--- torch/csrc/utils/tensor_new.cpp | 14 +++++++------- torch/csrc/variable_tensor_functions.h | 12 ++++++------ .../THD/base/data_channels/DataChannelMPI.cpp | 2 +- 16 files changed, 62 insertions(+), 63 deletions(-) diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index ee40e616f00236..a87432124712cc 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -364,8 +364,6 @@ struct CAFFE2_API Type { virtual Tensor & log_normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; virtual Tensor & exponential_(Tensor & self, double lambd, Generator * generator) const = 0; virtual Tensor & geometric_(Tensor & self, double p, Generator * generator) const = 0; - virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride) const = 0; - virtual Tensor tensor(IntList size, IntList stride) const = 0; virtual Tensor abs(const Tensor & self) const = 0; virtual Tensor & abs_(Tensor & self) const = 0; virtual Tensor acos(const Tensor & self) const = 0; @@ -579,17 +577,6 @@ struct CAFFE2_API Type { virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; - virtual Tensor native_tensor() const = 0; - virtual Tensor native_tensor(IntList size) const = 0; - virtual Tensor tensor() const = 0; - virtual Tensor tensor(IntList size) const = 0; - virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; - virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; - virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; - virtual Tensor sparse_coo_tensor(IntList size) const = 0; - virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; - virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; - virtual Tensor _native_sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, IntList size) const = 0; virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; virtual Tensor sparse_mask(const Tensor & self, SparseTensorRef mask) const = 0; diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 0aaf2149b42a05..5fc554410ac9ca 100644 --- 
a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -150,10 +150,6 @@ Tensor tensor(const Type& dtype, ArrayRef size) { } } -Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return at::getType(dtype.options().layout(at::kSparse)).native_sparse_coo_tensor(size); -} - Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values); } @@ -162,6 +158,21 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size, const TensorOptions& options) { + TensorOptions toptions = options; + return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(size); +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, const TensorOptions& options) { + TensorOptions toptions = options; + return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(indices, values); +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size, const TensorOptions& options) { + TensorOptions toptions = options; + return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); +} + Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { return at::getType(values.options().layout(at::kSparse))._native_sparse_coo_tensor_unsafe(indices, values, size); } diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index c9d25780bd65d3..f7d163bee732ee 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -89,7 +89,7 @@ std::tuple miopen_batch_norm( mode = miopenBNSpatial; } - auto output_t = input->type().tensor(input->sizes()); + auto output_t = at::empty(input->sizes(), input->options()); TensorArg output{ output_t, "output", 0 }; auto handle = getMiopenHandle(); @@ -103,8 +103,8 @@ std::tuple miopen_batch_norm( if (training) { int64_t num_features = input_t.size(1); - save_mean = weight_t.type().tensor({ num_features }); - save_var = weight_t.type().tensor({ num_features }); + save_mean = at::empty({ num_features }, weight_t.options()); + save_var = at::empty({ num_features }, weight_t.options()); MIOPEN_CHECK(miopenBatchNormalizationForwardTraining( handle, mode, &one, &zero, idesc.desc(), input->data_ptr(), @@ -177,9 +177,9 @@ std::tuple miopen_batch_norm_backward( mode = miopenBNSpatial; } - auto grad_input_t = input->type().tensor(input->sizes()); - auto grad_weight_t = weight->type().tensor(weight->sizes()); - auto grad_bias_t = weight->type().tensor(weight->sizes()); + auto grad_input_t = at::empty(input->sizes(), input->options()); + auto grad_weight_t = at::empty(weight->sizes(), weight->options()); + auto grad_bias_t = at::empty(weight->sizes(), weight->options()); auto handle = getMiopenHandle(); auto dataType = getMiopenDataType(*input); diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 9aeaad73558617..6515574a299c65 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -616,9 +616,10 @@ Tensor miopen_convolution_forward( checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto output_t = input->type().tensor( + auto output_t = at::empty( conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation, groups)); + padding, 
stride, dilation, groups), + input->options()); // Avoid ambiguity of "output" when this is being used as backwards TensorArg output{ output_t, "result", 0 }; @@ -734,7 +735,7 @@ Tensor miopen_convolution_backward_input( checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto grad_input_t = grad_output->type().tensor(input_size); + auto grad_input_t = at::empty(input_size, grad_output->options()); // Avoid "grad_input" when this is being used as transposed convolution TensorArg grad_input{ grad_input_t, "result", 0 }; @@ -859,7 +860,7 @@ Tensor miopen_convolution_backward_weight( checkAllSameType(c, {grad_output, input}); checkAllSameGPU(c, {grad_output, input}); - auto grad_weight_t = grad_output->type().tensor(weight_size); + auto grad_weight_t = at::empty(weight_size, grad_output->options()); // For uniformity with everything else, although it seems grad_weight // would be unambiguous too. @@ -917,8 +918,7 @@ Tensor miopen_convolution_backward_bias( TensorArg grad_output{ grad_output_t, "grad_output", 1 }; setMIOpenStreamToCurrent(); - auto grad_bias_t = grad_output->type().tensor( - { grad_output->size(output_channels_dim) }); + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); TensorArg grad_bias{ grad_bias_t, "result", 0 }; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 37451065261c81..2cc0995dabadad 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1936,14 +1936,17 @@ SparseCPU: new_with_tensor_and_size_sparse SparseCUDA: new_with_tensor_and_size_sparse -- func: sparse_coo_tensor(Type dtype, IntList size) -> Tensor - variants: [] - - func: sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor - variants: [] - func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor - variants: [] + +# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given +# the default would never make sense. 
+- func: sparse_coo_tensor(IntList size, *, TensorOptions options) -> Tensor + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, *, TensorOptions options) -> Tensor + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size, *, TensorOptions options) -> Tensor - func: _native_sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor variants: [] diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index d27d0da7240fc9..68bb4ecc531ca3 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -220,8 +220,6 @@ def signature(option, i=None, value=None): def is_extended_method(option): if 'method' in option['variants']: return False - elif not option['variants']: - return False else: return True diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 22be6de7acbc02..ab7e3522bbedae 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -37,10 +37,10 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { empty_t.fill_(3); empty_t.exp_(); - auto a0 = type.tensor(); - auto a1 = type.tensor(); - auto a2 = type.tensor(); - auto a3 = type.tensor(); + auto a0 = at::empty({0}, type.options()); + auto a1 = at::empty({0}, type.options()); + auto a2 = at::empty({0}, type.options()); + auto a3 = at::empty({0}, type.options()); auto a4 = CPU(kDouble).tensor(); std::vector tensors({a0, a1, a2, a3, a4}); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index c04518a14fc4d1..361d24b5a6b76f 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -21,7 +21,7 @@ using Catch::Matchers::StartsWith; static void test(Type & type) { CATCH_SECTION( "resize" ) { - auto a = type.tensor(); + auto a = at::empty({0}, type.options()); a.resize_({3,4}); CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 964f6260e7d9ff..a89ca81da017f7 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -234,7 +234,7 @@ void test(Type &T) { [&]() { int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); int64_t dim1 = rhs.dim() == 0 ? 
1 : rhs.size(0); - require_equal_size_dim(result, result.type().tensor({dim0, dim1})); + require_equal_size_dim(result, at::empty({dim0, dim1}, result.options())); }();); } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index d697ec8a774208..24ac92dd63926f 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -177,11 +177,11 @@ bool VariableType::isVariableType(const at::Type& type) { return type.is_variable(); } -at::Type* VariableType::getVariableTypeFromBaseType(const at::Type& baseType) { +at::TypeExtendedInterface* VariableType::getVariableTypeFromBaseType(const at::Type& baseType) { auto id = static_cast(baseType.ID()); if(id >= type_to_variable_type.size()) return nullptr; - return type_to_variable_type[id].get(); + return static_cast(type_to_variable_type[id].get()); } namespace { diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 446fb5b889f47a..045279d4cce64e 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -53,7 +53,7 @@ struct TORCH_API VariableType final : public at::TypeDefault { Storage unsafeStorageFromTH(void * th_pointer, bool retain) const override; at::Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const override; - static at::Type* getVariableTypeFromBaseType(const at::Type& baseType); + static at::TypeExtendedInterface* getVariableTypeFromBaseType(const at::Type& baseType); static bool isVariableType(const at::Type& type); static std::vector allCUDATypes(); static std::vector allCPUTypes(); diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 1984f35fcc8974..437d0f6c779972 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -681,7 +681,7 @@ void ModuleEncoder::EncodeTensor( // NB: This new tensor is created to support cuda tensors. // Storages can be mutated when converting tensors from cuda to cpu, // and we need a cpu tensor to copy data from. 
- t = tensor.type().tensor( + t = at::getType(tensor).tensor( tensor.storage(), /* storageOffset = */ 0, /* size = */ { static_cast(tensor.type().elementSizeInBytes() * tensor.storage().size()) }, diff --git a/torch/csrc/torch.cpp b/torch/csrc/torch.cpp index d3f79cd49dbdc7..656cae7f7e1549 100644 --- a/torch/csrc/torch.cpp +++ b/torch/csrc/torch.cpp @@ -3,15 +3,15 @@ #include namespace torch { -at::Type& getVariableType(at::Backend backend, at::ScalarType type) { +at::TypeExtendedInterface& getVariableType(at::Backend backend, at::ScalarType type) { return *autograd::VariableType::getVariableTypeFromBaseType(at::getNonVariableType(backend, type)); } -at::Type& CPU(at::ScalarType type) { +at::TypeExtendedInterface& CPU(at::ScalarType type) { return torch::getVariableType(at::Backend::CPU, type); } -at::Type& CUDA(at::ScalarType type) { +at::TypeExtendedInterface& CUDA(at::ScalarType type) { return torch::getVariableType(at::Backend::CUDA, type); } diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 2bdc07ad21e8bb..4c6a2855ea26cd 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -280,12 +280,12 @@ Tensor legacy_sparse_tensor_ctor(const Type& type, PyObject* args, PyObject* kwa auto deviceOptional = r.deviceOptional(2); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), type.options()); } else if (r.idx == 3) { auto deviceOptional = r.deviceOptional(3); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2), type.options()); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); auto deviceOptional = r.deviceOptional(1); @@ -324,14 +324,14 @@ Tensor legacy_sparse_tensor_new(const Type& type, PyObject* args, PyObject* kwar auto deviceOptional = r.deviceOptional(2); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), type.options()); } else if (r.idx == 3) { // Note: this signature doesn't have a dtype, even though it has a device; it probably shouldn't // have a device (we should infer it). auto deviceOptional = r.deviceOptional(3); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2), type.options()); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); auto deviceOptional = r.deviceOptional(1); @@ -472,7 +472,7 @@ Tensor sparse_coo_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs const auto& index_type = values.type().toScalarType(kLong); Tensor indices = internal_new_from_data(index_type, r.deviceOptional(3), r.pyobject(0), false, true, false); const auto& sparse_type_to_use = values.type().toBackend(values.type().is_cuda() ? 
Backend::SparseCUDA : Backend::SparseCPU); - return sparse_type_to_use.sparse_coo_tensor(indices, values).set_requires_grad(r.toBool(4)); + return at::sparse_coo_tensor(indices, values, sparse_type_to_use.options()).set_requires_grad(r.toBool(4)); } else if (r.idx == 1) { bool type_inference = r.isNone(3); const auto& sparse_type = typeWithDefault(r, 3, 4, default_sparse_type); @@ -482,11 +482,11 @@ Tensor sparse_coo_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs const auto& index_type = values.type().toScalarType(kLong); Tensor indices = internal_new_from_data(index_type, r.deviceOptional(4), r.pyobject(0), false, true, false); const auto& sparse_type_to_use = values.type().toBackend(values.type().is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU); - return sparse_type_to_use.sparse_coo_tensor(indices, values, r.intlist(2)).set_requires_grad(r.toBool(5)); + return at::sparse_coo_tensor(indices, values, r.intlist(2), sparse_type_to_use.options()).set_requires_grad(r.toBool(5)); } else if (r.idx == 2) { const auto& sparse_type_to_use = typeWithDefault(r, 1, 2, default_sparse_type); at::DeviceGuard device_guard(r.device(2)); - return sparse_type_to_use.sparse_coo_tensor(r.intlist(0)).set_requires_grad(r.toBool(3)); + return at::sparse_coo_tensor(r.intlist(0), sparse_type_to_use.options()).set_requires_grad(r.toBool(3)); } throw std::runtime_error("sparse_coo_tensor(): invalid arguments"); } diff --git a/torch/csrc/variable_tensor_functions.h b/torch/csrc/variable_tensor_functions.h index 692fe60aaeeab1..e18794a970fe98 100644 --- a/torch/csrc/variable_tensor_functions.h +++ b/torch/csrc/variable_tensor_functions.h @@ -13,20 +13,20 @@ namespace torch { // when we create new tensors. We also provide a few accessors like requires_grad // that make it easier to get to varible information when we have a at::Tensor -/// Returns a `Type` object for the given backend (e.g. `at::kCPU`) and +/// Returns a `TypeExtendedInterface` object for the given backend (e.g. `at::kCPU`) and /// `ScalarType` (e.g. `at::kDouble`). /// TODO: Eliminate this function as much as possible -THP_CLASS at::Type& getVariableType(at::Backend backend, at::ScalarType type); +THP_CLASS at::TypeExtendedInterface& getVariableType(at::Backend backend, at::ScalarType type); -/// Returns a `Type` object for the CPU backend and the given `ScalarType` +/// Returns a `TypeExtendedInterface` object for the CPU backend and the given `ScalarType` /// (e.g. `at::kDouble`). Equivalent to `getVariableType(kCPU, type)`. /// TODO: Eliminate this function as much as possible -THP_CLASS at::Type& CPU(at::ScalarType type); +THP_CLASS at::TypeExtendedInterface& CPU(at::ScalarType type); -/// Returns a `Type` object for the CUDA backend and the given `ScalarType` +/// Returns a `TypeExtendedInterface` object for the CUDA backend and the given `ScalarType` /// (e.g. `at::kDouble`). Equivalent to `getVariableType(kCUDA, type)`. /// TODO: Eliminate this function as much as possible -THP_CLASS at::Type& CUDA(at::ScalarType type); +THP_CLASS at::TypeExtendedInterface& CUDA(at::ScalarType type); /// Sets the `requires_grad` property of the given `Tensor`. 
THP_CLASS void set_requires_grad(at::Tensor& tensor, bool requires_grad) noexcept; diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp index b23157581bdfc0..e551da81d13562 100644 --- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp +++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp @@ -153,7 +153,7 @@ at::Tensor DataChannelMPI::_newLikeFlat(std::vector& tensors) const at::DeviceGuard gpu_guard(t.is_cuda() ? t.get_device() : -1); std::vector sizes { static_cast(tensors.size()) }; // sizes = [output.size()] + input.sizes() sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end()); - return t.type().tensor(sizes); + return at::empty(sizes, t.options()); } From fcb3ccf23fefe57525592a60c7091b0baf4b4e31 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 09:37:10 -0700 Subject: [PATCH 23/51] Don't record Git version automatically via cmake (#12046) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12046 This /sounds/ like a good idea in theory, but a feature like this must be implemented very carefully, because if you just plop the Git version in a header (that is included by every file in your project, as macros.h is), then every time you do a 'git pull', you will do a FULL rebuild, because macros.h is going to regenerate to a new version and of course you have to rebuild a source file if a header file changes. I don't have time to implement it correctly, so I'm axing the feature instead. If you want git versions in, e.g., nightly builds, please explicitly specify that when you feed in the version. Reviewed By: pjh5 Differential Revision: D10030556 fbshipit-source-id: 499d001c7b8ccd4ef15ce10dd6591c300c7df27d --- caffe2/core/macros.h.in | 2 -- cmake/Dependencies.cmake | 17 ----------------- cmake/Summary.cmake | 1 - 3 files changed, 20 deletions(-) diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in index a055de0fdc2543..188853296f816c 100644 --- a/caffe2/core/macros.h.in +++ b/caffe2/core/macros.h.in @@ -11,7 +11,6 @@ #define CAFFE2_VERSION_MAJOR @CAFFE2_VERSION_MAJOR@ #define CAFFE2_VERSION_MINOR @CAFFE2_VERSION_MINOR@ #define CAFFE2_VERSION_PATCH @CAFFE2_VERSION_PATCH@ -#define CAFFE2_GIT_VERSION "@CAFFE2_GIT_VERSION@" static_assert( CAFFE2_VERSION_MINOR < 100, @@ -54,7 +53,6 @@ static_assert( // Useful build settings that are recorded in the compiled binary #define CAFFE2_BUILD_STRINGS { \ - {"GIT_VERSION", "${CAFFE2_GIT_VERSION}"}, \ {"CXX_FLAGS", "${CMAKE_CXX_FLAGS}"}, \ {"BUILD_TYPE", "${CMAKE_BUILD_TYPE}"}, \ {"BLAS", "${BLAS}"}, \ diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 4fbf634cf7ac71..4607ef23727565 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -39,23 +39,6 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() endif() -# ---[ git: used to generate git build string. -find_package(Git) -if(GIT_FOUND) - execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --always --dirty - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE - WORKING_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/.." 
- OUTPUT_VARIABLE CAFFE2_GIT_VERSION - RESULT_VARIABLE __git_result) - if(NOT ${__git_result} EQUAL 0) - set(CAFFE2_GIT_VERSION "unknown") - endif() -else() - message( - WARNING - "Cannot find git, so Caffe2 won't have any git build info available") -endif() - # ---[ BLAS if(NOT BUILD_ATEN_MOBILE) set(BLAS "MKL" CACHE STRING "Selected BLAS library") diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 16d18ac7634d0d..3df260f3b49aad 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -5,7 +5,6 @@ function (caffe2_print_configuration_summary) message(STATUS "General:") message(STATUS " CMake version : ${CMAKE_VERSION}") message(STATUS " CMake command : ${CMAKE_COMMAND}") - message(STATUS " Git version : ${CAFFE2_GIT_VERSION}") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") From 3deb4791c3fd6e4ce63b708006e75fa43fe3d971 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 09:42:41 -0700 Subject: [PATCH 24/51] Replace 'struct Tensor' with 'class Tensor'. (#12034) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12034 We need ATen and Caffe2 to line up, and the rule is that if you have any private/protected members, you should declare it as a class. Class we go. (There are some other obvious candidates for this treatment, but I've kept this patch just to Tensor) Reviewed By: gchanan, mingzhe09088 Differential Revision: D10024467 fbshipit-source-id: 17cfe2741ba9c3f56cb87d6f5d1afd3c61a8e4fe --- aten/src/ATen/Context.h | 2 +- aten/src/ATen/core/Scalar.h | 2 +- aten/src/ATen/core/ScalarType.h | 2 +- aten/src/ATen/core/SparseTensorRef.h | 2 +- aten/src/ATen/core/Tensor.h | 5 +++-- aten/src/ATen/core/TensorImpl.h | 2 +- aten/src/ATen/core/Type.h | 2 +- aten/src/ATen/templates/NativeFunctions.h | 2 +- aten/src/ATen/templates/Tensor.h | 5 +++-- aten/src/ATen/templates/Type.h | 2 +- torch/csrc/jit/interpreter.h | 2 +- torch/csrc/tensor/python_tensor.h | 2 +- 12 files changed, 16 insertions(+), 14 deletions(-) diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 6a2f28cf9eb32b..1f546f8574a780 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -22,7 +22,7 @@ namespace at { -struct Tensor; +class Tensor; class CAFFE2_API Context { public: diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index 45b99fdb34cd7e..f1b40d6f8053bb 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -12,7 +12,7 @@ namespace at { -struct Tensor; +class Tensor; class CAFFE2_API Scalar { public: diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index 5a88fadf00de67..fad2f765fe4331 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -188,7 +188,7 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { return _promoteTypesLookup[static_cast(a)][static_cast(b)]; } -struct Tensor; +class Tensor; typedef ArrayRef IntList; typedef ArrayRef TensorList; diff --git a/aten/src/ATen/core/SparseTensorRef.h b/aten/src/ATen/core/SparseTensorRef.h index 9c9fada2dc7117..9a5bbddb783c01 100644 --- a/aten/src/ATen/core/SparseTensorRef.h +++ b/aten/src/ATen/core/SparseTensorRef.h @@ -2,7 +2,7 @@ namespace at { -struct Tensor; +class Tensor; struct SparseTensorRef { explicit SparseTensorRef(const Tensor& t): tref(t) {} const Tensor& tref; diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index 
a359dc13b41fea..fa31741313db39 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -15,7 +15,7 @@ namespace at { struct Generator; struct Type; -struct Tensor; +class Tensor; struct TensorOptions; } // namespace at @@ -37,7 +37,8 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. -struct CAFFE2_API Tensor { +class CAFFE2_API Tensor { +public: Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) : impl_(std::move(tensor_impl)) { diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index cee843458b9e5e..1e7ee932f63ddc 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -16,7 +16,7 @@ namespace at { class Scalar; struct Type; struct Storage; -struct Tensor; +class Tensor; } // namespace at namespace at { diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index a87432124712cc..3a2ccbe1e45edb 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -33,7 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; -struct Tensor; +class Tensor; static inline void noop_deleter(void*) {} diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 0f7b8ba27ef9e1..1ca3e495358cbf 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -16,7 +16,7 @@ namespace at { struct Generator; class Scalar; -struct Tensor; +class Tensor; struct Type; } // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 6f03690d9b9997..1d5ac020f231e1 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -15,7 +15,7 @@ namespace at { struct Generator; struct Type; -struct Tensor; +class Tensor; struct TensorOptions; } // namespace at @@ -37,7 +37,8 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. 
-struct CAFFE2_API Tensor { +class CAFFE2_API Tensor { +public: Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) : impl_(std::move(tensor_impl)) { diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 2db006c82d5834..fbbf88823ea24d 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -33,7 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; -struct Tensor; +class Tensor; static inline void noop_deleter(void*) {} diff --git a/torch/csrc/jit/interpreter.h b/torch/csrc/jit/interpreter.h index 151a980d76a112..d28558d4d15b40 100644 --- a/torch/csrc/jit/interpreter.h +++ b/torch/csrc/jit/interpreter.h @@ -6,7 +6,7 @@ #include "torch/csrc/WindowsTorchApiMacro.h" namespace at { - struct Tensor; + class Tensor; } namespace torch { namespace jit { diff --git a/torch/csrc/tensor/python_tensor.h b/torch/csrc/tensor/python_tensor.h index 64ebbef786052f..a8c282dd1e96a0 100644 --- a/torch/csrc/tensor/python_tensor.h +++ b/torch/csrc/tensor/python_tensor.h @@ -5,7 +5,7 @@ namespace at { struct Type; struct Device; -struct Tensor; +class Tensor; } // namespace at namespace torch { namespace tensors { From d7e11e3aaec8f6dcec261e85a330a437336ab8c9 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 10:12:24 -0700 Subject: [PATCH 25/51] Revert "Move CreateContext to global registry (#11688)" (#12049) Summary: This reverts commit 3ae6ee4ebded136da30aa53fd3873d84acfbc9f0. Pull Request resolved: https://github.com/pytorch/pytorch/pull/12049 Differential Revision: D10030954 Pulled By: ezyang fbshipit-source-id: 6ca9de65b707c5b4c68280fc6f1b8e5ad7251efc --- aten/src/ATen/core/context_base.cpp | 11 ---------- aten/src/ATen/core/context_base.h | 26 ++++++---------------- caffe2/core/blob_serialization.cc | 5 +++-- caffe2/core/context.cc | 4 ---- caffe2/core/context.h | 11 ++++++++-- caffe2/core/context_base.cc | 1 - caffe2/core/context_base.h | 2 -- caffe2/core/context_gpu.cu | 5 ----- caffe2/core/context_gpu.h | 15 +++++++++++-- caffe2/core/hip/context_hip.cc | 18 ++++++---------- caffe2/core/hip/context_hip.h | 15 +++++++++++-- caffe2/core/registry.h | 2 +- caffe2/core/tensor.h | 8 +++---- caffe2/core/tensor_impl.cc | 2 +- caffe2/core/tensor_impl.h | 28 +++++++++++++----------- caffe2/ideep/utils/ideep_context.h | 11 ++++++++-- caffe2/ideep/utils/ideep_register.cc | 3 --- caffe2/mkl/utils/mkl_context.cc | 4 ---- caffe2/mkl/utils/mkl_context.h | 11 ++++++++-- caffe2/proto/caffe2_pb.h | 32 +--------------------------- caffe2/python/pybind_state.h | 2 +- 21 files changed, 94 insertions(+), 122 deletions(-) diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp index 5fa747180d1214..e34c6880c0210a 100644 --- a/aten/src/ATen/core/context_base.cpp +++ b/aten/src/ATen/core/context_base.cpp @@ -1,16 +1,5 @@ #include -namespace at { - -AT_DEFINE_TYPED_REGISTRY( - ContextRegistry, - DeviceType, - BaseContext, - std::unique_ptr, - at::Device); - -} // namespace at - namespace caffe2 { // TODO: rename context.h -> context_cpu.h & context_base.h -> context.h diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index be9c36bfd60796..45b38387b46ca6 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -6,12 +6,11 @@ #include #include -#include -#include +#include #include -#include #include #include +#include namespace caffe2 { class Event; @@ -32,6 +31,11 @@ class AT_CORE_API BaseStaticContext { virtual std::pair New(size_t nbytes) 
const = 0; + virtual std::unique_ptr CreateContext() = 0; + + virtual std::unique_ptr CreateContext( + const caffe2::DeviceOption&) = 0; + virtual DeviceType GetDeviceType() = 0; /* @@ -180,22 +184,6 @@ class AT_CORE_API BaseContext { } }; -// Context constructor registry -AT_DECLARE_TYPED_REGISTRY( - ContextRegistry, - at::DeviceType, - BaseContext, - std::unique_ptr, - at::Device); - -#define REGISTER_CONTEXT(type, ...) \ - AT_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__) - -inline std::unique_ptr CreateContext( - const at::Device& device) { - return ContextRegistry()->Create(device.type(), device); -} - } // namespace at namespace caffe2 { diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 38125b242def2f..7ff5a2b25eacc1 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -196,7 +196,7 @@ void TensorSerializer::Serialize( const TensorProto::DataType data_type = TypeMetaToDataType(input.meta()); proto.set_data_type(data_type); StoreDeviceDetail(input, &proto); - auto uniq_ptr = CreateContext(input.GetDevice()); + auto uniq_ptr = input.GetStaticContext()->CreateContext(); // A lot of copypaste is error prone. Should we create a macro for this? switch (data_type) { case TensorProto_DataType_FLOAT: @@ -370,7 +370,8 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) { // We create a local context for deserializing. Since Caffe2 contexts are // usually lightweight, this should not involve too much overhead. - auto uniq_ptr = CreateContext(OptionToDevice(proto.device_detail())); + auto uniq_ptr = + tensor->GetStaticContext()->CreateContext(proto.device_detail()); auto context = uniq_ptr.get(); context->SwitchToDevice(0); vector dims; diff --git a/caffe2/core/context.cc b/caffe2/core/context.cc index 94047eb71ee0b6..30819afdc4ce3f 100644 --- a/caffe2/core/context.cc +++ b/caffe2/core/context.cc @@ -5,10 +5,6 @@ #include #endif -namespace at { - -REGISTER_CONTEXT(DeviceType::CPU, caffe2::CPUContext); -} // namespace at namespace caffe2 { uint32_t RandomNumberSeed() { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index af66396af72c44..aff66534d22198 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -50,8 +50,6 @@ class CAFFE2_API CPUContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_CPU); } - explicit CPUContext(const at::Device& device) - : CPUContext(DeviceToOption(device)) {} ~CPUContext() noexcept override {} @@ -194,6 +192,15 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { return data_and_deleter; } + std::unique_ptr CreateContext() override { + return caffe2::make_unique(); + } + + std::unique_ptr CreateContext( + const DeviceOption& option) override { + return caffe2::make_unique(option); + } + DeviceType GetDeviceType() override { return CPU; } diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc index 99996d9e165b9b..b61b73cbad1cb5 100644 --- a/caffe2/core/context_base.cc +++ b/caffe2/core/context_base.cc @@ -1,5 +1,4 @@ #include "context_base.h" namespace caffe2 { - } // namespace caffe2 diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h index 50b9252a3cf3dd..3a6dfad5b95cc3 100644 --- a/caffe2/core/context_base.h +++ b/caffe2/core/context_base.h @@ -5,5 +5,3 @@ #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include 
"caffe2/proto/caffe2_pb.h" - -namespace caffe2 {} // namespace caffe2 diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 0d9e2686212a1e..1eaa579ee0cdbe 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -57,11 +57,6 @@ CAFFE2_DEFINE_int( 128, "The threshold in MB on how frequently to report memory changes"); -namespace at { - -REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext); -} // namespace at - namespace caffe2 { ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index ce73f5f942828b..5fcdb98b100794 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -142,8 +142,6 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { // The default cuda context constructor. explicit CUDAContext(const int gpu_id = -1); explicit CUDAContext(const DeviceOption& option); - explicit CUDAContext(const at::Device& device) - : CUDAContext(DeviceToOption(device)) {} ~CUDAContext() override { if (curand_generator_) { @@ -387,6 +385,19 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; + std::unique_ptr CreateContext() override { + return caffe2::make_unique(); + } + + std::unique_ptr CreateContext( + const DeviceOption& option) override { + return caffe2::make_unique(option); + } + + std::unique_ptr CreateContext(int gpu_id = -1) { + return caffe2::make_unique(gpu_id); + } + DeviceType GetDeviceType() override { return CUDA; } diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc index 3eadaf0e71b118..0fabb20a642c94 100644 --- a/caffe2/core/hip/context_hip.cc +++ b/caffe2/core/hip/context_hip.cc @@ -50,11 +50,6 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb, 128, "The threshold in MB on how frequently to report memory changes"); -namespace at { - -REGISTER_CONTEXT(DeviceType::HIP, caffe2::HIPContext); -} // namespace at - namespace caffe2 { thread_local ThreadLocalHIPObjects HIPContext::hip_objects_; @@ -413,12 +408,13 @@ void HIPStaticContext::Delete(void* ptr) { g_hip_device_affiliation.erase(it); break; } - case HipMemoryPoolType::THC: { - HIP_ENFORCE(g_thc_allocator->Free(ptr)); - if (FLAGS_caffe2_gpu_memory_tracking) { - g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); - } - break; + case HipMemoryPoolType::THC: + { + HIP_ENFORCE(g_thc_allocator->Free(ptr)); + if (FLAGS_caffe2_gpu_memory_tracking) { + g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); + } + break; } } } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index fb04336354e704..5a7613cf934fd0 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -127,8 +127,6 @@ class HIPContext final : public BaseContext { // The default HIP context constructor. 
explicit HIPContext(const int gpu_id = -1); explicit HIPContext(const DeviceOption& option); - explicit HIPContext(const at::Device& device) - : HIPContext(DeviceToOption(device)) {} ~HIPContext() override { if (hiprand_generator_) { @@ -376,6 +374,19 @@ class HIPStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; + std::unique_ptr CreateContext() override { + return caffe2::make_unique(); + } + + std::unique_ptr CreateContext( + const DeviceOption& option) override { + return caffe2::make_unique(option); + } + + std::unique_ptr CreateContext(int gpu_id = -1) { + return caffe2::make_unique(gpu_id); + } + DeviceType GetDeviceType() override { return HIP; } diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index 634323af1eb4d0..7db975077ea8b9 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -172,7 +172,7 @@ class Registerer { key, \ RegistryName(), \ Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - at::demangle_type<__VA_ARGS__>()); \ + at::demangle_type<__VA_ARGS__>()); \ } // CAFFE_DECLARE_REGISTRY and CAFFE_DEFINE_REGISTRY are hard-wired to use string diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 7e563e37d3418e..1e4cac2788b560 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -130,12 +130,12 @@ class CAFFE2_API Tensor final { return impl_.get()->GetStaticContext(); } - DeviceType GetDeviceType() const { - return impl_.get()->GetDeviceType(); + std::unique_ptr CreateContext() const { + return impl_.get()->CreateContext(); } - at::Device GetDevice() const { - return impl_.get()->GetDevice(); + DeviceType GetDeviceType() const { + return impl_.get()->GetDeviceType(); } void CopyFrom(const Tensor& src, BaseContext* context = nullptr) const { diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc index 74aa5385ed0199..cff98c6101ea5d 100644 --- a/caffe2/core/tensor_impl.cc +++ b/caffe2/core/tensor_impl.cc @@ -1,5 +1,5 @@ #include "caffe2/core/tensor_impl.h" -#include "caffe2/core/context_base.h" + #include "caffe2/core/flags.h" CAFFE2_DEFINE_bool( diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index eb59291689cbc5..53c812f55e297b 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "caffe2/core/allocator.h" #include "caffe2/core/common.h" @@ -111,12 +112,19 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return get_static_context(device_type); } - at::DeviceType GetDeviceType() const { - return storage_.device_type(); + /* @brief + * Create a context that has the same device_type + * as the tensor. 
+ * Note that this doesn't support passing in argument + * TODO(jerryzh): move this to a global registry + * that can create context for us + */ + std::unique_ptr CreateContext() const { + return GetStaticContext()->CreateContext(); } - at::Device GetDevice() const { - return storage_.device(); + at::DeviceType GetDeviceType() const { + return storage_.device_type(); } /** @@ -159,12 +167,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // knows how to copy between CPU and that context if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { if (!context) { - CreateContext(src.GetDevice()) - ->CopyBytesToDevice( - nbytes(), - src.raw_data(), - raw_mutable_data(), - GetDeviceType()); + src.CreateContext()->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); } else { CAFFE_ENFORCE( context->device_type() == src.GetDeviceType(), @@ -176,8 +180,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // In case source context is CPU, and target context is non-CPU // We'll have to create a Context from target and perform the // copy using that context - CreateContext(GetDevice()) - ->CopyBytesFromCPU(nbytes(), src.raw_data(), raw_mutable_data()); + CreateContext()->CopyBytesFromCPU( + nbytes(), src.raw_data(), raw_mutable_data()); } } } diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index 087078c507d164..f50a4f34c66789 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -20,8 +20,6 @@ class IDEEPContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_IDEEP); } - explicit IDEEPContext(const at::Device& device) - : IDEEPContext(DeviceToOption(device)) {} ~IDEEPContext() noexcept override {} @@ -180,6 +178,15 @@ class IDEEPStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } + std::unique_ptr CreateContext() override { + return caffe2::make_unique(); + } + + std::unique_ptr CreateContext( + const DeviceOption& option) override { + return caffe2::make_unique(option); + } + DeviceType GetDeviceType() override { return IDEEP; } diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc index 53b8bcbf072c5a..020e22fa6143ed 100644 --- a/caffe2/ideep/utils/ideep_register.cc +++ b/caffe2/ideep/utils/ideep_register.cc @@ -4,9 +4,6 @@ #include #include "ideep_context.h" -namespace at { -REGISTER_CONTEXT(DeviceType::IDEEP, caffe2::IDEEPContext); -} // namespace at namespace caffe2 { CAFFE_KNOWN_TYPE(ideep::tensor); diff --git a/caffe2/mkl/utils/mkl_context.cc b/caffe2/mkl/utils/mkl_context.cc index 8c66bc111282ac..6e9075df43475f 100644 --- a/caffe2/mkl/utils/mkl_context.cc +++ b/caffe2/mkl/utils/mkl_context.cc @@ -3,10 +3,6 @@ #include "mkl_context.h" #include "caffe2/core/event_cpu.h" -namespace at { - -REGISTER_CONTEXT(DeviceType::MKLDNN, caffe2::MKLContext); -} // namespace at namespace caffe2 { // MKL events are the same as CPU events diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 8364026d91c651..0a7b5808a446be 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -29,8 +29,6 @@ class MKLContext : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_MKLDNN); } - explicit MKLContext(const at::Device& device) - : MKLContext(DeviceToOption(device)) {} ~MKLContext() override {} @@ -157,6 +155,15 @@ class 
MKLStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } + std::unique_ptr CreateContext() override { + return caffe2::make_unique(); + } + + std::unique_ptr CreateContext( + const DeviceOption& option) override { + return caffe2::make_unique(option); + } + DeviceType GetDeviceType() override { return MKLDNN; } diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h index a0d455d4519d74..0a08c8db241e98 100644 --- a/caffe2/proto/caffe2_pb.h +++ b/caffe2/proto/caffe2_pb.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -47,10 +47,6 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) { } } -inline CAFFE2_API DeviceType ProtoToType(int p) { - return ProtoToType(static_cast(p)); -} - inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { switch (t) { case DeviceType::CPU: @@ -81,30 +77,4 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { } } -inline CAFFE2_API caffe2::DeviceOption DeviceToOption( - const at::Device& device) { - caffe2::DeviceOption option; - auto type = device.type(); - option.set_device_type(TypeToProto(type)); - // sets the gpu_id to -1 means we'll use the current gpu id when the function - // is being called, see context_gpu.cu for more info. - if (type == at::DeviceType::CUDA) { - option.set_cuda_gpu_id(device.index()); - } else if (type == at::DeviceType::HIP) { - option.set_hip_gpu_id(device.index()); - } - return option; -} - -inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) { - at::Device device(ProtoToType(option.device_type())); - auto type = device.type(); - if (type == at::DeviceType::CUDA) { - device.set_index(option.cuda_gpu_id()); - } else if (type == at::DeviceType::HIP) { - device.set_index(option.hip_gpu_id()); - } - return device; -} - } // namespace caffe2 diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 97ec6628fe3f27..59f39dd313032c 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -148,7 +148,7 @@ class TensorFetcher : public BlobFetcherBase { } if (result.copied) { - auto context = CreateContext(tensor.GetDeviceType()); + auto context = tensor.GetStaticContext()->CreateContext(); context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr); context->FinishDeviceComputation(); } From 7122f8b3bb7f7287cf0410a5ced2c5b120d15d30 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 25 Sep 2018 10:19:39 -0700 Subject: [PATCH 26/51] Disable more flaky tests on CircleCI (#11399) Summary: Fixes https://github.com/pytorch/pytorch/issues/11362. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11399 Differential Revision: D9736673 Pulled By: yf225 fbshipit-source-id: cad8c0e86a70a01b047e648975ca5b9926e4acb3 --- caffe2/python/data_parallel_model_test.py | 1 + caffe2/python/onnx/tests/onnx_backend_test.py | 4 ++++ caffe2/python/operator_test/cross_entropy_ops_test.py | 3 +++ caffe2/python/operator_test/im2col_col2im_test.py | 4 ++++ 4 files changed, 12 insertions(+) diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index ebf3c3b8cd44a5..1b9b4929bb0f9b 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -831,6 +831,7 @@ def param_update_fun(model): return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix)) + @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") def test_equiv_recurrent(self): ''' Test that the model produces exactly same results given diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index 8ffaef3004d9ab..ad229a97f807d9 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -59,6 +59,10 @@ if 'JENKINS_URL' in os.environ: backend_test.exclude(r'(test_vgg19|test_vgg)') +# FIXME: flaky test in CircleCI +if "IN_CIRCLECI" in os.environ: + backend_test.exclude(r'(test_dynamic_slice_cpu)') + # import all test cases at global scope to make them visible to python.unittest globals().update(backend_test .enable_report() diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index 5ee60d877c33bb..f97b0c5809d5fd 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -8,6 +8,8 @@ import hypothesis.strategies as st import numpy as np +import unittest +import os def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) @@ -248,6 +250,7 @@ def weighted_sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs): output_to_grad='xentropy', grad_reference=weighted_sigmoid_xentr_logit_grad_ref) + @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(n=st.integers(2, 10), b=st.integers(1, 5), **hu.gcs_cpu_only) diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 6db6cae47ad1c2..46b16f4356ff5f 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -10,6 +10,9 @@ import hypothesis.strategies as st import numpy as np +import unittest +import os + class TestReduceFrontSum(hu.HypothesisTestCase): @given(batch_size=st.integers(1, 3), @@ -111,6 +114,7 @@ def test_im2col_layout(self, batch_size, stride, pad, kernel, dilation, atol=1e-4, rtol=1e-4) + @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(batch_size=st.integers(1, 3), stride=st.integers(1, 3), pad=st.integers(0, 3), From 364ae10bb8813266aea1a708e89f9deea0c4a243 Mon Sep 17 00:00:00 2001 From: Duc Ngo Date: Tue, 25 Sep 2018 10:48:02 -0700 Subject: [PATCH 27/51] nomnigraph - easy - add some python test helper methods (#12020) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12020 - make it less verbose to create random blobs in python unit test by adding some test helper methods - move str_compare test helper method to test_util.py Reviewed By: ZolotukhinM 
Differential Revision: D10003637 fbshipit-source-id: cb79d2ad508341f750a1bb8f564e87d055c65652 --- caffe2/python/test_util.py | 22 +++++ caffe2/python/transformations_test.py | 114 +++++++++++--------------- 2 files changed, 71 insertions(+), 65 deletions(-) diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index 769679e46f2b7c..dc1f7370132230 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -16,6 +16,28 @@ def rand_array(*dims): return np.array(np.random.rand(*dims) - 0.5).astype(np.float32) +def randBlob(name, type, *dims, **kwargs): + offset = kwargs['offset'] if 'offset' in kwargs else 0.0 + workspace.FeedBlob(name, np.random.rand(*dims).astype(type) + offset) + + +def randBlobFloat32(name, *dims, **kwargs): + randBlob(name, np.float32, *dims, **kwargs) + + +def randBlobsFloat32(names, *dims, **kwargs): + for name in names: + randBlobFloat32(name, *dims, **kwargs) + + +def str_compare(a, b, encoding="utf8"): + if isinstance(a, bytes): + a = a.decode(encoding) + if isinstance(b, bytes): + b = b.decode(encoding) + return a == b + + class TestCase(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 383b8410ea6ae2..26f5450605a1c1 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -23,39 +23,32 @@ import numpy as np from caffe2.python.transformations import Transformer -from caffe2.python import core, workspace, test_util +from caffe2.python import core, workspace +from caffe2.python import test_util as tu transformer = Transformer() -def str_compare(a, b, encoding="utf8"): - if isinstance(a, bytes): - a = a.decode(encoding) - if isinstance(b, bytes): - b = b.decode(encoding) - return a == b - - -class TestTransformations(test_util.TestCase): +class TestTransformations(tu.TestCase): def test_transformer_AddNNPACK(self): net = core.Net("net") net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) transformer.AddNNPACK(net) - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") def test_transformer_FuseNNPACKConvRelu(self): net = core.Net("net") net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 1 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg @@ -65,12 +58,12 @@ def test_noFuseNNPACKConvRelu(self): net.Relu(["Y"], ["Y2"]) net.Relu(["Y"], ["Y3"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 3 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation") and str_compare(arg.s, "Relu"): + if tu.str_compare(arg.name, "activation") and tu.str_compare(arg.s, "Relu"): has_activation_arg = True assert not has_activation_arg @@ -79,13 +72,13 @@ def 
test_transformer_FuseNNPACKConvReluNoInplace(self): net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["X"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 1 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -95,13 +88,13 @@ def test_transformer_FuseNNPACKConvReluInplaceRelu(self): net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 1 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -112,13 +105,13 @@ def test_transformer_FuseNNPACKConvReluPingPongNaming(self): net.Relu(["Y"], ["X"]) net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 2 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -131,13 +124,13 @@ def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self): net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 2 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -150,13 +143,13 @@ def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self): net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y2"], ["Y2"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 2 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert 
str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -168,8 +161,8 @@ def test_transformer_SinkMaxPool(self): net.MaxPool(["Y"], ["Y1"], kernel=3) net.Relu(["Y1"], ["Y1"]) transformer.SinkMaxPool(net) - assert str_compare(net.Proto().op[1].type, "Relu") - assert str_compare(net.Proto().op[2].type, "MaxPool") + assert tu.str_compare(net.Proto().op[1].type, "Relu") + assert tu.str_compare(net.Proto().op[2].type, "MaxPool") @given( size=st.integers(7, 10), @@ -196,18 +189,16 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon np.random.seed(seed) if order == "NCHW": - workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, h, w) + tu.randBlobFloat32("w", c, c, k, k) else: - workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) - workspace.FeedBlob("b", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, h, w, c) + tu.randBlobFloat32("w", c, k, k, c) + tu.randBlobsFloat32(["b", "scale", "bias", "mean"], c) + # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() workspace.FeedBlob("Y2", np.zeros((1, 1))) @@ -250,17 +241,15 @@ def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, orde np.random.seed(seed) if order == "NCHW": - workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, h, w) + tu.randBlobFloat32("w", c, c, k, k) else: - workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, h, w, c) + tu.randBlobFloat32("w", c, k, k, c) + tu.randBlobsFloat32(["scale", "bias", "mean"], c) # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() workspace.FeedBlob("Y2", np.zeros((1, 1))) @@ -303,17 +292,15 @@ def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channe np.random.seed(seed) if order == "NCHW": - workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, h, w) + tu.randBlobFloat32("w", c, c, k, k) else: - workspace.FeedBlob("X", 
np.random.rand(1, h, w, c).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("_bias0", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, h, w, c) + tu.randBlobFloat32("w", c, k, k, c) + tu.randBlobsFloat32(["scale", "_bias0", "mean"], c) # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() workspace.FeedBlob("Y2", np.zeros((1, 1))) @@ -366,15 +353,12 @@ def test_transformer_FuseConv3DBN( ) np.random.seed(seed) - workspace.FeedBlob("X", np.random.rand(1, c, t, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, kt, kh, kw).astype(np.float32)) - workspace.FeedBlob("b", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, t, h, w) + tu.randBlobFloat32("w", c, c, kt, kh, kw) + tu.randBlobsFloat32(["b", "scale", "bias", "mean"], c) # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() workspace.FeedBlob("Y2", np.zeros((1, 1))) From 94c513cc7f27d98ec6b5dead0af7fabcf5c5cd13 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Tue, 25 Sep 2018 11:10:34 -0700 Subject: [PATCH 28/51] Improve pybind11 message (#11640) Summary: Improving the message based on https://github.com/pytorch/pytorch/issues/11570 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11640 Differential Revision: D10033383 Pulled By: orionr fbshipit-source-id: 0cdcdbe0582d896283a12970aebe771efa390dd2 --- cmake/Dependencies.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 4607ef23727565..82aff7b8cc87d5 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -402,13 +402,15 @@ find_package(pybind11 CONFIG) if((DEFINED pybind11_DIR) AND pybind11_DIR) get_target_property(pybind11_INCLUDE_DIRS pybind11::pybind11 INTERFACE_INCLUDE_DIRECTORIES) else() - message("pybind11 config not found. Fallback to legacy find.") find_package(pybind11) endif() if(pybind11_FOUND) + message(STATUS "System pybind11 found") + message(STATUS "pybind11l include dirs: " ${pybind11_INCLUDE_DIRS}) include_directories(SYSTEM ${pybind11_INCLUDE_DIRS}) else() + message(STATUS "Using third_party/pybind11.") include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/pybind11/include) endif() From 8f0db9bbbb84632b1bf6c89e0949964f36e4588b Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 25 Sep 2018 11:26:48 -0700 Subject: [PATCH 29/51] Removing some dependency edges from Blob to other caffe2 (#12043) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12043 Re-trying D9979976, this time with all call sites fixed. 
D9979976 got reverted because there was a call site that wasn't covered by sandcastle it seems. I fixed it and used 'grep' to ensure there aren't any more call sites in fbsource. Reviewed By: ezyang Differential Revision: D10026392 fbshipit-source-id: cd341514a8e53a40147ea0ee3e52f63bb6444157 --- binaries/benchmark_helper.cc | 6 +- binaries/speed_benchmark.cc | 2 +- caffe2/contrib/gloo/common.cc | 2 +- .../contrib/nervana/nervana_fc_op_gpu_test.cc | 2 +- .../contrib/tensorrt/tensorrt_tranformer.cc | 4 +- caffe2/core/blob.h | 54 +++--- caffe2/core/blob_gpu_test.cc | 8 +- caffe2/core/blob_serialization.cc | 3 +- caffe2/core/blob_test.cc | 30 ++-- caffe2/core/operator.h | 6 +- caffe2/core/plan_executor.cc | 3 +- caffe2/core/workspace.h | 2 +- caffe2/ideep/operators/concat_split_op.cc | 5 +- .../ideep/operators/operator_fallback_ideep.h | 6 +- caffe2/ideep/operators/utility_ops.cc | 2 +- caffe2/mkl/operators/operator_fallback_mkl.h | 6 +- .../contrib/arm-compute/operators/copy_op.cc | 4 +- .../arm-compute/test/gl_operator_test.h | 2 +- caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm | 4 +- .../mobile/contrib/ios/mpscnn/mpscnn_test.mm | 169 +++++++++--------- caffe2/mobile/contrib/ios/pool_test.cc | 2 +- caffe2/mobile/contrib/ios/resize_test.cc | 2 +- caffe2/mobile/contrib/nnapi/nnapi.cc | 2 +- .../mobile/contrib/nnapi/nnapi_benchmark.cc | 24 +-- caffe2/mobile/contrib/nnapi/nnapi_test.cc | 28 +-- .../mobile/contrib/opengl/test/opengl_test.cc | 94 +++++----- .../mobile/contrib/snpe/snpe_op_benchmark.cc | 8 +- caffe2/mobile/contrib/ulp2/ulp_test.cc | 6 +- caffe2/operators/batch_matmul_op_gpu_test.cc | 2 +- caffe2/operators/batch_matmul_op_test.cc | 2 +- caffe2/operators/boolean_unmask_ops_test.cc | 2 +- caffe2/operators/conv_op_shared.cc | 4 +- caffe2/operators/conv_op_shared_gpu.cc | 4 +- .../conv_transpose_op_mobile_test.cc | 4 +- caffe2/operators/dataset_ops.cc | 2 +- caffe2/operators/dropout_op_cudnn.cc | 2 +- caffe2/operators/elementwise_op_test.h | 2 +- .../operators/generate_proposals_op_test.cc | 6 +- caffe2/operators/index_ops.cc | 2 +- caffe2/operators/onnx_while_op.h | 12 +- caffe2/operators/onnxifi_op.cc | 2 +- caffe2/operators/operator_fallback_gpu.h | 6 +- .../operators/operator_fallback_gpu_test.cc | 4 +- caffe2/operators/reshape_op_gpu_test.cc | 2 +- .../rnn/recurrent_network_blob_fetcher_op.h | 5 +- .../rnn/recurrent_network_executor.h | 4 +- caffe2/operators/rnn/recurrent_network_op.h | 22 +-- caffe2/operators/roi_align_op_gpu_test.cc | 6 +- caffe2/operators/string_ops_test.cc | 14 +- caffe2/operators/stylizer_ops.cc | 4 +- caffe2/operators/tensor_protos_db_input.h | 6 +- caffe2/operators/tt_linear_op.h | 2 +- caffe2/operators/utility_ops_gpu_test.cc | 2 +- caffe2/operators/utility_ops_test.cc | 2 +- caffe2/opt/fusion.cc | 10 +- caffe2/opt/onnxifi_transformer.cc | 2 +- caffe2/predictor/predictor.cc | 6 +- caffe2/predictor/predictor_test.cc | 6 +- caffe2/python/pybind_state.cc | 2 +- caffe2/python/pybind_state.h | 15 +- caffe2/python/pybind_state_ideep.cc | 4 +- .../depthwise/depthwise3x3_conv_op_test.cc | 2 +- caffe2/share/contrib/nnpack/conv_op.cc | 11 +- caffe2/share/contrib/nnpack/nnpack_test.cc | 2 +- caffe2/utils/hip/math_blas_hip_test.cc | 32 ++-- caffe2/utils/math_gpu_test.cc | 48 ++--- 66 files changed, 380 insertions(+), 371 deletions(-) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 001c8e965f6a6e..f481a6292c7f56 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -163,7 +163,7 @@ void loadInput( CAFFE_THROW("Not 
support GPU on mobile."); #endif } else { - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { @@ -200,7 +200,7 @@ void fillInputBlob( int protos_size = tensor_kv.second.protos_size(); caffe2::TensorProto* tensor_proto = tensor_kv.second.mutable_protos(iteration % protos_size); - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); if (tensor_proto->data_type() == caffe2::TensorProto::STRING) { int total_size = tensor_proto->string_data_size(); for (size_t i = 0; i < total_size; i++) { @@ -298,7 +298,7 @@ void writeOutput( #endif } else { writeTextOutput( - workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU), + BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), output_prefix, name); } diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc index 5914e3f58b44b2..fd502cf3c078ab 100644 --- a/binaries/speed_benchmark.cc +++ b/binaries/speed_benchmark.cc @@ -137,7 +137,7 @@ int main(int argc, char** argv) { if (blob == nullptr) { blob = workspace->CreateBlob(input_names[i]); } - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { diff --git a/caffe2/contrib/gloo/common.cc b/caffe2/contrib/gloo/common.cc index 21ce0343d81819..d4929938f19174 100644 --- a/caffe2/contrib/gloo/common.cc +++ b/caffe2/contrib/gloo/common.cc @@ -12,7 +12,7 @@ namespace caffe2 { namespace gloo { void signalFailure(Blob* status_blob, std::exception& /* unused */) { - auto* res = status_blob->GetMutableTensor(CPU); + auto* res = BlobGetMutableTensor(status_blob, CPU); res->Resize(1); res->template mutable_data()[0] = 1; } diff --git a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc index 972d9231dcf9c6..9eee8973142ed7 100644 --- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc +++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc @@ -22,7 +22,7 @@ static void AddConstInput(const std::vector& shape, const float value, option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 3612d8b46f1f8d..2dd17e00169902 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (blob->IsTensorType(CPU)) { + if (BlobIsTensorType(*blob, CPU)) { const auto& cpu_tensor = blob->template Get(); CPUTensorToTensorProto(cpu_tensor, t); - } else if (blob->IsTensorType(CUDA)) { + } else if (BlobIsTensorType(*blob, CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index 870fc88322b158..80470cea443331 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -6,16 +6,16 @@ #include #include #include - -#include "caffe2/core/blob_serializer_base.h" 
#include "caffe2/core/common.h" + +#include #include "caffe2/core/logging.h" #include "caffe2/core/tensor.h" -#include "caffe2/core/typeid.h" -#include "caffe2/proto/caffe2_pb.h" namespace caffe2 { +class Tensor; + /** * @brief Blob is a general container that hosts a typed pointer. * @@ -50,15 +50,6 @@ class CAFFE2_API Blob final { return meta_.Match(); } - bool IsTensorType(DeviceType device_type) const { - bool is_match = meta_.Match(); - auto* tensor = static_cast(pointer_); - if (is_match && tensor && tensor->GetDeviceType() == device_type) { - return true; - } - return false; - } - /** * Returns the meta info of the blob. */ @@ -109,9 +100,6 @@ class CAFFE2_API Blob final { std::is_default_constructible::value, "GetMutable can't be called with non-default-constructible types. " "Try using specialized methods"); - static_assert( - !std::is_same::value, - "Use GetMutableTensor(DeviceType) instead"); if (IsType()) { return static_cast(pointer_); } else { @@ -129,16 +117,6 @@ class CAFFE2_API Blob final { } } - inline Tensor* GetMutableTensor(DeviceType device_type) { - if (IsTensorType(device_type)) { - return static_cast(pointer_); - } else { - VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() - << " DeviceType:" << device_type; - return Reset(new Tensor(device_type)); - } - } - /** * Sets the underlying object to the allocated one. The Blob then takes over * the ownership of the passed in pointer. If there is already an object in @@ -248,5 +226,29 @@ inline void swap(Blob& lhs, Blob& rhs) { lhs.swap(rhs); } +inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) { + bool is_match = blob.meta().Match(); + if (!is_match) { + return false; + } + const Tensor* tensor = &blob.Get(); + return tensor && tensor->GetDeviceType() == device_type; +} + +inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) { + if (blob->IsType()) { + Tensor* tensor = blob->GetMutable(); + if (tensor->GetDeviceType() == device_type) { + return tensor; + } + } + + // if we're here, then either Blob didn't hold a Tensor + // or that Tensor had the wrong DeviceType. + VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() + << " DeviceType:" << device_type; + return blob->Reset(new Tensor(device_type)); +} + } // namespace caffe2 #endif // CAFFE2_CORE_BLOB_H_ diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index e8fdf47f69ddb0..55eafdede7269a 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -132,7 +132,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { for (int i = 0; i < 6; ++i) { \ cpu_tensor.mutable_data()[i] = static_cast(i); \ } \ - blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor); \ + BlobGetMutableTensor(&blob, CUDA)->CopyFrom(cpu_tensor); \ string serialized = SerializeBlob(blob, "test"); \ BlobProto proto; \ CAFFE_ENFORCE(proto.ParseFromString(serialized)); \ @@ -149,7 +149,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -199,7 +199,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. 
blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); - EXPECT_TRUE(blob.IsTensorType(CUDA)); + EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -207,7 +207,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); - EXPECT_TRUE(blob.IsTensorType(CUDA)); + EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 7ff5a2b25eacc1..d4ef19db69ce4f 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -363,7 +363,8 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { auto tensor_proto = blob_proto.tensor(); Deserialize( tensor_proto, - blob->GetMutableTensor( + BlobGetMutableTensor( + blob, static_cast(tensor_proto.device_detail().device_type()))); } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 24b2a2d0593d3a..bb2f4ba6a91818 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsTensorType(CPU)); + EXPECT_FALSE(BlobIsTensorType(blob, CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsTensorType(CPU)); + EXPECT_FALSE(BlobIsTensorType(blob, CPU)); - Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); - EXPECT_TRUE(blob.IsTensorType(CPU)); + Tensor* tensor_unused CAFFE2_UNUSED = BlobGetMutableTensor(&blob, CPU); + EXPECT_TRUE(BlobIsTensorType(blob, CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -600,7 +600,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name) \ TEST(TensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - Tensor* tensor = blob.GetMutableTensor(CPU); \ + Tensor* tensor = BlobGetMutableTensor(&blob, CPU); \ tensor->Resize(2, 3); \ for (int i = 0; i < 6; ++i) { \ tensor->mutable_data()[i] = static_cast(i); \ @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -634,7 +634,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { \ TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - TensorCPU* tensor = blob.GetMutableTensor(CPU); \ + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); \ tensor->Resize(0, 3); \ tensor->mutable_data(); \ string serialized = SerializeBlob(blob, "test"); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ 
-669,7 +669,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data) TEST(TensorTest, TensorSerialization_CustomType) { Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(2, 3); for (int i = 0; i < 6; ++i) { tensor->mutable_data()[i].val = i; @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(new_blob.IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); EXPECT_EQ(new_tensor.dim(0), 2); @@ -696,7 +696,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { TEST(TensorTest, Half) { const int64_t kSize = 3000000; Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(kSize); for (int i = 0; i < tensor->size(); ++i) { tensor->mutable_data()[i].x = i % 10000; @@ -724,7 +724,7 @@ TEST(TensorTest, Half) { } Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(new_blob.IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -860,7 +860,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { { VLOG(1) << "Test begin"; Blob blob; - Tensor* tensor = blob.GetMutableTensor(CPU); + Tensor* tensor = BlobGetMutableTensor(&blob, CPU); VLOG(1) << "Allocating blob"; tensor->Resize(d1, d2); auto mutableData = tensor->mutable_data(); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(new_blob->IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(*new_blob, CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); @@ -1030,7 +1030,7 @@ TEST(CustomChunkSize, BigTensorSerialization) { int64_t size = d1 * d2; Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(d1, d2); tensor->mutable_data(); std::mutex mutex; diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 25aa801d265dba..f5683d1497377e 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -122,7 +122,7 @@ class CAFFE2_API OperatorBase : public Observable { static_assert( std::is_same::value, "Output(int, DeviceType) is only available for Tensor"); - return outputs_.at(idx)->GetMutableTensor(type); + return BlobGetMutableTensor(outputs_.at(idx), type); } template @@ -149,7 +149,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool InputIsTensorType(int idx, DeviceType device_type) { - return inputs_.at(idx)->IsTensorType(device_type); + return BlobIsTensorType(*inputs_.at(idx), device_type); } template @@ -162,7 +162,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool OutputIsTensorType(int idx, DeviceType type) { - return outputs_.at(idx)->IsTensorType(type); + return BlobIsTensorType(*outputs_.at(idx), type); } inline int InputSize() const { diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 2c0ad9e7a8127b..8e48b6b7beabca 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -131,7 +131,8 @@ struct WorkspaceIdInjector { "Integer overflow while calculating 
GLOBAL_WORKSPACE_ID blob"); int32_t global_ws_id = (seq_++) + (static_cast(node_id) << 16); Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID); - TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU); + TensorCPU* global_ws_id_tensor = + BlobGetMutableTensor(global_ws_id_blob, CPU); global_ws_id_tensor->Resize(); global_ws_id_tensor->template mutable_data()[0] = global_ws_id; VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id; diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index 11bf9c413c5966..cbc58f742c2398 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -151,7 +151,7 @@ class CAFFE2_API Workspace { auto* to_blob = CreateBlob(blob); CAFFE_ENFORCE(to_blob); const auto& from_tensor = from_blob->template Get(); - auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType()); + auto* to_tensor = BlobGetMutableTensor(to_blob, Context::GetDeviceType()); to_tensor->CopyFrom(from_tensor); } } diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 8d011cd3be8bfa..38ffdc99426452 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,8 +33,9 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), - "Expect cpu tensor if not itensor"); + CAFFE_ENFORCE( + BlobIsTensorType(OperatorBase::InputBlob(i), CPU), + "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || tensor_cpu.size_from_dim(0) == 0, diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 08e6de2ae3f0dc..3226a08c4af9cf 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -89,7 +89,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_[i]->Reset(); } input_share_[i] = false; - auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); + auto dtensor = BlobGetMutableTensor(local_input_blobs_[i], CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); @@ -153,7 +153,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); dst->Reset(new Tensor(CPU)); - auto dtensor = dst->GetMutableTensor(CPU); + auto dtensor = BlobGetMutableTensor(dst, CPU); dtensor->Resize(src_dims); dtensor->ShareData(src); } diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 626568a989b939..468a42df1a9239 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (input_blob.IsTensorType(CPU)) { + if (BlobIsTensorType(input_blob, 
CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h index 6d9713b74612d8..a3135758813ecf 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator { for (int i = 0; i < InputSize(); ++i) { if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - local_input_blobs_[i]->GetMutableTensor(CPU)); + BlobGetMutableTensor(local_input_blobs_[i], CPU)); } else if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - local_input_blobs_[i]->GetMutableTensor(CPU)); + BlobGetMutableTensor(local_input_blobs_[i], CPU)); } else { VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy."; // Note(jiayq): This removes a const but conceptually @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc index 111af03f8602b9..06ec2b50acc178 100644 --- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc +++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc @@ -43,7 +43,7 @@ bool CopyFromGLOp::RunOnDevice() { if (first_run_) { first_run_ = false; for (int i = 0; i < Inputs().size(); ++i) { - auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); + auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); Y->Resize(inputs_[i]->dims()); Y->template mutable_data(); } @@ -54,7 +54,7 @@ bool CopyFromGLOp::RunOnDevice() { // GLTensor auto* X = inputs_[i].get(); X->lazy_allocate(Xblob, second_run_, true); - auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); + auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); Timer timer; timer.Start(); getTensorCPU(*X, *Y); diff --git a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h index daa7ef008fc7b3..68f79e84a89f87 100644 --- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h +++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h @@ -27,7 +27,7 @@ template void PopulateCPUBlob(Workspace *ws, bool random, std::string name, std::vector dims, int val = 1, int dist_shift = 0, float variance = 1) { Blob *blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(dims); T *t_data = tensor->mutable_data(); std::random_device rd; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 52f746f63f317b..742f8e48f4e9e1 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,13 +489,13 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. 
noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsTensorType(CPU) || + if (!BlobIsTensorType(*noiseBlob, CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; // Initialize random noise on first use. // Cache it to maintain temporal consistency. - auto* t = noiseBlob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(noiseBlob, CPU); t->Resize(noiseSize); math::RandGaussian( t->size(), diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 7216b16611aa2a..7ac629019c58c0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("stddev"), CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. @@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("pw"), CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(shared ? 
channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(name), CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. @@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -682,8 +682,8 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPool Test: " << pool; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,7 +1072,8 @@ void testMPSCNN() { LOG(INFO) << 
"MPSCNNConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = + BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1080,7 +1081,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1092,7 +1093,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1188,7 +1189,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1196,7 +1197,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1204,7 +1205,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1275,7 +1276,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1283,7 +1284,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1291,7 +1292,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1385,7 +1386,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1393,7 +1394,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1401,7 +1402,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1493,7 +1494,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1501,7 +1502,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1509,7 +1510,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(M); CPUContext ctx; 
math::RandGaussian( @@ -1607,7 +1608,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1615,7 +1616,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1623,7 +1624,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1726,7 +1727,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1734,7 +1735,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1791,7 +1792,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1799,7 +1800,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1856,7 +1857,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1864,7 +1865,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1921,7 +1922,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1929,7 +1930,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2011,7 +2012,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2065,7 +2066,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2136,7 +2137,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, channels, 40, 40); CPUContext 
ctx; math::RandGaussian( @@ -2144,7 +2145,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2250,14 +2251,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 8, 40, 40); CPUContext ctx; math::RandGaussian( t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2362,7 +2363,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2497,7 +2498,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2505,7 +2506,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("bbox_delta_cpu"), CPU); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2513,7 +2514,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("im_info"), CPU); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2521,7 +2522,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("anchors"), CPU); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2587,7 +2588,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); // Only works for spatial dimension of (1, 1) - weird. 
t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2661,8 +2662,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2675,7 +2676,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutableTensor(CPU); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize( inputChannels, outputChannels, @@ -2692,7 +2693,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutableTensor(CPU); + BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2809,7 +2810,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(batchSize, array ? (i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2891,7 +2892,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2964,7 +2965,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3336,8 +3337,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace cws; cws.RunNetOnce(initNet); { - auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + cws.CreateBlob(predictNet.external_input(0)), CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3348,8 +3349,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace mws; mws.RunNetOnce(initNet); { - auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + mws.CreateBlob(predictNet.external_input(0)), CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3397,16 +3398,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = BlobGetMutableTensor( \ + ws.CreateBlob(predictNet.external_input(0)), CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/mobile/contrib/ios/pool_test.cc b/caffe2/mobile/contrib/ios/pool_test.cc index 47fd405eef01e4..3f78c5d1fcd6ae 100644 --- a/caffe2/mobile/contrib/ios/pool_test.cc +++ b/caffe2/mobile/contrib/ios/pool_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - 
auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ios/resize_test.cc b/caffe2/mobile/contrib/ios/resize_test.cc index 1c08df0f32a1c0..428c395fe442d4 100644 --- a/caffe2/mobile/contrib/ios/resize_test.cc +++ b/caffe2/mobile/contrib/ios/resize_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/nnapi/nnapi.cc b/caffe2/mobile/contrib/nnapi/nnapi.cc index 45ea26c44cc964..56f1fc28986a7c 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi.cc @@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) { output_dims.push_back(dim); } - auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(ws_.CreateBlob(blob), CPU); tensor->Resize(output_dims); outputs->push_back(tensor); diff --git a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc index 359e7767746b69..c14e9ed26376e1 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + #include "caffe2/core/init.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" @@ -43,14 +43,14 @@ static double benchmark_conv_caffe2( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group == 1) { t->Resize(K, C, kernel, kernel); } else { @@ -61,7 +61,7 @@ static double benchmark_conv_caffe2( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -129,14 +129,14 @@ static double benchmark_conv_nnapi( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -148,7 +148,7 @@ static double benchmark_conv_nnapi( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -190,7 +190,7 @@ static double benchmark_conv_nnapi( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < 
warmup; i++) { @@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; } } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8( // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and // bias_scale == input_scale * filter_scale. { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; @@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < warmup; i++) { diff --git a/caffe2/mobile/contrib/nnapi/nnapi_test.cc b/caffe2/mobile/contrib/nnapi/nnapi_test.cc index deab1ca7b43f76..9b4608dc07aee1 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc @@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) { // CPU reference Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -103,21 +103,21 @@ static void test_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, kernel, kernel, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -189,7 +189,7 @@ static void test_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + 
auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(1, kernel, kernel, D); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(D); CPUContext ctx; math::RandGaussian( @@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -428,7 +428,7 @@ static void test_pooling( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -496,7 +496,7 @@ static void test_pooling( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -506,7 +506,7 @@ static void test_pooling( static void test_softmax(int N, int C, int H = 1, int W = 1) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); if (H == 1 && W == 1) { t->Resize(N, C); } else { @@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc index 9da266c4e85051..690a33cb854f16 100644 --- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc +++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc @@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1, LOG(INFO) << "OPENGLCopyFrom/To Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -275,7 +275,7 @@ void testOpenGLConv(int N, << " Op: " << glPoolOperationName[poolOp]; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -301,7 +301,7 @@ void testOpenGLConv(int N, } if (poolOp != AveragePool && poolOp != MaxPool) { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) { t->Resize(K, C, kernel_h, kernel_w); } else { @@ -343,7 +343,7 @@ void testOpenGLConv(int N, // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -367,7 +367,7 @@ void testOpenGLConv(int N, } if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) { - 
auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -532,7 +532,7 @@ void testOpenGLPRelu( << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -541,7 +541,7 @@ void testOpenGLPRelu( // prelu scale { - auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(prelu_size); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); + auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); + auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) { Workspace ws; { - auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); + auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); + auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -814,8 +814,8 @@ void testOpenGLConcat(int N, std::vector Cs, int H, int W, bool tiling = fa << "H: " << H << ", W: " << W; Workspace ws; for (int i = 0; i < Cs.size(); i++) { - auto* t = - ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu" + caffe2::to_string(i)), CPU); t->Resize(N, Cs[i], H, W); CPUContext ctx0; // Too noisy. @@ -891,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
@@ -942,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 2, t->mutable_data(), &ctx); @@ -992,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(1); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); @@ -1060,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) { LOG(INFO) << "OpenGL Softmax Test " << "N: " << N << " D: " << D << " Tiled:" << tiled; Workspace ws; - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); { t->Resize(N, D); CPUContext ctx; @@ -1151,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -1163,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { // scale { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1172,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { } // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1254,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
@@ -1266,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { // scale { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1275,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1284,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // prelu scale { - auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(C); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1385,7 +1385,7 @@ void OpenGL_speedtest(int N, << " C: " << C << " H: " << H << " W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1399,7 +1399,7 @@ void OpenGL_speedtest(int N, } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1413,7 +1413,7 @@ void OpenGL_speedtest(int N, } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1479,7 +1479,7 @@ void testOpenGLPadImage( { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1593,7 +1593,7 @@ void testOpenGLResize(int N, { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1675,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGL Preprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1684,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -1748,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGLDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1757,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1800,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, 
float error) { LOG(INFO) << "OpenGLNormPlanarYUV Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, 3, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1809,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1818,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("stdev"), CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 6; @@ -1879,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N, LOG(INFO) << "OpenGL CopyOps Speed Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1893,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1907,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1990,8 +1990,8 @@ void compareModelsForOpenGL(std::string name, Workspace cws; cws.RunNetOnce(initNet); - auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0)) - ->GetMutableTensor(CPU); + auto* t_cpu = BlobGetMutableTensor( + cws.CreateBlob(truncatedPredictNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2032,8 +2032,8 @@ void compareModelsForOpenGL(std::string name, Workspace mws; mws.RunNetOnce(initNet); - auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)) - ->GetMutableTensor(CPU); + auto* t_gl = BlobGetMutableTensor( + mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2116,7 +2116,7 @@ void compareBatchedToTiledModels(std::string name, tws.RunNetOnce(initNet); auto* t_batch = - tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU); + BlobGetMutableTensor(tws.CreateBlob(bachedNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2143,7 +2143,7 @@ void compareBatchedToTiledModels(std::string name, bws.RunNetOnce(initNet); auto* t_tiling = - bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU); + BlobGetMutableTensor(bws.CreateBlob(tiledNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); diff --git a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc index deced719644963..cfeed00e8b9730 100644 --- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc +++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc @@ -14,7 +14,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* 
_tensor = _blob->GetMutableTensor(CPU); \ + auto* _tensor = BlobGetMutableTensor(_blob, CPU); \ _tensor->Resize((_s)); \ memcpy(_tensor->mutable_data(), data_##_l, _tensor->nbytes()); \ } while (0) @@ -23,7 +23,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* _tensor = _blob->GetMutableTensor(CPU); \ + auto* _tensor = BlobGetMutableTensor(_blob, CPU); \ _tensor->Resize((_s)); \ memset(_tensor->mutable_data(), 1, _tensor->nbytes()); \ } while (0) @@ -43,7 +43,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), @@ -56,7 +56,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ulp2/ulp_test.cc b/caffe2/mobile/contrib/ulp2/ulp_test.cc index a1c1af0f6dfb8d..6316b05284fba9 100644 --- a/caffe2/mobile/contrib/ulp2/ulp_test.cc +++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc @@ -289,13 +289,13 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r)); def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t)); def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b)); - auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU); + auto* Xws = BlobGetMutableTensor(ws.CreateBlob("X"), CPU); Xws->ResizeLike(X); Xws->ShareExternalPointer(X.mutable_data(), X.size()); - auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* Wws = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); Wws->ResizeLike(W_); Wws->ShareExternalPointer(W_.mutable_data(), W_.size()); - auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* bws = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); bws->ResizeLike(bias); bws->ShareExternalPointer(bias.mutable_data(), bias.size()); ws.RunOperatorOnce(def); diff --git a/caffe2/operators/batch_matmul_op_gpu_test.cc b/caffe2/operators/batch_matmul_op_gpu_test.cc index 804296307d6ef8..31e179b3e41f82 100644 --- a/caffe2/operators/batch_matmul_op_gpu_test.cc +++ b/caffe2/operators/batch_matmul_op_gpu_test.cc @@ -30,7 +30,7 @@ class BatchMatMulOpGPUTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/batch_matmul_op_test.cc b/caffe2/operators/batch_matmul_op_test.cc index 45db7dd5b8484a..c74829b4f8f9c5 100644 --- a/caffe2/operators/batch_matmul_op_test.cc +++ b/caffe2/operators/batch_matmul_op_test.cc @@ -24,7 +24,7 @@ class BatchMatMulOpTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/boolean_unmask_ops_test.cc b/caffe2/operators/boolean_unmask_ops_test.cc index 8814be17153d44..b0c5f7dcdfff0b 100644 --- a/caffe2/operators/boolean_unmask_ops_test.cc +++ b/caffe2/operators/boolean_unmask_ops_test.cc @@ -16,7 +16,7 @@ 
static void AddScalarInput( Workspace* ws, bool isEmpty = false) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); if (!isEmpty) { tensor->Resize(vector{1}); *(tensor->template mutable_data()) = value; diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc index b9f54b6d55be7c..155b6f0cd24561 100644 --- a/caffe2/operators/conv_op_shared.cc +++ b/caffe2/operators/conv_op_shared.cc @@ -27,8 +27,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* buffer = - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutableTensor(CPU); + auto* buffer = BlobGetMutableTensor( + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__"), CPU); f(buffer); } } diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc index f80d15a5d9054b..c1f37c7f1362f2 100644 --- a/caffe2/operators/conv_op_shared_gpu.cc +++ b/caffe2/operators/conv_op_shared_gpu.cc @@ -20,8 +20,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* buffer = - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")->GetMutableTensor(CUDA); + auto* buffer = BlobGetMutableTensor( + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__"), CUDA); f(buffer); } } diff --git a/caffe2/operators/conv_transpose_op_mobile_test.cc b/caffe2/operators/conv_transpose_op_mobile_test.cc index 6eb45eb5f8d17c..3bc2951664353b 100644 --- a/caffe2/operators/conv_transpose_op_mobile_test.cc +++ b/caffe2/operators/conv_transpose_op_mobile_test.cc @@ -17,7 +17,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -29,7 +29,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc index 83294224280831..e3c0abe83d8b4e 100644 --- a/caffe2/operators/dataset_ops.cc +++ b/caffe2/operators/dataset_ops.cc @@ -1428,7 +1428,7 @@ class TreeCursorSerializer : public BlobSerializerBase { // serialize offsets as a tensor if (cursor->offsets.size() > 0) { Blob offsets_blob; - auto* offsets = offsets_blob.GetMutableTensor(CPU); + auto* offsets = BlobGetMutableTensor(&offsets_blob, CPU); offsets->Resize(cursor->offsets.size()); std::copy( cursor->offsets.begin(), diff --git a/caffe2/operators/dropout_op_cudnn.cc b/caffe2/operators/dropout_op_cudnn.cc index a68a1263f6f451..8a40c731143f44 100644 --- a/caffe2/operators/dropout_op_cudnn.cc +++ b/caffe2/operators/dropout_op_cudnn.cc @@ -150,7 +150,7 @@ bool CuDNNDropoutOp::DoRunWithType() { // Reshape tensor descriptors if necessary if (X.dims() != cudnn_input_dims_ && !is_test_) { CAFFE_ENFORCE(scratch_blob_); - Tensor* states = scratch_blob_->GetMutableTensor(CUDA); + Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA); cudnn_input_dims_ = X.dims(); CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( data_desc_, diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index bcd547e28f0989..b785d040c8f1a7 
100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -19,7 +19,7 @@ void FillTensor( const std::vector& shape, const std::vector& values) { auto* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); + auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); tensor->Resize(shape); auto* mutable_data = tensor->template mutable_data(); const O_Type* data = reinterpret_cast(values.data()); diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index 2b3a033a665df7..da7fdc650879c3 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -18,7 +18,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -34,7 +34,7 @@ static void AddLinSpacedInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -51,7 +51,7 @@ static void AddInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc index 241b0ff97c6070..2fb8f3b338dc64 100644 --- a/caffe2/operators/index_ops.cc +++ b/caffe2/operators/index_ops.cc @@ -353,7 +353,7 @@ class IndexSerializer : public BlobSerializerBase { SerializationAcceptor acceptor) override { auto& base = blob.template Get>(); Blob tensor_blob; - auto* tensor_out = tensor_blob.GetMutableTensor(CPU); + auto* tensor_out = BlobGetMutableTensor(&tensor_blob, CPU); if (base->Type().Match()) { doStore(base, tensor_out); diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h index dbd5103952469c..7a3c34cfbf7cce 100644 --- a/caffe2/operators/onnx_while_op.h +++ b/caffe2/operators/onnx_while_op.h @@ -213,23 +213,23 @@ class ONNXWhileOp final : public Operator { lcd_tensors_.clear(); for (int i = 2; i < body_net_def.external_input_size(); ++i) { Blob* b = loop_ws_->CreateBlob(body_net_def.external_input(i)); - Tensor* t = b->GetMutableTensor(Context::GetDeviceType()); + Tensor* t = BlobGetMutableTensor(b, Context::GetDeviceType()); lcd_tensors_.push_back(t); } // First output is the iteration variable auto* iteration_var_blob = loop_ws_->CreateBlob( body_net_def.external_input(0)); iteration_var_ = - iteration_var_blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(iteration_var_blob, Context::GetDeviceType()); - input_condition_var_ = - loop_ws_->CreateBlob(body_net_def.external_input(1)) - ->GetMutableTensor(Context::GetDeviceType()); + input_condition_var_ = BlobGetMutableTensor( + loop_ws_->CreateBlob(body_net_def.external_input(1)), + Context::GetDeviceType()); auto* condition_var_blob = loop_ws_->CreateBlob(body_net_def.external_output(0)); condition_var_ = - condition_var_blob->GetMutableTensor(Context::GetDeviceType()); + 
BlobGetMutableTensor(condition_var_blob, Context::GetDeviceType()); condition_var_->Resize(1); condition_var_->template mutable_data(); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index d1b0824f1b3191..767a37d5fc7924 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - blob->IsTensorType(CPU), + BlobIsTensorType(*blob, CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index 8ef39e7c0e78d1..5b3a38dbfbd13d 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -65,8 +65,8 @@ class GPUFallbackOpEx final : public Operator { bool need_sync = false; for (int i = 0; i < InputSize(); ++i) { if (this->InputIsTensorType(i, CUDA)) { - local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( - Input(i), &context_); + BlobGetMutableTensor(local_input_blobs_[i], CPU) + ->CopyFrom(Input(i), &context_); need_sync = true; } else { VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy."; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/operator_fallback_gpu_test.cc b/caffe2/operators/operator_fallback_gpu_test.cc index 964708bc10906f..0870a4be2dd7bd 100644 --- a/caffe2/operators/operator_fallback_gpu_test.cc +++ b/caffe2/operators/operator_fallback_gpu_test.cc @@ -40,7 +40,7 @@ TEST(OperatorFallbackTest, IncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - ws.CreateBlob("X")->GetMutableTensor(CPU)->CopyFrom(source_tensor); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); @@ -64,7 +64,7 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - ws.CreateBlob("X")->GetMutableTensor(CUDA)->CopyFrom(source_tensor); + BlobGetMutableTensor(ws.CreateBlob("X"), CUDA)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); diff --git a/caffe2/operators/reshape_op_gpu_test.cc b/caffe2/operators/reshape_op_gpu_test.cc index 3537ab69d058f0..d4ac325a78b80a 100644 --- a/caffe2/operators/reshape_op_gpu_test.cc +++ b/caffe2/operators/reshape_op_gpu_test.cc @@ -20,7 +20,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h index 98675cea858d54..63d58f3ccd8f6d 100644 --- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h +++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h @@ -43,11 +43,10 @@ class RecurrentNetworkBlobFetcherOp final : public 
Operator { prefix_ + std::string("_") + blob_name + caffe2::to_string(i); blob_names_vector.push_back(newBlobName); - ws_->CreateBlob(newBlobName) - ->GetMutableTensor(CPU) + BlobGetMutableTensor(ws_->CreateBlob(newBlobName), CPU) ->ResizeLike(currentTensor); auto type = Context::GetDeviceType(); - auto* newTensor = ws_->GetBlob(newBlobName)->GetMutableTensor(type); + auto* newTensor = BlobGetMutableTensor(ws_->GetBlob(newBlobName), type); newTensor->CopyFrom(currentTensor); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h index 7e37e562e77a50..4cb53a6d7d330a 100644 --- a/caffe2/operators/rnn/recurrent_network_executor.h +++ b/caffe2/operators/rnn/recurrent_network_executor.h @@ -111,10 +111,10 @@ class RecurrentNetworkExecutorBase { // the forward-only mode. std::string this_timestep_blob = timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t); - ws->CreateBlob(this_timestep_blob)->GetMutableTensor(CPU)->Resize(1); + BlobGetMutableTensor(ws->CreateBlob(this_timestep_blob), CPU)->Resize(1); auto b = ws->GetBlob(this_timestep_blob); CAFFE_ENFORCE(b); - b->GetMutableTensor(CPU)->template mutable_data()[0] = t; + BlobGetMutableTensor(b, CPU)->template mutable_data()[0] = t; // Copy the operators from template for (auto& template_rnn_op : timestep_ops_template_) { diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h index 2421bc44263afd..21b3064a6fac3c 100644 --- a/caffe2/operators/rnn/recurrent_network_op.h +++ b/caffe2/operators/rnn/recurrent_network_op.h @@ -52,10 +52,11 @@ struct CAFFE2_API ScratchWorkspaces { }; inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) { - ws->CreateBlob(blob_name)->GetMutableTensor(CPU)->Resize(1); + BlobGetMutableTensor(ws->CreateBlob(blob_name), CPU)->Resize(1); auto timestepBlob = ws->GetBlob(blob_name); CAFFE_ENFORCE(timestepBlob); - timestepBlob->GetMutableTensor(CPU)->template mutable_data()[0] = t; + BlobGetMutableTensor(timestepBlob, CPU)->template mutable_data()[0] = + t; } CAFFE2_API std::map GetRecurrentMapping( @@ -71,8 +72,9 @@ void applyOffsetAlias( << " at offset: " << oc.offset; auto srcBlob = ws->GetBlob(oc.src); CAFFE_ENFORCE(srcBlob); - auto* src = srcBlob->GetMutableTensor(Context::GetDeviceType()); - auto* dst = ws->GetBlob(oc.dst)->GetMutableTensor(Context::GetDeviceType()); + auto* src = BlobGetMutableTensor(srcBlob, Context::GetDeviceType()); + auto* dst = + BlobGetMutableTensor(ws->GetBlob(oc.dst), Context::GetDeviceType()); auto timestep = src->size() / src->dim(0); auto dims = src->dims(); const int32_t startDstTimestep = @@ -113,7 +115,7 @@ void initializeRecurrentInput( Context* context) { auto stateBlob = ws->GetBlob(rc.state); CAFFE_ENFORCE(stateBlob); - auto* state = stateBlob->GetMutableTensor(Context::GetDeviceType()); + auto* state = BlobGetMutableTensor(stateBlob, Context::GetDeviceType()); auto inputBlob = ws->GetBlob(rc.input); CAFFE_ENFORCE(inputBlob); @@ -660,7 +662,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->GetBlob(param.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); g->ResizeLike(p); math::Set( g->size(), @@ -676,7 +678,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->CreateBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = 
BlobGetMutableTensor(gBlob, Context::GetDeviceType()); g->ResizeLike(p); CAFFE_ENFORCE_EQ(g->ndim(), 3); const auto timestep = g->size() / g->dim(0); @@ -703,7 +705,7 @@ class RecurrentNetworkGradientOp final : public Operator { << ". Size: " << Input(gradientInputIndex).size(); auto pGradientBlob = sharedWs_->GetBlob(gradientName); CAFFE_ENFORCE(pGradientBlob); - auto* g = pGradientBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(pGradientBlob, Context::GetDeviceType()); g->ResizeLike(Input(gradientInputIndex)); g->template mutable_data(); } @@ -717,7 +719,7 @@ class RecurrentNetworkGradientOp final : public Operator { << rg.lastExternalGrad << " for final time step (sep. blob)"; auto gBlob = sharedWs_->GetBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); auto oglastBlob = sharedWs_->GetBlob(rg.lastExternalGrad); CAFFE_ENFORCE(oglastBlob); @@ -779,7 +781,7 @@ class RecurrentNetworkGradientOp final : public Operator { T* output_data = Output(outputIdx)->template mutable_data(); auto pBlob = sharedWs_->GetBlob(recurrentGradients_[i].grad); CAFFE_ENFORCE(pBlob); - auto* p = pBlob->GetMutableTensor(Context::GetDeviceType()); + auto* p = BlobGetMutableTensor(pBlob, Context::GetDeviceType()); if (Input(inputId).ndim() >= 2) { // Gradient states blob should live. And if it gets changed by the diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc index 2647a97d6f0b90..7257ec44c25984 100644 --- a/caffe2/operators/roi_align_op_gpu_test.cc +++ b/caffe2/operators/roi_align_op_gpu_test.cc @@ -18,7 +18,7 @@ void AddConstInput( Context* context, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); + auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), context); @@ -39,7 +39,7 @@ void AddInput( const string& name, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -57,7 +57,7 @@ void AddInput( tmp_vec.array() = utils::AsEArrXt(values); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->CopyFrom(tmp); } diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index c9ba13efb50258..2092ae804f2c3b 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -9,7 +9,7 @@ class StringJoinOpTest : public testing::Test { public: bool runOp(const TensorCPU& input) { auto* blob = ws_.CreateBlob("X"); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->ResizeLike(input); tensor->ShareData(input); @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(output->IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(*output, CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); @@ -42,7 +42,7 @@ TEST_F(StringJoinOpTest, testString1DJoin) { 
std::vector input = {"a", "xx", "c"}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -62,7 +62,7 @@ TEST_F(StringJoinOpTest, testString2DJoin) { {"dd", "ee", "ff"}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -82,7 +82,7 @@ TEST_F(StringJoinOpTest, testFloat1DJoin) { std::vector input = {3.90f, 5.234f, 8.12f}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -102,7 +102,7 @@ TEST_F(StringJoinOpTest, testFloat2DJoin) { {4.67f, 5.90f, 6.32f}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -122,7 +122,7 @@ TEST_F(StringJoinOpTest, testLong2DJoin) { std::vector> input = {{100, 200}, {1000, 2000}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index a6d395fe9ba647..bfc41a462999b5 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,10 +82,10 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!noiseBlob->IsTensorType(CPU)) { + if (!BlobIsTensorType(*noiseBlob, CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. 
- auto* t = noiseBlob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(noiseBlob, CPU); #if defined(__ARM_NEON__) || defined(__ARM_NEON) // Noise space is larger for vectorized code due to the diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h index cd081bf959e399..e9f5b1a8f8455f 100644 --- a/caffe2/operators/tensor_protos_db_input.h +++ b/caffe2/operators/tensor_protos_db_input.h @@ -56,7 +56,7 @@ bool TensorProtosDBInput::Prefetch() { protos.mutable_protos(i)->clear_device_detail(); } deserializer.Deserialize( - protos.protos(i), prefetched_blobs_[i].GetMutableTensor(CPU)); + protos.protos(i), BlobGetMutableTensor(&prefetched_blobs_[i], CPU)); } } else { vector temp_tensors; @@ -74,11 +74,11 @@ bool TensorProtosDBInput::Prefetch() { vector dims( protos.protos(i).dims().begin(), protos.protos(i).dims().end()); dims.insert(dims.begin(), batch_size_); - prefetched_blobs_[i].GetMutableTensor(CPU)->Resize(dims); + BlobGetMutableTensor(&prefetched_blobs_[i], CPU)->Resize(dims); } } for (int i = 0; i < protos.protos_size(); ++i) { - TensorCPU* dst = prefetched_blobs_[i].GetMutableTensor(CPU); + TensorCPU* dst = BlobGetMutableTensor(&prefetched_blobs_[i], CPU); TensorCPU& src = temp_tensors[i]; if (protos.protos(i).has_device_detail()) { protos.mutable_protos(i)->clear_device_detail(); diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h index 421c26e318b6e9..1a5cdc344ce4a8 100644 --- a/caffe2/operators/tt_linear_op.h +++ b/caffe2/operators/tt_linear_op.h @@ -52,7 +52,7 @@ class TTLinearOp final : public Operator { int cores_idx = 0; // Temporary buffer to facilitate multiplication of TT-cores with input - auto Y_buf = Y_temp_->GetMutableTensor(Context::GetDeviceType()); + auto Y_buf = BlobGetMutableTensor(Y_temp_.get(), Context::GetDeviceType()); Y_buf->ResizeLike(X); Y_buf->CopyFrom(X); diff --git a/caffe2/operators/utility_ops_gpu_test.cc b/caffe2/operators/utility_ops_gpu_test.cc index f500afaf9ed24f..1099d900cbefdc 100644 --- a/caffe2/operators/utility_ops_gpu_test.cc +++ b/caffe2/operators/utility_ops_gpu_test.cc @@ -19,7 +19,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/utility_ops_test.cc b/caffe2/operators/utility_ops_test.cc index 379dd52655c4f4..a3a2a409674edd 100644 --- a/caffe2/operators/utility_ops_test.cc +++ b/caffe2/operators/utility_ops_test.cc @@ -16,7 +16,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index fdf5fdc31e1046..8c324a97c50934 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -44,10 +44,10 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { CAFFE_ENFORCE( bnInputs.size() >= 5, "Invalid batch normalization input size"); -#define EXPOSE_TENSOR_DATA(name, index, inputs) \ - auto name = repr::nn::get(inputs[index]); \ - assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ - auto name##Tensor = 
ws->GetBlob(name->getName())->GetMutableTensor(CPU); \ +#define EXPOSE_TENSOR_DATA(name, index, inputs) \ + auto name = repr::nn::get(inputs[index]); \ + assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ + auto name##Tensor = BlobGetMutableTensor(ws->GetBlob(name->getName()), CPU); \ auto name##Data = name##Tensor->mutable_data(); EXPOSE_TENSOR_DATA(filter, 1, convInputs); @@ -76,7 +76,7 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { nn->dataFlow.createEdge(convBiasNode, convNode); auto* blob = ws->CreateBlob(convBiasName); - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); // Get output channel size_t c = filterTensor->dim32(0); diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index ce79df56ecb728..a048503fea99c7 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -173,7 +173,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp( // Feed into workspace as CPU Tensors auto* blob = ws->CreateBlob(t.name()); - auto* cpu_tensor = blob->GetMutableTensor(CPU); + auto* cpu_tensor = BlobGetMutableTensor(blob, CPU); std::vector dims; for(const auto& d : t.dims()) { dims.push_back(d); diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 84dac93753d37a..7775e69776450c 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,14 +10,14 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); + BlobIsTensorType(*blob, CPU), "Blob is not a CPU Tensor: ", name); } TensorCPU* getTensor(Workspace* ws, const std::string& name) { enforceIsTensor(ws, name); auto* blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist"); - return blob->GetMutableTensor(CPU); + return BlobGetMutableTensor(blob, CPU); } void shareInputTensor( @@ -60,7 +60,7 @@ Predictor::Predictor(PredictorConfig config) : config_(std::move(config)) { for (const auto& name : config_.predict_net->external_input()) { if (!initialized.count(name)) { auto* blob = config_.ws->CreateBlob(name); - blob->GetMutableTensor(CPU); + BlobGetMutableTensor(blob, CPU); } } CAFFE_ENFORCE(config_.ws->CreateNet(config_.predict_net)); diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc index ae4f73e9da0ad7..a0245cd7a86d66 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -135,7 +135,7 @@ std::unique_ptr randomTensor( const std::vector& dims, CPUContext* ctx) { auto blob = make_unique(); - auto* t = blob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(blob.get(), CPU); t->Resize(dims); math::RandUniform( t->size(), -1.0, 1.0, t->template mutable_data(), ctx); @@ -180,7 +180,7 @@ TEST_F(PredictorTest, SimpleBatchSized) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorList input; input.emplace_back(CPU); - auto tensor = inputData->GetMutableTensor(CPU); + auto tensor = BlobGetMutableTensor(inputData.get(), CPU); input.back().ResizeLike(*tensor); input.back().ShareData(*tensor); Predictor::TensorList output; @@ -196,7 +196,7 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorMap input; auto iter = input.emplace("data", Tensor(CPU)); - 
auto tensor = inputData->GetMutableTensor(CPU); + auto tensor = BlobGetMutableTensor(inputData.get(), CPU); iter.first->second.ResizeLike(*tensor); iter.first->second.ShareData(*tensor); diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 81197047102ffb..9a1d715bfdf225 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -328,7 +328,7 @@ void addObjectMethods(py::module& m) { }) .def( "tensor", - [](Blob* blob) { return py::cast(blob->GetMutableTensor(CPU)); }, + [](Blob* blob) { return py::cast(BlobGetMutableTensor(blob, CPU)); }, py::return_value_policy::reference_internal) .def( "_feed", diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 59f39dd313032c..dd5d3b9bc18ef9 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -234,7 +234,7 @@ class TensorFeeder : public BlobFeederBase { FeedTensor( option, original_array, - blob->GetMutableTensor(Context::GetDeviceType())); + BlobGetMutableTensor(blob, Context::GetDeviceType())); } }; @@ -366,31 +366,32 @@ class PythonOpBase : public Operator { // make sure output blob is initialized before creating the binding if (forced_cpu_outputs_.count(i)) { - blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(blob, Context::GetDeviceType()); } else { - blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(blob, Context::GetDeviceType()); } py::object py_obj; if (blob->template IsType()) { if (use_dlpack) { DLPackWrapper wrapper( - blob->GetMutableTensor(Context::GetDeviceType()), cpu_option); + BlobGetMutableTensor(blob, Context::GetDeviceType()), + cpu_option); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), py::return_value_policy::reference); } } else { if (use_dlpack) { DLPackWrapper wrapper( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), this->device_option()); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), py::return_value_policy::reference); } } diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index ebad6cf8d96839..f0307f7b6485d2 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -163,8 +163,8 @@ class IDeepFeeder : public BlobFeederBase { DeviceOption cpu_option(option); cpu_option.set_device_type(DeviceTypeProto::PROTO_CPU); TensorFeeder cpu_tensor_feeder; - cpu_tensor_feeder.FeedTensor(cpu_option, original_array, - blob->GetMutableTensor(CPU)); + cpu_tensor_feeder.FeedTensor( + cpu_option, original_array, BlobGetMutableTensor(blob, CPU)); } } catch (ideep::error &e) { LOG(ERROR) << "IDEEP error: " << e.message; diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc index 4ac3524d49d8a6..d102985e2fd7aa 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); 
math::RandGaussian( diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc index 05c945106c52da..f11e05b67392c9 100644 --- a/caffe2/share/contrib/nnpack/conv_op.cc +++ b/caffe2/share/contrib/nnpack/conv_op.cc @@ -231,11 +231,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { (transformedFilterSize + sizeof(float) - 1) / sizeof(float); for (auto g = 0; g < group_; g++) { - transformedFilters_[g] = ws_->CreateBlob( - "__transformed_kernel_" + - to_string(__sync_fetch_and_add( - &precomputed_transform_id, 1))) - ->GetMutableTensor(CPU); + transformedFilters_[g] = BlobGetMutableTensor( + ws_->CreateBlob( + "__transformed_kernel_" + + to_string( + __sync_fetch_and_add(&precomputed_transform_id, 1))), + CPU); transformedFilters_[g]->Resize(transformedFilterElements); status = nnp_convolution_inference( diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc index 2f892118982da2..10eb6348becc06 100644 --- a/caffe2/share/contrib/nnpack/nnpack_test.cc +++ b/caffe2/share/contrib/nnpack/nnpack_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc index 911c2b09868fc3..a5df5900ee23a2 100644 --- a/caffe2/utils/hip/math_blas_hip_test.cc +++ b/caffe2/utils/hip/math_blas_hip_test.cc @@ -26,13 +26,13 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) { vector shapeX{5, 10}; vector shapeW{10, 6}; vector shapeY{5, 6}; - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorW = blobW->GetMutableTensor(HIP); + auto* tensorW = BlobGetMutableTensor(blobW, HIP); tensorW->Resize(shapeW); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -126,13 +126,13 @@ TEST(MathROCBLASTest, GemmNoTransTrans) { vector shapeX{5, 10}; vector shapeW{6, 10}; vector shapeY{5, 6}; - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorW = blobW->GetMutableTensor(HIP); + auto* tensorW = BlobGetMutableTensor(blobW, HIP); tensorW->Resize(shapeW); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -225,13 +225,13 @@ TEST(MathROCBLASTest, GemvNoTrans) { vector shapeA{5, 10}; vector shapeX{10}; vector shapeY{5}; - auto* tensorA = blobA->GetMutableTensor(HIP); + auto* tensorA = BlobGetMutableTensor(blobA, HIP); tensorA->Resize(shapeA); - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = 
BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 50); @@ -315,13 +315,13 @@ TEST(MathROCBLASTest, GemvTrans) { vector shapeA{6, 10}; vector shapeX{6}; vector shapeY{10}; - auto* tensorA = blobA->GetMutableTensor(HIP); + auto* tensorA = BlobGetMutableTensor(blobA, HIP); tensorA->Resize(shapeA); - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 60); diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc index 9be1c3db6c1d01..4b0247a0786fcc 100644 --- a/caffe2/utils/math_gpu_test.cc +++ b/caffe2/utils/math_gpu_test.cc @@ -41,9 +41,9 @@ void executeGpuBinaryOpTest( Blob* bloby = ws.CreateBlob("Y"); Blob* bloby_host = ws.CreateBlob("Y_host"); - auto* tensorx0 = blobx0->GetMutableTensor(CUDA); - auto* tensorx1 = blobx1->GetMutableTensor(CUDA); - auto* tensory = bloby->GetMutableTensor(CUDA); + auto* tensorx0 = BlobGetMutableTensor(blobx0, CUDA); + auto* tensorx1 = BlobGetMutableTensor(blobx1, CUDA); + auto* tensory = BlobGetMutableTensor(bloby, CUDA); vector shapex0_vector{shapex0}; vector shapex1_vector{shapex1}; @@ -71,7 +71,7 @@ void executeGpuBinaryOpTest( context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = bloby_host->GetMutableTensor(CPU); + auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -94,7 +94,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { vector shapex{33 * 9, 25}; vector shapey{33, 25}; - auto* tensorx = blobx->GetMutableTensor(CUDA); + auto* tensorx = BlobGetMutableTensor(blobx, CUDA); tensorx->Resize(shapex); int stripe = 33 * 25; vector tot(33, 0.0); @@ -110,7 +110,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { } } - auto* tensory = bloby->GetMutableTensor(CUDA); + auto* tensory = BlobGetMutableTensor(bloby, CUDA); tensory->Resize(shapey); math::Set( stripe, 0.0, tensory->mutable_data(), &context); @@ -125,7 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = bloby_host->GetMutableTensor(CPU); + auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -258,9 +258,9 @@ class GemmBatchedGPUTest Blob* X_blob = ws_.CreateBlob("X"); Blob* W_blob = ws_.CreateBlob("W"); Blob* Y_blob = ws_.CreateBlob("Y"); - X_ = X_blob->GetMutableTensor(CUDA); - W_ = W_blob->GetMutableTensor(CUDA); - Y_ = Y_blob->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(X_blob, CUDA); + W_ = BlobGetMutableTensor(W_blob, CUDA); + Y_ = BlobGetMutableTensor(Y_blob, CUDA); X_->Resize(std::vector{3, 5, 10}); W_->Resize(std::vector{3, 6, 10}); Y_->Resize(std::vector{3, 5, 6}); @@ -381,8 +381,8 @@ class ReduceTensorGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -402,7 
+402,7 @@ class ReduceTensorGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -664,8 +664,8 @@ class BroadcastGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -681,7 +681,7 @@ class BroadcastGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -741,9 +741,9 @@ class MomentsGPUTest : public testing::Test { Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_mean = ws_.CreateBlob("mean"); Blob* blob_variance = ws_.CreateBlob("variance"); - X_ = blob_x->GetMutableTensor(CUDA); - mean_ = blob_mean->GetMutableTensor(CUDA); - variance_ = blob_variance->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + mean_ = BlobGetMutableTensor(blob_mean, CUDA); + variance_ = BlobGetMutableTensor(blob_variance, CUDA); } void SetUpData( @@ -766,10 +766,10 @@ class MomentsGPUTest : public testing::Test { const std::vector& mean_data, const std::vector& variance_data) { Blob* blob_mean_host = ws_.CreateBlob("mean_host"); - auto* mean_host = blob_mean_host->GetMutableTensor(CPU); + auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU); mean_host->CopyFrom(*mean_, cuda_context_.get()); Blob* blob_variance_host = ws_.CreateBlob("variance_host"); - auto* variance_host = blob_variance_host->GetMutableTensor(CPU); + auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU); variance_host->CopyFrom(*variance_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); @@ -868,8 +868,8 @@ class TransposeGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -890,7 +890,7 @@ class TransposeGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); From a106388187dbb8a0fd04e7bff8f4339ce99c4bb4 Mon Sep 17 00:00:00 2001 From: vishwakftw Date: Tue, 25 Sep 2018 12:49:43 -0700 Subject: [PATCH 30/51] Free MAGMA queues after use (#11882) Summary: This PR is a minor change, just adds a simple `magma_queue_destroy` function to the implementation of `Gesv`. Also, I have replaced calls for obtaining handles with those already written in ATen. 
``` THCState_getCurrentSparseHandle(at::globalContext().getTHCState()) --> getCurrentCUDASparseHandle() THCState_getCurrentBlasHandle(at::globalContext().getTHCState()) --> getCurrentCUDABlasHandle() ``` Differential Revision: D10032204 Pulled By: soumith fbshipit-source-id: ccd11989ecdc357313f0b661a2468f75d3aecb0e --- aten/src/ATen/cuda/CUDAContext.cpp | 4 ++++ aten/src/ATen/cuda/CUDAContext.h | 2 ++ aten/src/ATen/native/cuda/Gesv.cu | 12 +++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp index 7f934ef2cb049a..58248acfe17951 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -58,6 +58,10 @@ Allocator* getCUDADeviceAllocator() { cusparseHandle_t getCurrentCUDASparseHandle() { return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); } + + cublasHandle_t getCurrentCUDABlasHandle() { + return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); + } #endif } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h index 564a918d943c01..83a890da4d535e 100644 --- a/aten/src/ATen/cuda/CUDAContext.h +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -9,6 +9,7 @@ #include "cuda_runtime_api.h" #include "cusparse.h" +#include "cublas_v2.h" namespace at { namespace cuda { @@ -60,6 +61,7 @@ CAFFE2_API Allocator* getCUDADeviceAllocator(); /* Handles */ #ifndef __HIP_PLATFORM_HCC__ CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle(); +CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle(); #endif diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 80c7aaeb74f6a8..a5802192eb77df 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -53,12 +53,16 @@ static magma_queue_t createMagmaQueue(const Tensor& tensor) { magma_queue_create_from_cuda( tensor.get_device(), at::cuda::getCurrentCUDAStream(), - THCState_getCurrentBlasHandle(context.getTHCState()), - THCState_getCurrentSparseHandle(context.getTHCState()), + at::cuda::getCurrentCUDABlasHandle(), + at::cuda::getCurrentCUDASparseHandle(), &magma_queue); return magma_queue; } +static void destroyMagmaQueue(magma_queue_t& existing_queue) { + magma_queue_destroy(existing_queue); +} + static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { auto result = static_cast(value); if (static_cast(result) != value) { @@ -117,9 +121,11 @@ AT_ERROR("gesv: MAGMA library not found in " ipiv_array[i] = &ipiv_data[i * n]; } + magma_queue_t gesv_queue = createMagmaQueue(b); magmaGesvBatched( n, nrhs, A_array, n, ipiv_array, b_array, n, - info_array, batch_size, createMagmaQueue(b)); + info_array, batch_size, gesv_queue); + destroyMagmaQueue(gesv_queue); for (int64_t i = 0; i < batch_size; i++) { infos[i] = info_array[i]; From b263078bc3528c1e64a93290d2eaa473bce6ed92 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 25 Sep 2018 13:07:47 -0700 Subject: [PATCH 31/51] Fix CUDA division by a scalar on large arrays. (#12023) Summary: The gpu_unary_kernel function was not handling arrays that cannot use 32-bit indexing. This functions was only called directly by CUDA division by a scalar. Other arithmetic operations go through gpu_binary_kernel, which already properly handled large arrays. This bug sometimes manifested as a crash and sometimes as an incorrect answer. 
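As a rough, self-contained illustration of the underlying constraint (a simplified analogy, not the TensorIterator code itself): a kernel whose offset arithmetic is done in 32-bit integers can only cover up to INT32_MAX elements per launch, so anything larger has to be split into pieces and the same kernel launched once per piece. The `can_use_32bit_indexing()` / `with_32bit_indexing()` guard added below does this recursively, splitting by dimension rather than linearly as in this sketch.

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

int main() {
  // More elements than a 32-bit signed offset can address.
  const int64_t numel = (int64_t{1} << 31) + 3;
  const int64_t chunk = std::numeric_limits<int32_t>::max();

  // (start, length) of each hypothetical sub-launch that fits 32-bit indexing.
  std::vector<std::pair<int64_t, int64_t>> launches;
  for (int64_t start = 0; start < numel; start += chunk) {
    launches.emplace_back(start, std::min(chunk, numel - start));
  }
  std::cout << "sub-launches needed: " << launches.size() << "\n";  // prints 2
  return 0;
}
```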
Fixes #11788 Pull Request resolved: https://github.com/pytorch/pytorch/pull/12023 Differential Revision: D10034017 Pulled By: colesbury fbshipit-source-id: b17300f327de54035746bf02f576766007c9b144 --- aten/src/ATen/native/cuda/Loops.cuh | 14 ++++++++++++++ test/test_cuda.py | 24 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index b3435bd0f6bfb8..6b5a0e59d08ab9 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -63,6 +63,13 @@ template void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { ASSERT_HOST_DEVICE_LAMBDA(func_t); + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_nullary_kernel(sub_iter, f); + } + return; + } + char* out_data = (char*)iter.data_ptr(0); using traits = function_traits; @@ -93,6 +100,13 @@ template void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { ASSERT_HOST_DEVICE_LAMBDA(func_t); + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_unary_kernel(sub_iter, f); + } + return; + } + char* out_data = (char*)iter.data_ptr(0); const char* in1_data = (char*)iter.data_ptr(1); diff --git a/test/test_cuda.py b/test/test_cuda.py index 8f59afc0891806..cdf8d46ce236cf 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -30,9 +30,11 @@ TestCase = object # noqa: F811 TEST_MAGMA = TEST_CUDA +TEST_LARGE_TENSOR = TEST_CUDA if TEST_CUDA: torch.ones(1).cuda() # has_magma shows up after cuda is initialized TEST_MAGMA = torch.cuda.has_magma + TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 9e9 floating_set = {torch.FloatTensor, torch.DoubleTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor, torch.HalfTensor, torch.cuda.HalfTensor} @@ -935,6 +937,28 @@ def test_type_conversions_same_gpu(self): def test_neg(self): TestTorch._test_neg(self, lambda t: t.cuda()) + @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") + def test_arithmetic_large_tensor(self): + x = torch.empty(2**30, device='cuda') + + x.fill_(1) + self.assertEqual(x.sum(), 2**30) + + x += 1 + self.assertEqual(x.sum(), 2**31) + + x.fill_(1) + x -= 0.5 + self.assertEqual(x.sum(), 2**29) + + x.fill_(1) + x *= 2 + self.assertEqual(x.sum(), 2**31) + + x.fill_(1) + x /= 2 + self.assertEqual(x.sum(), 2**29) + def _test_broadcast(self, input): if not TEST_MULTIGPU: raise unittest.SkipTest("only one GPU detected") From ceadde2a7f59b32196cf84b9242c0504ed178d4d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 13:12:48 -0700 Subject: [PATCH 32/51] Add some more locations to search for nccl. (#12063) Summary: Users generally expect ./configure to find libraries installed in /usr/local and /usr, so search for nccl there too. Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/12063 Differential Revision: D10036248 Pulled By: ezyang fbshipit-source-id: d331ddd2ccc8ac9846fb54222db284b1ec371659 --- tools/setup_helpers/nccl.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/setup_helpers/nccl.py b/tools/setup_helpers/nccl.py index 703446520870aa..c1cc88657ebf69 100644 --- a/tools/setup_helpers/nccl.py +++ b/tools/setup_helpers/nccl.py @@ -33,9 +33,11 @@ os.path.join(ENV_ROOT, 'lib64') if ENV_ROOT is not None else None, os.path.join(CUDA_HOME, 'lib'), os.path.join(CUDA_HOME, 'lib64'), + '/usr/local/lib', '/usr/lib/x86_64-linux-gnu/', '/usr/lib/powerpc64le-linux-gnu/', '/usr/lib/aarch64-linux-gnu/', + '/usr/lib', ] + gather_paths([ 'LIBRARY_PATH', ]) + gather_paths([ @@ -45,7 +47,9 @@ INCLUDE_DIR, ENV_ROOT, os.path.join(ENV_ROOT, 'include') if ENV_ROOT is not None else None, - '/usr/include' + os.path.join(CUDA_HOME, 'include'), + '/usr/local/include', + '/usr/include', ])) if IS_CONDA: From aa1adde80ba9aca4f4ef3113fae7f0ac0b99af72 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 13:48:07 -0700 Subject: [PATCH 33/51] Refactor fastGet/fastSet for clarity, removing a null pointer check. (#11902) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11902 Previously, they were going through THTensor_getStoragePtr which incurred a null pointer check on storage. Now they use unsafe_data method which doesn't do this check. I don't know if this actually make things go faster, but I get an added bonus of reducing code duplication, so we should take this change anyway :) Reviewed By: SsnL Differential Revision: D9977654 fbshipit-source-id: f45c74828213a0439480755ad0b2d7f8858cb327 --- aten/src/TH/generic/THTensorFastGetSet.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp index f3c202fd4234b6..f430839565471a 100644 --- a/aten/src/TH/generic/THTensorFastGetSet.hpp +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -3,47 +3,47 @@ #else static inline scalar_t THTensor_(fastGetLegacy1dNoScalars)(THTensor *self, int64_t x0) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*THTensor_strideLegacyNoScalars(self, 0)]; + return self->unsafe_data()[x0*THTensor_strideLegacyNoScalars(self, 0)]; } static inline scalar_t THTensor_(fastGet1d)(THTensor *self, int64_t x0) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)]; + return self->unsafe_data()[x0*self->stride(0)]; } static inline scalar_t THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)]; } static inline scalar_t THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)]; } static inline scalar_t THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { - return 
(THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)]; } static inline scalar_t THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)]; } static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)] = value; + self->unsafe_data()[x0*self->stride(0)] = value; } static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)] = value; } static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)] = value; } static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)] = value; } static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)] = value; } #endif From e53e8df20bbeac85496221ba9294cdd94b11de41 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 14:10:39 -0700 Subject: [PATCH 34/51] Support TypeIdentifier::name() (#12036) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12036 Sometimes you have a TypeIdentifier, and no way to get to the TypeMeta. Still nice to be able to read out the name. This should be obsoleted by smessmer's patches. 
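A short usage sketch of what this enables (the include path and the `caffe2::` namespace are assumptions about the tree layout at this point, not taken from the diff below): given only a `TypeIdentifier`, its human-readable name can now be printed without first recovering the full `TypeMeta`.

```
// Usage sketch only; header path and namespace are assumptions, not part of
// the diff below. Builds against the patched tree.
#include <iostream>
#include "ATen/core/typeid.h"

void report_blob_type(caffe2::TypeIdentifier id) {
  // name() resolves the identifier through the registered gTypeNames() table.
  std::cerr << "blob holds a " << id.name() << std::endl;
}
```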
Reviewed By: gchanan, mingzhe09088 Differential Revision: D10024554 fbshipit-source-id: 42cdceefd5c59be0441254665f66f5edc829f422 --- aten/src/ATen/core/typeid.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index 2ed81cb1e1c8a7..c7066863682f21 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -61,6 +61,8 @@ class AT_CORE_API TypeIdentifier final : public at::IdWrapper& gTypeNames(); AT_CORE_API std::unordered_set& gRegisteredTypeNames(); +inline const char* TypeIdentifier::name() const noexcept { + auto it = gTypeNames().find(*this); + assert(it != gTypeNames().end()); + return it->second.c_str(); +} AT_CORE_API std::mutex& gTypeRegistrationMutex(); From 1e2829448791c4d1aa3ad3632b30a5b94b953dcc Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 14:11:36 -0700 Subject: [PATCH 35/51] Delete some unused variables. (#12059) Summary: Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/12059 Differential Revision: D10034632 Pulled By: ezyang fbshipit-source-id: ff33da0d93734856b8e8bcfe744cefe127fffb91 --- aten/src/ATen/native/cuda/Activation.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 1bce68730f0d01..505054b8d431ce 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -72,11 +72,10 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; + int64_t input_stride0 = 1, input_stride1 = 1; if (input_ndim > 1) { channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); input_stride0 = strides[0]; input_stride1 = strides[1]; } @@ -189,11 +188,10 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; + int64_t input_stride0 = 1, input_stride1 = 1; if (input_ndim > 1) { channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); input_stride0 = strides[0]; input_stride1 = strides[1]; } From b7b9e3c7e8cf869bcc4bb5aa73a149d2e1f92f9d Mon Sep 17 00:00:00 2001 From: Brian Gesiak Date: Tue, 25 Sep 2018 16:25:19 -0700 Subject: [PATCH 36/51] Fix "identifier following the 'template' keyword does not refer to a template" (#12037) Summary: LLVM trunk emits an error diagnostic when attempting to compile caffe2. The identifiers following the `template` keywords are not templates, so the use of the keyword does not make sense in this context. 
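For readers unfamiliar with the rule involved, here is a minimal, self-contained example of when the `template` disambiguator is legal and when it triggers exactly this diagnostic (illustrative only; the class names are made up and do not come from the patch):

```
// The `template` keyword after `.` or `->` is only for dependent member
// *templates*; putting it before a plain member function is ill-formed,
// which newer clang now rejects with this error.
struct FakeTensor {
  template <typename T>
  T* mutable_data() { return nullptr; }  // member template
  void* raw_data() { return nullptr; }   // ordinary member
};

template <typename TensorLike>
void use(TensorLike& t) {
  auto* typed = t.template mutable_data<float>();  // OK: dependent object, member template
  auto* raw   = t.raw_data();                      // the corrected form used in this patch
  // auto* bad = t.template raw_data();            // error: identifier after 'template'
  //                                               // does not refer to a template
  (void)typed;
  (void)raw;
}

int main() {
  FakeTensor t;
  use(t);
  return 0;
}
```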
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12037 Reviewed By: ezyang Differential Revision: D10024531 Pulled By: modocache fbshipit-source-id: da4b9ba405d9f7fd633ab8c1a61c77da9c1a1f89 --- caffe2/contrib/gloo/allgather_ops.h | 2 +- caffe2/contrib/gloo/allreduce_ops.h | 4 ++-- caffe2/contrib/gloo/broadcast_ops.h | 4 ++-- caffe2/contrib/gloo/reduce_scatter_ops.h | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/caffe2/contrib/gloo/allgather_ops.h b/caffe2/contrib/gloo/allgather_ops.h index 1f55233a095c8c..f97a00f8956eeb 100644 --- a/caffe2/contrib/gloo/allgather_ops.h +++ b/caffe2/contrib/gloo/allgather_ops.h @@ -114,7 +114,7 @@ class AllgatherOp final : public Operator { params.size = Input(1).size(); params.meta = Input(1).meta(); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); + params.inputs[i] = Input(i + 1).raw_data(); } params.outputs.resize(OutputSize()); params.outputs[0] = Output(0)->raw_mutable_data(params.meta); diff --git a/caffe2/contrib/gloo/allreduce_ops.h b/caffe2/contrib/gloo/allreduce_ops.h index 85d10c313085ff..f3b1bd3560b3d4 100644 --- a/caffe2/contrib/gloo/allreduce_ops.h +++ b/caffe2/contrib/gloo/allreduce_ops.h @@ -117,8 +117,8 @@ class AllreduceOp final : public Operator { params.inputs.resize(InputSize() - 1); params.outputs.resize(OutputSize()); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); - params.outputs[i] = Output(i)->template raw_mutable_data(); + params.inputs[i] = Input(i + 1).raw_data(); + params.outputs[i] = Output(i)->raw_mutable_data(); } params.size = Output(0)->size(); params.meta = Output(0)->meta(); diff --git a/caffe2/contrib/gloo/broadcast_ops.h b/caffe2/contrib/gloo/broadcast_ops.h index e525b8e158f4c5..171dbbd8c97a1f 100644 --- a/caffe2/contrib/gloo/broadcast_ops.h +++ b/caffe2/contrib/gloo/broadcast_ops.h @@ -95,8 +95,8 @@ class BroadcastOp final : public Operator { params.inputs.resize(InputSize() - 1); params.outputs.resize(OutputSize()); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); - params.outputs[i] = Output(i)->template raw_mutable_data(); + params.inputs[i] = Input(i + 1).raw_data(); + params.outputs[i] = Output(i)->raw_mutable_data(); } params.size = Output(0)->size(); params.meta = Output(0)->meta(); diff --git a/caffe2/contrib/gloo/reduce_scatter_ops.h b/caffe2/contrib/gloo/reduce_scatter_ops.h index 069c5238694939..559b35618a1083 100644 --- a/caffe2/contrib/gloo/reduce_scatter_ops.h +++ b/caffe2/contrib/gloo/reduce_scatter_ops.h @@ -108,15 +108,15 @@ class ReduceScatterOp final : public Operator { params.inputs.resize(InputSize() - 2); params.outputs.resize(OutputSize() - 1); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); - params.outputs[i] = Output(i)->template raw_mutable_data(); + params.inputs[i] = Input(i + 1).raw_data(); + params.outputs[i] = Output(i)->raw_mutable_data(); } params.size = Output(0)->size(); params.meta = Output(0)->meta(); // Verify recvCountsSize == comm_size CAFFE_ENFORCE_EQ(Input(InputSize() - 1).size(), params.context->size); - int* recvCounts = (int*)Input(InputSize() - 1).template raw_data(); + int* recvCounts = (int*)Input(InputSize() - 1).raw_data(); recvCounts_.assign(recvCounts, recvCounts + Input(InputSize() - 1).size()); } From 658386a63f52de5bad764ec8d64fae47d54c431d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 25 Sep 
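As a rough sketch of the guard pattern being introduced (a simplified stand-in, not the real `at::TensorImpl`; the patch uses `AT_ASSERT` rather than plain `assert`): non-virtual accessors that read impl-local fields now fail loudly when invoked on a Variable wrapper instead of silently returning meaningless data.

```
#include <cassert>

// Simplified stand-in for TensorImpl, only to show the shape of the check.
struct TensorImplSketch {
  virtual ~TensorImplSketch() = default;
  bool is_variable() const { return is_variable_; }

  // Non-virtual, so a Variable subclass cannot forward it to the wrapped
  // tensor; the assert makes calling it on a Variable a hard failure.
  void* data() const {
    assert(!is_variable() && "call data() on the wrapped tensor, not the Variable");
    return data_;
  }

 protected:
  bool is_variable_ = false;
  void* data_ = nullptr;
};

int main() {
  TensorImplSketch impl;  // not a Variable, so the guard passes
  return impl.data() == nullptr ? 0 : 1;
}
```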
2018 16:55:16 -0700 Subject: [PATCH 37/51] Make USE_IDEEP work again (#12026) Summary: This PR establish a baseline so that we can build IDEEP ops in the new work flow. From this baseline, we need to - Merge the CMakefile of MKLDNN from caffe2 and Pytorch - Get rid of `USE_MKL=ON`. Build command from now on: ``` EXTRA_CAFFE2_CMAKE_FLAGS="-DUSE_MKL=ON -DINTEL_COMPILER_DIR=/opt/IntelComposerXE/2017.0.098" python setup.py build_deps ``` gujinghui Pull Request resolved: https://github.com/pytorch/pytorch/pull/12026 Differential Revision: D10041199 Pulled By: yinghai fbshipit-source-id: b7310bd84a494ac899d8e25da368b63feed4eeaf --- cmake/Modules/FindMKL.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index b296e5f2e47aed..441a8e20cf068f 100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -314,15 +314,15 @@ if (USE_MKL AND USE_IDEEP) set(IDEEP_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep") set(MKLDNN_ROOT "${IDEEP_ROOT}/mkl-dnn") find_path(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include) - find_path(MKLDNN_INCLUDE_DIR mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) - if (NOT MKLDNN_INCLUDE_DIR) + find_path(MKLDNN_INCLUDE_DIR_HACK mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + if (NOT MKLDNN_INCLUDE_DIR_HACK) execute_process(COMMAND git submodule update --init mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT}) - find_path(MKLDNN_INCLUDE_DIR mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + find_path(MKLDNN_INCLUDE_DIR_HACK mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) endif() - if (MKLDNN_INCLUDE_DIR) - list(APPEND IDEEP_INCLUDE_DIR ${MKLDNN_INCLUDE_DIR}) - list(APPEND __ideep_looked_for MKLDNN_INCLUDE_DIR) + if (MKLDNN_INCLUDE_DIR_HACK) + list(APPEND IDEEP_INCLUDE_DIR ${MKLDNN_INCLUDE_DIR_HACK}) + list(APPEND __ideep_looked_for MKLDNN_INCLUDE_DIR_HACK) # to avoid adding conflicting submodels set(ORIG_WITH_TEST ${WITH_TEST}) set(WITH_TEST OFF) @@ -379,7 +379,7 @@ if (USE_MKL AND USE_IDEEP) endif() caffe_clear_vars(__ideep_looked_for __mklml_inner_libs) - endif() # MKLDNN_INCLUDE_DIR + endif() # MKLDNN_INCLUDE_DIR_HACK endif() # USE_IDEEP # Do nothing if MKL_FOUND was set before! From 90bcf41291ca62aa040e7f1437b18e648aefef32 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 25 Sep 2018 17:21:58 -0700 Subject: [PATCH 38/51] Add safety asserts for methods on TensorImpl which don't work on Variable. (#12058) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12058 Methods on TensorImpl have to be written very carefully, because when you have a VariableImpl subclass of TensorImpl, usually the local fields on the TensorImpl are not valid; instead, you have to forward to the "wrapped" tensor. Functions which are virtualized are probably handled correctly by Variable, but functions which are NOT cannot be handled correctly and shouldn't be called if you have a Variable. This diff add checks to determine if this is the case or not. 
Reviewed By: jerryzh168 Differential Revision: D10034589 fbshipit-source-id: 650b2036ca9a044c0ab4abdf6f825521a64e1fc2 --- aten/src/ATen/core/TensorImpl.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 1e7ee932f63ddc..27232e2a3a8e97 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -69,9 +69,11 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // numbers. Otherwise, they behave like their non-wrapped equivalents. // See [Result type computation] in TensorIterator.h. bool is_wrapped_number() const { + AT_ASSERT(!is_variable()); return is_wrapped_number_; } void set_wrapped_number(bool value) { + AT_ASSERT(!is_variable()); AT_ASSERT(dim() == 0); is_wrapped_number_ = value; } @@ -97,10 +99,12 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template inline T * data() const { + AT_ASSERT(!is_variable()); return storage_.data() + storage_offset_; } inline void* data() const { + AT_ASSERT(!is_variable()); return static_cast( static_cast(storage_.data()) + data_type_.itemsize() * storage_offset_); @@ -108,6 +112,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template inline T * unsafe_data() const { + AT_ASSERT(!is_variable()); return storage_.unsafe_data() + storage_offset_; } @@ -155,6 +160,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // sizes/strides are in bounds for the storage that is allocated; // this is the responsibility of the caller void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { + AT_ASSERT(!is_variable()); AT_CHECK( new_size.size() == new_stride.size(), "dimensionality of sizes (", @@ -192,9 +198,11 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { protected: void refresh_numel() { + AT_ASSERT(!is_variable()); numel_ = compute_numel(); } void refresh_contiguous() { + AT_ASSERT(!is_variable()); is_contiguous_ = compute_contiguous(); } TensorTypeId type_id_; From 28dba2f928bb0d7bc14c46ac0cf4894ee00a1d32 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Tue, 25 Sep 2018 17:28:37 -0700 Subject: [PATCH 39/51] Unify all *_EXPORT and *_IMPORT macros across c++ backend (#12019) Summary: TSIA. Right now we should basically use C10_EXPORT and C10_IMPORT for explicitly marking dllexport and dllimport, as a continued effort of the C10 unification. 
This is a codemod by mechanically doing the following change: CAFFE2_{EXPORT,IMPORT} -> C10_{EXPORT,IMPORT} AT_CORE_{EXPORT,IMPORT} -> C10_{EXPORT,IMPORT} Pull Request resolved: https://github.com/pytorch/pytorch/pull/12019 Reviewed By: ezyang, teng-li Differential Revision: D10016276 Pulled By: Yangqing fbshipit-source-id: a420d62c43d1110105fc88f9e9076e28a3203164 --- aten/src/ATen/core/ATenCoreTest.h | 2 +- aten/src/ATen/core/ATenGeneral.h | 4 - aten/src/ATen/core/Allocator.h | 2 +- aten/src/ATen/core/Backtrace.h | 4 +- aten/src/ATen/core/Device.h | 4 +- aten/src/ATen/core/DeviceType.h | 4 +- aten/src/ATen/core/Error.h | 10 +- aten/src/ATen/core/Half.h | 6 +- aten/src/ATen/core/IdWrapper.h | 2 +- aten/src/ATen/core/LegacyTypeDispatch.h | 14 +- aten/src/ATen/core/SmallVector.h | 2 +- aten/src/ATen/core/TensorTypeId.h | 6 +- aten/src/ATen/core/TensorTypeIdRegistration.h | 19 +- aten/src/ATen/core/UniqueVoidPtr.h | 2 +- aten/src/ATen/core/context_base.h | 4 +- aten/src/ATen/core/intrusive_ptr.h | 12 +- aten/src/ATen/core/ivalue.h | 2 +- aten/src/ATen/core/typeid.h | 29 +- c10/CMakeLists.txt | 5 + c10/macros/Legacy.h | 13 - caffe2/contrib/nccl/cuda_nccl_gpu.cc | 2 +- caffe2/core/blob.h | 2 +- caffe2/core/common_cudnn.h | 4 +- caffe2/core/cudnn_wrappers.h | 4 +- caffe2/core/db.cc | 2 +- caffe2/core/db.h | 8 +- caffe2/core/dispatch/KernelRegistration.h | 2 +- caffe2/core/flags.cc | 41 +- caffe2/core/flags.h | 60 +-- caffe2/core/hip/common_miopen.h | 2 +- caffe2/core/hip/miopen_wrapper.h | 4 +- caffe2/core/hip/net_async_dag_hip.cc | 2 +- caffe2/core/net.h | 2 +- caffe2/core/net_async_base.h | 2 +- caffe2/core/net_async_dag_gpu.cc | 4 +- caffe2/core/net_async_dag_gpu.h | 2 +- caffe2/core/net_async_polling.h | 2 +- caffe2/core/net_async_scheduling.h | 2 +- caffe2/core/net_dag.h | 2 +- caffe2/core/net_simple.h | 2 +- caffe2/core/net_simple_async.h | 2 +- .../nomnigraph/Representations/NeuralNet.h | 20 +- caffe2/core/observer.h | 2 +- caffe2/core/operator.h | 12 +- caffe2/core/operator_schema.cc | 2 +- caffe2/core/operator_schema.h | 12 +- caffe2/core/qtensor.h | 2 +- caffe2/core/registry.h | 14 +- caffe2/core/timer.h | 2 +- caffe2/core/workspace.h | 2 +- caffe2/db/create_db_op.h | 2 +- caffe2/db/leveldb.cc | 2 +- caffe2/db/lmdb.cc | 2 +- caffe2/db/protodb.cc | 2 +- caffe2/mkl/utils/mkl_memory.h | 8 +- .../mobile/contrib/arm-compute/core/net_gl.h | 2 +- caffe2/operators/expand_squeeze_dims_op.h | 2 +- caffe2/operators/partition_ops.h | 4 +- caffe2/operators/slice_op.cu | 2 +- caffe2/operators/slice_op.h | 4 +- caffe2/opt/fusion.h | 2 +- caffe2/opt/sink.cc | 2 +- caffe2/python/pybind_state.h | 4 +- caffe2/queue/blobs_queue_db.cc | 2 +- caffe2/utils/math_cpu.cc | 428 +++++++++--------- caffe2/utils/proto_convert.cc | 8 +- caffe2/utils/proto_utils.cc | 187 ++++---- caffe2/utils/proto_utils.h | 2 +- caffe2/utils/threadpool/WorkersPool.h | 2 +- caffe2/utils/zmq_helper.h | 4 +- modules/rocksdb/rocksdb.cc | 2 +- 71 files changed, 534 insertions(+), 506 deletions(-) diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h index a6769b10b93eed..93f894ea66b971 100644 --- a/aten/src/ATen/core/ATenCoreTest.h +++ b/aten/src/ATen/core/ATenCoreTest.h @@ -4,5 +4,5 @@ namespace at { -AT_CORE_API int CoreTest(); +CAFFE2_API int CoreTest(); } diff --git a/aten/src/ATen/core/ATenGeneral.h b/aten/src/ATen/core/ATenGeneral.h index 78cfe5fe4d9f1b..cb946c93c9b965 100644 --- a/aten/src/ATen/core/ATenGeneral.h +++ b/aten/src/ATen/core/ATenGeneral.h @@ -1,7 +1,3 @@ #pragma once #include 
"ATen/core/Macros.h" - -// TODO: Merge the *_API macros. -#define AT_EXPORT AT_CORE_EXPORT -#define AT_IMPORT AT_CORE_IMPORT diff --git a/aten/src/ATen/core/Allocator.h b/aten/src/ATen/core/Allocator.h index dc520008e3bbfe..a3bae36efe4a44 100644 --- a/aten/src/ATen/core/Allocator.h +++ b/aten/src/ATen/core/Allocator.h @@ -115,7 +115,7 @@ struct Allocator { } }; -struct AT_CORE_API InefficientStdFunctionContext { +struct CAFFE2_API InefficientStdFunctionContext { std::unique_ptr> ptr_; InefficientStdFunctionContext( std::unique_ptr>&& ptr) diff --git a/aten/src/ATen/core/Backtrace.h b/aten/src/ATen/core/Backtrace.h index 9aa3ac826ce78c..7a4e9e6b1dba20 100644 --- a/aten/src/ATen/core/Backtrace.h +++ b/aten/src/ATen/core/Backtrace.h @@ -8,7 +8,7 @@ namespace at { /// Utility to demangle a C++ symbol name. -AT_CORE_API std::string demangle(const char* name); +CAFFE2_API std::string demangle(const char* name); /// Returns the printable name of the type. template @@ -21,7 +21,7 @@ inline const char* demangle_type() { #endif // __GXX_RTTI } -AT_CORE_API std::string get_backtrace( +CAFFE2_API std::string get_backtrace( size_t frames_to_skip = 0, size_t maximum_number_of_frames = 64, bool skip_python_frames = true); diff --git a/aten/src/ATen/core/Device.h b/aten/src/ATen/core/Device.h index cd3efb6734e2dd..a06d5f1e0d1667 100644 --- a/aten/src/ATen/core/Device.h +++ b/aten/src/ATen/core/Device.h @@ -21,7 +21,7 @@ namespace at { /// 1. A negative index represents the current device, a non-negative index /// represents a specific, concrete device, /// 2. When the device type is CPU, the device index must be zero. -struct AT_CORE_API Device { +struct CAFFE2_API Device { using Type = at::DeviceType; /// Constructs a new `Device` from a `DeviceType` and an optional device @@ -92,7 +92,7 @@ struct AT_CORE_API Device { int32_t index_ = -1; }; -AT_CORE_API std::ostream& operator<<( +CAFFE2_API std::ostream& operator<<( std::ostream& stream, const at::Device& device); diff --git a/aten/src/ATen/core/DeviceType.h b/aten/src/ATen/core/DeviceType.h index 870b1e5bf9e538..a4342eade903a7 100644 --- a/aten/src/ATen/core/DeviceType.h +++ b/aten/src/ATen/core/DeviceType.h @@ -26,11 +26,11 @@ enum class DeviceType : int32_t { ONLY_FOR_TEST = 20901701, // This device type is only for test. }; -AT_CORE_API std::string DeviceTypeName( +CAFFE2_API std::string DeviceTypeName( at::DeviceType d, bool lower_case = false); -AT_CORE_API std::ostream& operator<<(std::ostream& stream, at::DeviceType type); +CAFFE2_API std::ostream& operator<<(std::ostream& stream, at::DeviceType type); } // namespace at diff --git a/aten/src/ATen/core/Error.h b/aten/src/ATen/core/Error.h index de3231180f4f76..a36608256ddf0d 100644 --- a/aten/src/ATen/core/Error.h +++ b/aten/src/ATen/core/Error.h @@ -19,7 +19,7 @@ namespace at { namespace detail { // Obtains the base name from a full path. -AT_CORE_API std::string StripBasename(const std::string& full_path); +CAFFE2_API std::string StripBasename(const std::string& full_path); inline std::ostream& _str(std::ostream& ss) { return ss; @@ -56,7 +56,7 @@ inline std::string str(const char* c_str) { } /// Represents a location in source code (for debugging). 
-struct AT_CORE_API SourceLocation { +struct CAFFE2_API SourceLocation { const char* function; const char* file; uint32_t line; @@ -71,7 +71,7 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); /// /// NB: at::Error is handled specially by the default torch to suppress the /// backtrace, see torch/csrc/Exceptions.h -class AT_CORE_API Error : public std::exception { +class CAFFE2_API Error : public std::exception { std::vector msg_stack_; std::string backtrace_; @@ -128,7 +128,7 @@ class AT_CORE_API Error : public std::exception { } }; -class AT_CORE_API Warning { +class CAFFE2_API Warning { using handler_t = void (*)(const SourceLocation& source_location, const char* msg); @@ -152,7 +152,7 @@ class AT_CORE_API Warning { // A utility function to return an exception std::string by prepending its // exception type before its what() content -AT_CORE_API std::string GetExceptionString(const std::exception& e); +CAFFE2_API std::string GetExceptionString(const std::exception& e); } // namespace at diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index 47a8e8e52d2adb..ec72fe0067dcbb 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -34,8 +34,8 @@ namespace at { namespace detail { -AT_CORE_API float halfbits2float(unsigned short bits); -AT_CORE_API unsigned short float2halfbits(float value); +CAFFE2_API float halfbits2float(unsigned short bits); +CAFFE2_API unsigned short float2halfbits(float value); } // namespace detail @@ -178,7 +178,7 @@ To checked_convert(From f, const char* name) { return convert(f); } -AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); +CAFFE2_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at diff --git a/aten/src/ATen/core/IdWrapper.h b/aten/src/ATen/core/IdWrapper.h index 58632ce111db57..268fe6725356fc 100644 --- a/aten/src/ATen/core/IdWrapper.h +++ b/aten/src/ATen/core/IdWrapper.h @@ -22,7 +22,7 @@ namespace at { * for you, given the underlying type supports it. 
*/ template -class AT_CORE_API IdWrapper { +class CAFFE2_API IdWrapper { public: using underlying_type = UnderlyingType; using concrete_type = ConcreteType; diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h index 578e02e739d0d3..53cedf04e4601a 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.h +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -30,7 +30,7 @@ namespace at { -struct AT_CORE_API LegacyTypeInitInterface { +struct CAFFE2_API LegacyTypeInitInterface { virtual ~LegacyTypeInitInterface() {} virtual void initCPU() const { AT_ERROR("cannot use CPU without ATen library"); @@ -42,15 +42,15 @@ struct AT_CORE_API LegacyTypeInitInterface { AT_ERROR("cannot use complex without ATen Complex library"); } }; -struct AT_CORE_API LegacyTypeInitArgs {}; +struct CAFFE2_API LegacyTypeInitArgs {}; AT_DECLARE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs); #define REGISTER_LEGACY_TYPE_INIT(clsname) AT_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) -AT_CORE_API const LegacyTypeInitInterface& getLegacyTypeInit(); +CAFFE2_API const LegacyTypeInitInterface& getLegacyTypeInit(); struct Type; -struct AT_CORE_API LegacyTypeDeleter { +struct CAFFE2_API LegacyTypeDeleter { using TypeDeleterFun = void(Type*); TypeDeleterFun *fn_ = nullptr; LegacyTypeDeleter() {} @@ -62,8 +62,8 @@ struct AT_CORE_API LegacyTypeDeleter { } }; -class AT_CORE_API LegacyTypeDispatch { -public: +class CAFFE2_API LegacyTypeDispatch { + public: using TypeUniquePtr = std::unique_ptr; // WARNING: This function has the precondition that you have // initialized the type you want to call. This initialization @@ -150,6 +150,6 @@ class AT_CORE_API LegacyTypeDispatch { [static_cast(ScalarType::NumOptions)]; }; -AT_CORE_API LegacyTypeDispatch & globalLegacyTypeDispatch(); +CAFFE2_API LegacyTypeDispatch& globalLegacyTypeDispatch(); } // namespace at diff --git a/aten/src/ATen/core/SmallVector.h b/aten/src/ATen/core/SmallVector.h index 483144794f46e1..cd2c2f51f49607 100644 --- a/aten/src/ATen/core/SmallVector.h +++ b/aten/src/ATen/core/SmallVector.h @@ -59,7 +59,7 @@ static inline uint64_t NextPowerOf2(uint64_t A) { } // namespace detail /// This is all the non-templated stuff common to all SmallVectors. -class AT_CORE_API SmallVectorBase { +class CAFFE2_API SmallVectorBase { protected: void *BeginX, *EndX, *CapacityX; diff --git a/aten/src/ATen/core/TensorTypeId.h b/aten/src/ATen/core/TensorTypeId.h index d01437bbe9197b..ac584263c8018a 100644 --- a/aten/src/ATen/core/TensorTypeId.h +++ b/aten/src/ATen/core/TensorTypeId.h @@ -17,7 +17,7 @@ using _tensorTypeId_underlyingType = uint8_t; * Dynamic type ID of a Tensor argument. It represents something like * CPUTensor, etc. 
*/ -class AT_CORE_API TensorTypeId final +class CAFFE2_API TensorTypeId final : public at:: IdWrapper { public: @@ -32,10 +32,10 @@ class AT_CORE_API TensorTypeId final : IdWrapper(id) {} friend class TensorTypeIdCreator; - friend AT_CORE_API std::ostream& operator<<(std::ostream&, TensorTypeId); + friend CAFFE2_API std::ostream& operator<<(std::ostream&, TensorTypeId); }; -AT_CORE_API std::ostream& operator<<(std::ostream&, at::TensorTypeId); +CAFFE2_API std::ostream& operator<<(std::ostream&, at::TensorTypeId); } // namespace at diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.h b/aten/src/ATen/core/TensorTypeIdRegistration.h index a7b30932cebe85..c252a6ef6e4437 100644 --- a/aten/src/ATen/core/TensorTypeIdRegistration.h +++ b/aten/src/ATen/core/TensorTypeIdRegistration.h @@ -16,7 +16,7 @@ namespace at { -class AT_CORE_API TensorTypeIdCreator final { +class CAFFE2_API TensorTypeIdCreator final { public: TensorTypeIdCreator(); @@ -29,10 +29,10 @@ class AT_CORE_API TensorTypeIdCreator final { private: std::atomic last_id_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator); }; -class AT_CORE_API TensorTypeIdRegistry final { +class CAFFE2_API TensorTypeIdRegistry final { public: TensorTypeIdRegistry(); @@ -43,10 +43,10 @@ class AT_CORE_API TensorTypeIdRegistry final { std::unordered_set registeredTypeIds_; std::mutex mutex_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry); }; -class AT_CORE_API TensorTypeIds final { +class CAFFE2_API TensorTypeIds final { public: static TensorTypeIds& singleton(); @@ -61,14 +61,14 @@ class AT_CORE_API TensorTypeIds final { TensorTypeIdCreator creator_; TensorTypeIdRegistry registry_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIds); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIds); }; inline constexpr at::TensorTypeId TensorTypeIds::undefined() noexcept { return TensorTypeIdCreator::undefined(); } -class AT_CORE_API TensorTypeIdRegistrar final { +class CAFFE2_API TensorTypeIdRegistrar final { public: TensorTypeIdRegistrar(); ~TensorTypeIdRegistrar(); @@ -78,14 +78,15 @@ class AT_CORE_API TensorTypeIdRegistrar final { private: at::TensorTypeId id_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar); }; inline at::TensorTypeId TensorTypeIdRegistrar::id() const noexcept { return id_; } -#define AT_DECLARE_TENSOR_TYPE(TensorName) AT_CORE_API at::TensorTypeId TensorName(); +#define AT_DECLARE_TENSOR_TYPE(TensorName) \ + CAFFE2_API at::TensorTypeId TensorName(); #define AT_DEFINE_TENSOR_TYPE(TensorName) \ at::TensorTypeId TensorName() { \ diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h index a7c9d6119bfcd8..daa6cdd3735784 100644 --- a/aten/src/ATen/core/UniqueVoidPtr.h +++ b/aten/src/ATen/core/UniqueVoidPtr.h @@ -10,7 +10,7 @@ using DeleterFnPtr = void (*)(void*); namespace detail { // Does not delete anything -AT_CORE_API void deleteNothing(void*); +CAFFE2_API void deleteNothing(void*); // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but // with three major differences: diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 45b38387b46ca6..326cae5eb9691e 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -25,7 +25,7 @@ class BaseContext; functions that are invoked statically before in Tensor class, e.g. New, We will merge this with Allocator later. 
*/ -class AT_CORE_API BaseStaticContext { +class CAFFE2_API BaseStaticContext { public: virtual ~BaseStaticContext() noexcept {} @@ -55,7 +55,7 @@ class AT_CORE_API BaseStaticContext { * functions in the BaseContext class. * TODO: add docs after this is finalized. */ -class AT_CORE_API BaseContext { +class CAFFE2_API BaseContext { public: virtual ~BaseContext() noexcept {} diff --git a/aten/src/ATen/core/intrusive_ptr.h b/aten/src/ATen/core/intrusive_ptr.h index 961915555a3756..4dc3c501e94337 100644 --- a/aten/src/ATen/core/intrusive_ptr.h +++ b/aten/src/ATen/core/intrusive_ptr.h @@ -33,7 +33,7 @@ namespace c10 { // tells us if the object was allocated by us. If it wasn't, no // intrusive_ptr for you! -class AT_CORE_API intrusive_ptr_target { +class CAFFE2_API intrusive_ptr_target { // Note [Weak references for intrusive refcounting] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Here's the scheme: @@ -114,7 +114,7 @@ class AT_CORE_API intrusive_ptr_target { namespace detail { template -struct AT_CORE_EXPORT intrusive_target_default_null_type final { +struct C10_EXPORT intrusive_target_default_null_type final { static constexpr TTarget* singleton() noexcept { return nullptr; } @@ -136,7 +136,7 @@ class weak_intrusive_ptr; template < class TTarget, class NullType = detail::intrusive_target_default_null_type> -class AT_CORE_EXPORT intrusive_ptr final { +class C10_EXPORT intrusive_ptr final { private: static_assert( std::is_base_of::value, @@ -391,7 +391,7 @@ inline bool operator!=( template < typename TTarget, class NullType = detail::intrusive_target_default_null_type> -class AT_CORE_EXPORT weak_intrusive_ptr final { +class C10_EXPORT weak_intrusive_ptr final { private: static_assert( std::is_base_of::value, @@ -739,13 +739,13 @@ namespace std { // To allow intrusive_ptr and weak_intrusive_ptr inside std::unordered_map or // std::unordered_set, we need std::hash template -struct AT_CORE_EXPORT hash> { +struct C10_EXPORT hash> { size_t operator()(const c10::intrusive_ptr& x) const { return std::hash()(x.get()); } }; template -struct AT_CORE_EXPORT hash> { +struct C10_EXPORT hash> { size_t operator()(const c10::weak_intrusive_ptr& x) const { return std::hash()(x._unsafe_get_target()); } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index ef88c8c746093c..5064f5114e7df9 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -34,7 +34,7 @@ struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { // non-mutable list template -struct AT_CORE_EXPORT ConstantList final : c10::intrusive_ptr_target { +struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { private: const std::vector elements_; public: diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index c7066863682f21..9055746ea377dd 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -47,7 +47,8 @@ class TypeMeta; * use TypeIdentifier with custom types. This is for example used to store the * dtype of tensors. 
*/ -class AT_CORE_API TypeIdentifier final : public at::IdWrapper { +class CAFFE2_API TypeIdentifier final + : public at::IdWrapper { public: static TypeIdentifier createTypeId(); @@ -90,8 +91,8 @@ AT_DEFINE_HASH_FOR_IDWRAPPER(caffe2::TypeIdentifier) namespace caffe2 { -AT_CORE_API std::unordered_map& gTypeNames(); -AT_CORE_API std::unordered_set& gRegisteredTypeNames(); +CAFFE2_API std::unordered_map& gTypeNames(); +CAFFE2_API std::unordered_set& gRegisteredTypeNames(); inline const char* TypeIdentifier::name() const noexcept { auto it = gTypeNames().find(*this); @@ -99,7 +100,7 @@ inline const char* TypeIdentifier::name() const noexcept { return it->second.c_str(); } -AT_CORE_API std::mutex& gTypeRegistrationMutex(); +CAFFE2_API std::mutex& gTypeRegistrationMutex(); template struct TypeNameRegisterer { @@ -146,7 +147,7 @@ struct TypeNameRegisterer { * stores some additional data such as the item size and the name of the type * for run-time inspection. */ -class AT_CORE_API TypeMeta { +class CAFFE2_API TypeMeta { public: using PlacementNew = void(void*, size_t); using TypedCopy = void(const void*, void*, size_t); @@ -247,7 +248,7 @@ class AT_CORE_API TypeMeta { * is generated during run-time. Do NOT serialize the id for storage. */ template - AT_CORE_API static TypeIdentifier Id(); + CAFFE2_API static TypeIdentifier Id(); /** * Returns the item size of the type. This is equivalent to sizeof(T). @@ -403,20 +404,16 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * * NOTE: the macro needs to be invoked in ::caffe2 namespace */ -// Implementation note: in MSVC, we will need to prepend the AT_CORE_API +// Implementation note: in MSVC, we will need to prepend the CAFFE2_API // keyword in order to get things compiled properly. in Linux, gcc seems to // create attribute ignored error for explicit template instantiations, see // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51930 // and as a result, we define these two macros slightly differently. -// TODO(jiayq): AT_CORE_API below is not correct, because we may use the -// definition in third party dependent libraries. The proper way is to use -// CAFFE2_EXPORT (which explicitly requires dllexport). Marking this as a -// todo item when the unified build is finished. #ifdef _MSC_VER #define CAFFE_KNOWN_TYPE(T) \ template <> \ - AT_CORE_EXPORT TypeIdentifier TypeMeta::Id() { \ + C10_EXPORT TypeIdentifier TypeMeta::Id() { \ static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ static TypeNameRegisterer registerer(type_id, #T); \ return type_id; \ @@ -438,10 +435,10 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * for your own types to allocate dynamic ids for them. */ #ifdef _MSC_VER -#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ - template <> \ - inline AT_CORE_API TypeIdentifier TypeMeta::Id() { \ - return TypeIdentifier(PreallocatedId); \ +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline CAFFE2_API TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ } #else // _MSC_VER #define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 9d9c714e5b0377..4b7bab4f42eeb9 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -20,6 +20,11 @@ file(GLOB_RECURSE C10_HEADERS *.h) add_library(c10 ${C10_SRCS} ${C10_HEADERS}) # If building shared library, set dllimport/dllexport proper. 
target_compile_options(c10 PRIVATE "-DC10_BUILD_MAIN_LIB") +# Enable hidden visibility if compiler supports it. +if (${COMPILER_SUPPORTS_HIDDEN_VISIBILITY}) + target_compile_options(c10 PRIVATE "-fvisibility=hidden") +endif() + target_include_directories( c10 PUBLIC $ diff --git a/c10/macros/Legacy.h b/c10/macros/Legacy.h index eb17bdb7940dc5..86752a838acd32 100644 --- a/c10/macros/Legacy.h +++ b/c10/macros/Legacy.h @@ -5,16 +5,3 @@ */ #pragma once - -// Note: this is for caffe2/*. Will need to codemod to use direct C10. -#define CAFFE2_EXPORT C10_EXPORT -#define CAFFE2_IMPORT C10_IMPORT - -// Note: this is for aten/src/*. Will need to codemod. -#define AT_CORE_API CAFFE2_API -#define AT_CORE_EXPORT C10_EXPORT -#define AT_CORE_IMPORT C10_IMPORT - -// Note: this is for both aten and c2, due to cross reference between c2 and -// aten that we try to unentangle. Will need to codemod. -#define AT_DISABLE_COPY_AND_ASSIGN C10_DISABLE_COPY_AND_ASSIGN diff --git a/caffe2/contrib/nccl/cuda_nccl_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_gpu.cc index b544445a26873c..490a69b91abf53 100644 --- a/caffe2/contrib/nccl/cuda_nccl_gpu.cc +++ b/caffe2/contrib/nccl/cuda_nccl_gpu.cc @@ -72,7 +72,7 @@ class NCCLContext { cudaEvent_t master_event_; std::vector events_; - AT_DISABLE_COPY_AND_ASSIGN(NCCLContext); + C10_DISABLE_COPY_AND_ASSIGN(NCCLContext); }; // We share the contexts across multiple operators, hence the diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index 80470cea443331..def0f1b859e823 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -219,7 +219,7 @@ class CAFFE2_API Blob final { void* pointer_ = nullptr; DestroyCall* destroy_ = nullptr; - AT_DISABLE_COPY_AND_ASSIGN(Blob); + C10_DISABLE_COPY_AND_ASSIGN(Blob); }; inline void swap(Blob& lhs, Blob& rhs) { diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h index 5332026eedb0ca..c0961c4c6411a9 100644 --- a/caffe2/core/common_cudnn.h +++ b/caffe2/core/common_cudnn.h @@ -258,7 +258,7 @@ class cudnnTensorDescWrapper { cudnnTensorFormat_t format_; cudnnDataType_t type_; vector dims_; - AT_DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper); + C10_DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper); }; class cudnnFilterDescWrapper { @@ -312,7 +312,7 @@ class cudnnFilterDescWrapper { StorageOrder order_; cudnnDataType_t type_; vector dims_; - AT_DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper); + C10_DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper); }; diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index b518914e50402d..1bd39fa62a399f 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -89,7 +89,7 @@ class CuDNNState { cudaStream_t stream_{nullptr}; CuDNNWorkspace workspace_; size_t gpu_id_{0}; - AT_DISABLE_COPY_AND_ASSIGN(CuDNNState); + C10_DISABLE_COPY_AND_ASSIGN(CuDNNState); }; /** @@ -153,7 +153,7 @@ class CuDNNWrapper { CAFFE2_COMPILE_TIME_MAX_GPUS>; static PerGPUCuDNNStates& cudnn_states(); - AT_DISABLE_COPY_AND_ASSIGN(CuDNNWrapper); + C10_DISABLE_COPY_AND_ASSIGN(CuDNNWrapper); }; }; // namespace caffe2 diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc index 386787b51c353a..720c2dcaa46de1 100644 --- a/caffe2/core/db.cc +++ b/caffe2/core/db.cc @@ -119,7 +119,7 @@ class MiniDBTransaction : public Transaction { FILE* file_; std::lock_guard lock_; - AT_DISABLE_COPY_AND_ASSIGN(MiniDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(MiniDBTransaction); }; class MiniDB : public DB { diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 06b74d11bd5851..39f8b6f3f02b0d 100644 --- 
a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -52,7 +52,7 @@ class CAFFE2_API Cursor { */ virtual bool Valid() = 0; - AT_DISABLE_COPY_AND_ASSIGN(Cursor); + C10_DISABLE_COPY_AND_ASSIGN(Cursor); }; /** @@ -71,7 +71,7 @@ class CAFFE2_API Transaction { */ virtual void Commit() = 0; - AT_DISABLE_COPY_AND_ASSIGN(Transaction); + C10_DISABLE_COPY_AND_ASSIGN(Transaction); }; /** @@ -99,7 +99,7 @@ class CAFFE2_API DB { protected: Mode mode_; - AT_DISABLE_COPY_AND_ASSIGN(DB); + C10_DISABLE_COPY_AND_ASSIGN(DB); }; // Database classes are registered by their names so we can do optional @@ -285,7 +285,7 @@ class CAFFE2_API DBReader { uint32_t num_shards_; uint32_t shard_id_; - AT_DISABLE_COPY_AND_ASSIGN(DBReader); + C10_DISABLE_COPY_AND_ASSIGN(DBReader); }; class CAFFE2_API DBReaderSerializer : public BlobSerializerBase { diff --git a/caffe2/core/dispatch/KernelRegistration.h b/caffe2/core/dispatch/KernelRegistration.h index 9ebc20b7ab0a6e..619cef616222bf 100644 --- a/caffe2/core/dispatch/KernelRegistration.h +++ b/caffe2/core/dispatch/KernelRegistration.h @@ -57,7 +57,7 @@ class KernelRegistrar final { const typename Schema::dispatch::dispatch_key_type dispatch_key_; bool owns_registration_; - AT_DISABLE_COPY_AND_ASSIGN(KernelRegistrar); + C10_DISABLE_COPY_AND_ASSIGN(KernelRegistrar); }; /** diff --git a/caffe2/core/flags.cc b/caffe2/core/flags.cc index e7c19efde21b3a..a84d298466dc03 100644 --- a/caffe2/core/flags.cc +++ b/caffe2/core/flags.cc @@ -9,7 +9,7 @@ namespace caffe2 { #ifdef CAFFE2_USE_GFLAGS -CAFFE2_EXPORT void SetUsageMessage(const string& str) { +C10_EXPORT void SetUsageMessage(const string& str) { if (UsageMessage() != nullptr) { // Usage message has already been set, so we will simply return. return; @@ -17,16 +17,16 @@ CAFFE2_EXPORT void SetUsageMessage(const string& str) { gflags::SetUsageMessage(str); } -CAFFE2_EXPORT const char* UsageMessage() { +C10_EXPORT const char* UsageMessage() { return gflags::ProgramUsage(); } -CAFFE2_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { +C10_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { if (*pargc == 0) return true; return gflags::ParseCommandLineFlags(pargc, pargv, true); } -CAFFE2_EXPORT bool CommandLineFlagsHasBeenParsed() { +C10_EXPORT bool CommandLineFlagsHasBeenParsed() { // There is no way we query gflags right now, so we will simply return true. 
return true; } @@ -48,11 +48,14 @@ std::stringstream& GlobalInitStream() { static string gUsageMessage = "(Usage message not set.)"; } +C10_EXPORT void SetUsageMessage(const string& str) { + gUsageMessage = str; +} +C10_EXPORT const char* UsageMessage() { + return gUsageMessage.c_str(); +} -CAFFE2_EXPORT void SetUsageMessage(const string& str) { gUsageMessage = str; } -CAFFE2_EXPORT const char* UsageMessage() { return gUsageMessage.c_str(); } - -CAFFE2_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { +C10_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { if (*pargc == 0) return true; char** argv = *pargv; bool success = true; @@ -136,18 +139,22 @@ CAFFE2_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { return success; } -CAFFE2_EXPORT bool CommandLineFlagsHasBeenParsed() { +C10_EXPORT bool CommandLineFlagsHasBeenParsed() { return gCommandLineFlagsParsed; } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, string* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + string* value) { *value = content; return true; } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + int* value) { try { *value = std::atoi(content.c_str()); return true; @@ -159,7 +166,9 @@ CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int* valu } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int64_t* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + int64_t* value) { try { static_assert(sizeof(long long) == sizeof(int64_t), ""); #ifdef __ANDROID__ @@ -177,7 +186,9 @@ CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int64 } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, double* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + double* value) { try { *value = std::atof(content.c_str()); return true; @@ -190,7 +201,9 @@ CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, double } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, bool* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + bool* value) { if (content == "false" || content == "False" || content == "FALSE" || content == "0") { *value = false; diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h index 2226b66af56fd8..4e39c7bdebf137 100644 --- a/caffe2/core/flags.h +++ b/caffe2/core/flags.h @@ -79,14 +79,14 @@ namespace gflags = google; // (3) Gflags has a design issue that does not properly expose the global flags, // if one builds the library with -fvisibility=hidden. The current gflags (as of // Aug 2018) only deals with the Windows case using dllexport, and not the Linux -// counterparts. As a result, we will explciitly use CAFFE2_EXPORT to export the +// counterparts. As a result, we will explciitly use C10_EXPORT to export the // flags defined in Caffe2. This is done via a global reference, so the flag // itself is not duplicated - under the hood it is the same global gflags flag. 
-#define CAFFE2_GFLAGS_DEF_WRAPPER( \ - type, real_type, name, default_value, help_str) \ - DEFINE_##type(name, default_value, help_str); \ - namespace caffe2 { \ - CAFFE2_EXPORT real_type& FLAGS_##name = ::FLAGS_##name; \ +#define CAFFE2_GFLAGS_DEF_WRAPPER( \ + type, real_type, name, default_value, help_str) \ + DEFINE_##type(name, default_value, help_str); \ + namespace caffe2 { \ + C10_EXPORT real_type& FLAGS_##name = ::FLAGS_##name; \ } #define CAFFE2_DEFINE_int(name, default_value, help_str) \ @@ -102,11 +102,11 @@ namespace gflags = google; string, ::fLS::clstring, name, default_value, help_str) // DECLARE_typed_var should be used in header files and in the global namespace. -#define CAFFE2_GFLAGS_DECLARE_WRAPPER(type, real_type, name) \ - DECLARE_##type(name); \ - namespace caffe2 { \ - CAFFE2_IMPORT extern real_type& FLAGS_##name; \ - } // namespace caffe2 +#define CAFFE2_GFLAGS_DECLARE_WRAPPER(type, real_type, name) \ + DECLARE_##type(name); \ + namespace caffe2 { \ + C10_IMPORT extern real_type& FLAGS_##name; \ + } // namespace caffe2 #define CAFFE2_DECLARE_int(name) \ CAFFE2_GFLAGS_DECLARE_WRAPPER(int32, gflags::int32, name) @@ -150,22 +150,22 @@ CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); // write the CAFFE2_DEFINE_* and CAFFE2_DECLARE_* macros outside any namespace // as well. -#define CAFFE2_DEFINE_typed_var(type, name, default_value, help_str) \ - namespace caffe2 { \ - CAFFE2_EXPORT type FLAGS_##name = default_value; \ - namespace { \ - class Caffe2FlagParser_##name : public Caffe2FlagParser { \ - public: \ - explicit Caffe2FlagParser_##name(const string& content) { \ - success_ = Caffe2FlagParser::Parse(content, &FLAGS_##name); \ - } \ - }; \ - } \ - RegistererCaffe2FlagsRegistry g_Caffe2FlagsRegistry_##name( \ - #name, \ - Caffe2FlagsRegistry(), \ - RegistererCaffe2FlagsRegistry::DefaultCreator, \ - "(" #type ", default " #default_value ") " help_str); \ +#define CAFFE2_DEFINE_typed_var(type, name, default_value, help_str) \ + namespace caffe2 { \ + C10_EXPORT type FLAGS_##name = default_value; \ + namespace { \ + class Caffe2FlagParser_##name : public Caffe2FlagParser { \ + public: \ + explicit Caffe2FlagParser_##name(const string& content) { \ + success_ = Caffe2FlagParser::Parse(content, &FLAGS_##name); \ + } \ + }; \ + } \ + RegistererCaffe2FlagsRegistry g_Caffe2FlagsRegistry_##name( \ + #name, \ + Caffe2FlagsRegistry(), \ + RegistererCaffe2FlagsRegistry::DefaultCreator, \ + "(" #type ", default " #default_value ") " help_str); \ } #define CAFFE2_DEFINE_int(name, default_value, help_str) \ @@ -180,9 +180,9 @@ CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); CAFFE2_DEFINE_typed_var(string, name, default_value, help_str) // DECLARE_typed_var should be used in header files and in the global namespace. 
-#define CAFFE2_DECLARE_typed_var(type, name) \ - namespace caffe2 { \ - CAFFE2_IMPORT extern type FLAGS_##name; \ +#define CAFFE2_DECLARE_typed_var(type, name) \ + namespace caffe2 { \ + C10_IMPORT extern type FLAGS_##name; \ } // namespace caffe2 #define CAFFE2_DECLARE_int(name) CAFFE2_DECLARE_typed_var(int, name) diff --git a/caffe2/core/hip/common_miopen.h b/caffe2/core/hip/common_miopen.h index 59fa0f429f8ac6..ecdf376e474904 100644 --- a/caffe2/core/hip/common_miopen.h +++ b/caffe2/core/hip/common_miopen.h @@ -164,7 +164,7 @@ class miopenTensorDescWrapper miopenTensorDescriptor_t desc_; miopenDataType_t type_; vector dims_; - AT_DISABLE_COPY_AND_ASSIGN(miopenTensorDescWrapper); + C10_DISABLE_COPY_AND_ASSIGN(miopenTensorDescWrapper); }; } // namespace caffe2 diff --git a/caffe2/core/hip/miopen_wrapper.h b/caffe2/core/hip/miopen_wrapper.h index 910db8b79d7885..328c7522258d93 100644 --- a/caffe2/core/hip/miopen_wrapper.h +++ b/caffe2/core/hip/miopen_wrapper.h @@ -92,7 +92,7 @@ class MIOPENState hipStream_t stream_{nullptr}; MIOPENWorkspace workspace_; size_t gpu_id_{0}; - AT_DISABLE_COPY_AND_ASSIGN(MIOPENState); + C10_DISABLE_COPY_AND_ASSIGN(MIOPENState); }; /** @@ -157,7 +157,7 @@ class MIOPENWrapper CAFFE2_COMPILE_TIME_MAX_HIP_GPUS>; static PerGPUMIOPENStates& miopen_states(); - AT_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper); + C10_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper); }; }; // namespace caffe2 diff --git a/caffe2/core/hip/net_async_dag_hip.cc b/caffe2/core/hip/net_async_dag_hip.cc index fa35b2a8c21617..faac5b119f576a 100644 --- a/caffe2/core/hip/net_async_dag_hip.cc +++ b/caffe2/core/hip/net_async_dag_hip.cc @@ -58,7 +58,7 @@ class ProfiledRange ProfiledRange(const OperatorDef& def, Color color) {} private: - AT_DISABLE_COPY_AND_ASSIGN(ProfiledRange); + C10_DISABLE_COPY_AND_ASSIGN(ProfiledRange); }; } // namespace diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 962363ad0270e7..57fd53f1de4f12 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -124,7 +124,7 @@ class CAFFE2_API NetBase : public Observable { string name_; vector events_; std::shared_ptr net_def_; - AT_DISABLE_COPY_AND_ASSIGN(NetBase); + C10_DISABLE_COPY_AND_ASSIGN(NetBase); }; class CAFFE2_API ExecutorHelper { diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 7edec76c439a9e..502233e7f045b4 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -125,7 +125,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { bool use_per_net_pools_; bool is_blocking_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncNetBase); + C10_DISABLE_COPY_AND_ASSIGN(AsyncNetBase); private: void storeExceptionPtr(); diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 225337d1452b91..550a760826edd8 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -71,7 +71,7 @@ class ProfiledRange { private: nvtxRangeId_t range_ = 0; - AT_DISABLE_COPY_AND_ASSIGN(ProfiledRange); + C10_DISABLE_COPY_AND_ASSIGN(ProfiledRange); }; #else @@ -81,7 +81,7 @@ class ProfiledRange { ProfiledRange(const OperatorDef& def, Color color) {} private: - AT_DISABLE_COPY_AND_ASSIGN(ProfiledRange); + C10_DISABLE_COPY_AND_ASSIGN(ProfiledRange); }; #endif // ifdef CAFFE2_USE_NVTX diff --git a/caffe2/core/net_async_dag_gpu.h b/caffe2/core/net_async_dag_gpu.h index 62ae301e4cbf29..845e5160d27b97 100644 --- a/caffe2/core/net_async_dag_gpu.h +++ b/caffe2/core/net_async_dag_gpu.h @@ -32,7 +32,7 @@ class AsyncDAGNet : public DAGNetBase { int stream(const DeviceOption& 
device_option); static thread_local std::vector stream_counters_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncDAGNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncDAGNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_async_polling.h b/caffe2/core/net_async_polling.h index 8b3d6db8d695e7..9c4a284f0d13a1 100644 --- a/caffe2/core/net_async_polling.h +++ b/caffe2/core/net_async_polling.h @@ -40,7 +40,7 @@ class AsyncPollingNet : public AsyncNetBase { void reset() override; std::atomic has_chain_failed_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncPollingNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncPollingNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_async_scheduling.h b/caffe2/core/net_async_scheduling.h index 8576fca1bb07f7..4fcdf4b7316818 100644 --- a/caffe2/core/net_async_scheduling.h +++ b/caffe2/core/net_async_scheduling.h @@ -30,7 +30,7 @@ class CAFFE2_API AsyncSchedulingNet : public AsyncNetBase { std::atomic processed_tasks_num_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncSchedulingNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncSchedulingNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_dag.h b/caffe2/core/net_dag.h index 078fa63a4238aa..ab3ce0f6f3fa10 100644 --- a/caffe2/core/net_dag.h +++ b/caffe2/core/net_dag.h @@ -84,7 +84,7 @@ class CAFFE2_API DAGNetBase : public NetBase { mutable std::vector stats_; std::unordered_map> task_timers_; - AT_DISABLE_COPY_AND_ASSIGN(DAGNetBase); + C10_DISABLE_COPY_AND_ASSIGN(DAGNetBase); }; class CAFFE2_API DAGNet : public DAGNetBase { diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index a8ac751dbb5edb..c114fd8d224f21 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -48,7 +48,7 @@ class CAFFE2_API SimpleNet : public NetBase { vector> operators_; - AT_DISABLE_COPY_AND_ASSIGN(SimpleNet); + C10_DISABLE_COPY_AND_ASSIGN(SimpleNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_simple_async.h b/caffe2/core/net_simple_async.h index 38c3255bf4df33..ea5aae959870f6 100644 --- a/caffe2/core/net_simple_async.h +++ b/caffe2/core/net_simple_async.h @@ -43,7 +43,7 @@ class AsyncSimpleNet : public NetBase { vector> operators_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncSimpleNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncSimpleNet); }; } // namespace caffe2 diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 2a03e428619b30..e7a889980365c5 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -259,7 +259,7 @@ template using enable_if_t = typename std::enable_if::type; template -struct CAFFE2_EXPORT inheritedFrom { +struct C10_EXPORT inheritedFrom { static constexpr bool value = std::is_base_of::value && !std::is_same::value; }; @@ -267,14 +267,15 @@ struct CAFFE2_EXPORT inheritedFrom { // This is just a way to fix issues when the isa<> implementation // can't automatically downcast. 
template -struct CAFFE2_EXPORT is_impl { +struct C10_EXPORT is_impl { inline static bool impl(N n) { return isa(n->data()); } }; template -struct CAFFE2_EXPORT is_impl::value>> { +struct C10_EXPORT + is_impl::value>> { inline static bool impl(N n) { if (!isa(n->data().get())) { return false; @@ -285,7 +286,8 @@ struct CAFFE2_EXPORT is_impl -struct CAFFE2_EXPORT is_impl::value>> { +struct C10_EXPORT + is_impl::value>> { inline static bool impl(N n) { if (!isa(n->data().get())) { return false; @@ -303,14 +305,15 @@ inline bool is(N n) { // This is just a way to fix issues when the dyn_cast<> implementation // can't automatically downcast. template -struct CAFFE2_EXPORT get_impl { +struct C10_EXPORT get_impl { inline static T* impl(N n) { return dyn_cast(n->data().get()); } }; template -struct CAFFE2_EXPORT get_impl::value>> { +struct C10_EXPORT + get_impl::value>> { inline static T* impl(N n) { if (!is(n)) { assert(0 && "Cannot get type from node"); @@ -322,7 +325,8 @@ struct CAFFE2_EXPORT get_impl -struct CAFFE2_EXPORT get_impl::value>> { +struct C10_EXPORT + get_impl::value>> { inline static T* impl(N n) { if (!is(n)) { assert(0 && "Cannot get type from node"); @@ -422,7 +426,7 @@ CAFFE2_API std::vector getOutputs(NNGraph::NodeRef n); CAFFE2_API void coalesceInsertedDataDependencies(repr::NNModule* m); template -struct CAFFE2_EXPORT NodeHelper {}; +struct C10_EXPORT NodeHelper {}; struct NNNodeMatchCriteria { std::function predicate; diff --git a/caffe2/core/observer.h b/caffe2/core/observer.h index e10ab0bb7eac69..378a7569d37bbe 100644 --- a/caffe2/core/observer.h +++ b/caffe2/core/observer.h @@ -51,7 +51,7 @@ class Observable { virtual ~Observable() = default; - AT_DISABLE_COPY_AND_ASSIGN(Observable); + C10_DISABLE_COPY_AND_ASSIGN(Observable); using Observer = ObserverBase; diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index f5683d1497377e..1a968c4c3755fe 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -397,7 +397,7 @@ class CAFFE2_API OperatorBase : public Observable { // An event used by asynchronous execution. std::unique_ptr event_; - AT_DISABLE_COPY_AND_ASSIGN(OperatorBase); + C10_DISABLE_COPY_AND_ASSIGN(OperatorBase); }; // If your operator does not need any specialized contructor or destructor, @@ -825,7 +825,7 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \ CAFFE_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CPU_OPERATOR(name, ...) \ - CAFFE2_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();\ + C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CPU##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ @@ -844,7 +844,7 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \ CAFFE_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR(name, ...) \ - CAFFE2_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CUDA##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ @@ -869,10 +869,10 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_HIP_OPERATOR_CREATOR(key, ...) \ CAFFE_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_HIP_OPERATOR(name, ...) 
\ - CAFFE2_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_HIP##name() { \ - CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - } \ + CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + } \ CAFFE_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_STR(str_name, ...) \ CAFFE_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc index a76a0df9bd004b..3082810b85cde9 100644 --- a/caffe2/core/operator_schema.cc +++ b/caffe2/core/operator_schema.cc @@ -415,7 +415,7 @@ std::vector OpSchema::SupplyDenseFillers( return fillers; } -CAFFE2_EXPORT std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { +C10_EXPORT std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { if (!schema.args().empty()) { out << "Arguments:" << std::endl; for (const auto& arg : schema.args()) { diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index e0b6495647ebd1..54a6a17b8a0d24 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -576,16 +576,16 @@ OpSchema::Cost PointwiseCostInference( #ifndef CAFFE2_NO_OPERATOR_SCHEMA -#define OPERATOR_SCHEMA(name) \ - CAFFE2_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ +#define OPERATOR_SCHEMA(name) \ + C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ + static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #else // CAFFE2_NO_OPERATOR_SCHEMA -#define OPERATOR_SCHEMA(name) \ - CAFFE2_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ +#define OPERATOR_SCHEMA(name) \ + C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ + static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ 1 ? nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #endif // CAFFE2_NO_OPERATOR_SCHEMA diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index f277ffdbdd0a67..385ebf1d5f9f8f 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -14,7 +14,7 @@ namespace caffe2 { template -class CAFFE2_EXPORT QTensor { +class C10_EXPORT QTensor { public: QTensor() {} virtual ~QTensor() {} diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index 7db975077ea8b9..f026795b23c3e1 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -100,7 +100,7 @@ class Registry { CaffeMap help_message_; std::mutex register_mutex_; - AT_DISABLE_COPY_AND_ASSIGN(Registry); + C10_DISABLE_COPY_AND_ASSIGN(Registry); }; template @@ -142,16 +142,16 @@ class Registerer { * declaration, as well as creating a convenient typename for its corresponding * registerer. */ -#define CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - CAFFE2_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, ##__VA_ARGS__> \ +#define CAFFE_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_EXPORT Registry, ##__VA_ARGS__>* \ + RegistryName(); \ + typedef Registerer, ##__VA_ARGS__> \ Registerer##RegistryName; #define CAFFE_DEFINE_TYPED_REGISTRY( \ RegistryName, SrcType, ObjectType, PtrType, ...) 
\ - CAFFE2_EXPORT Registry, ##__VA_ARGS__>* \ + C10_EXPORT Registry, ##__VA_ARGS__>* \ RegistryName() { \ static Registry, ##__VA_ARGS__>* registry = \ new Registry, ##__VA_ARGS__>(); \ diff --git a/caffe2/core/timer.h b/caffe2/core/timer.h index a290ffc4aadc1b..a0384b0dbdbd02 100644 --- a/caffe2/core/timer.h +++ b/caffe2/core/timer.h @@ -41,7 +41,7 @@ class Timer { protected: std::chrono::time_point start_time_; - AT_DISABLE_COPY_AND_ASSIGN(Timer); + C10_DISABLE_COPY_AND_ASSIGN(Timer); }; } diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index cbc58f742c2398..2ad486c328f56d 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -328,7 +328,7 @@ class CAFFE2_API Workspace { std::mutex thread_pool_creation_mutex_; std::shared_ptr bookkeeper_; - AT_DISABLE_COPY_AND_ASSIGN(Workspace); + C10_DISABLE_COPY_AND_ASSIGN(Workspace); }; } // namespace caffe2 diff --git a/caffe2/db/create_db_op.h b/caffe2/db/create_db_op.h index ac7c137cea9aa8..6a964f86d1b439 100644 --- a/caffe2/db/create_db_op.h +++ b/caffe2/db/create_db_op.h @@ -34,7 +34,7 @@ class CreateDBOp final : public Operator { string db_name_; uint32_t num_shards_; uint32_t shard_id_; - AT_DISABLE_COPY_AND_ASSIGN(CreateDBOp); + C10_DISABLE_COPY_AND_ASSIGN(CreateDBOp); }; } // namespace caffe2 diff --git a/caffe2/db/leveldb.cc b/caffe2/db/leveldb.cc index 23a188027ece7d..fe2665f3a6f0e8 100644 --- a/caffe2/db/leveldb.cc +++ b/caffe2/db/leveldb.cc @@ -51,7 +51,7 @@ class LevelDBTransaction : public Transaction { leveldb::DB* db_; std::unique_ptr batch_; - AT_DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); }; class LevelDB : public DB { diff --git a/caffe2/db/lmdb.cc b/caffe2/db/lmdb.cc index 2eb65bb7aa7386..a2eee9910655aa 100644 --- a/caffe2/db/lmdb.cc +++ b/caffe2/db/lmdb.cc @@ -114,7 +114,7 @@ class LMDBTransaction final : public Transaction { MDB_dbi mdb_dbi_; MDB_txn* mdb_txn_; - AT_DISABLE_COPY_AND_ASSIGN(LMDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(LMDBTransaction); }; class LMDB : public DB { diff --git a/caffe2/db/protodb.cc b/caffe2/db/protodb.cc index 2473ad23b6c45d..fdaaaf57f17162 100644 --- a/caffe2/db/protodb.cc +++ b/caffe2/db/protodb.cc @@ -60,7 +60,7 @@ class ProtoDBTransaction : public Transaction { TensorProtos* proto_; std::unordered_set existing_names_; - AT_DISABLE_COPY_AND_ASSIGN(ProtoDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(ProtoDBTransaction); }; class ProtoDB : public DB { diff --git a/caffe2/mkl/utils/mkl_memory.h b/caffe2/mkl/utils/mkl_memory.h index bd0ad40422079e..736d8ede8cf53d 100644 --- a/caffe2/mkl/utils/mkl_memory.h +++ b/caffe2/mkl/utils/mkl_memory.h @@ -58,7 +58,7 @@ class PrimitiveWrapper { private: dnnPrimitive_t primitive_ = 0; - AT_DISABLE_COPY_AND_ASSIGN(PrimitiveWrapper); + C10_DISABLE_COPY_AND_ASSIGN(PrimitiveWrapper); }; template @@ -138,7 +138,7 @@ class LayoutWrapper { private: dnnLayout_t layout_ = 0; - AT_DISABLE_COPY_AND_ASSIGN(LayoutWrapper); + C10_DISABLE_COPY_AND_ASSIGN(LayoutWrapper); }; /** @@ -557,7 +557,7 @@ class MKLMemory { // The primitive to use to convert from internal layout to user layout PrimitiveWrapper convert_out_; - AT_DISABLE_COPY_AND_ASSIGN(MKLMemory); + C10_DISABLE_COPY_AND_ASSIGN(MKLMemory); }; template @@ -575,7 +575,7 @@ class MKLWorkspace { private: void* buffer_; - AT_DISABLE_COPY_AND_ASSIGN(MKLWorkspace); + C10_DISABLE_COPY_AND_ASSIGN(MKLWorkspace); }; } // namespace mkl diff --git a/caffe2/mobile/contrib/arm-compute/core/net_gl.h b/caffe2/mobile/contrib/arm-compute/core/net_gl.h 
index 029d888b1ebf94..1dc93dedc3fff3 100644 --- a/caffe2/mobile/contrib/arm-compute/core/net_gl.h +++ b/caffe2/mobile/contrib/arm-compute/core/net_gl.h @@ -57,7 +57,7 @@ class GLNet : public NetBase { vector> operators_; - AT_DISABLE_COPY_AND_ASSIGN(GLNet); + C10_DISABLE_COPY_AND_ASSIGN(GLNet); }; } // namespace caffe2 diff --git a/caffe2/operators/expand_squeeze_dims_op.h b/caffe2/operators/expand_squeeze_dims_op.h index 505b1ec7d69090..37a3b5716127de 100644 --- a/caffe2/operators/expand_squeeze_dims_op.h +++ b/caffe2/operators/expand_squeeze_dims_op.h @@ -112,7 +112,7 @@ class SqueezeOp : public Operator { vector dims_; public: - AT_DISABLE_COPY_AND_ASSIGN(SqueezeOp); + C10_DISABLE_COPY_AND_ASSIGN(SqueezeOp); }; } // namespace caffe2 #endif // CAFFE2_OPERATORS_EXPAND_SQUEEZE_DIMS_OP_H_ diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h index 94bd1e6150cef4..32f31f97d878c3 100644 --- a/caffe2/operators/partition_ops.h +++ b/caffe2/operators/partition_ops.h @@ -221,7 +221,7 @@ class PartitionOp : public PartitionOpBase { return true; } - AT_DISABLE_COPY_AND_ASSIGN(PartitionOp); + C10_DISABLE_COPY_AND_ASSIGN(PartitionOp); }; class LengthsPartitionOp : public PartitionOpBase { @@ -302,7 +302,7 @@ class LengthsPartitionOp : public PartitionOpBase { return true; } - AT_DISABLE_COPY_AND_ASSIGN(LengthsPartitionOp); + C10_DISABLE_COPY_AND_ASSIGN(LengthsPartitionOp); vector out_length_; }; diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index 475d8329c92493..8ddb204ebd5b42 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -302,7 +302,7 @@ class SliceGradientOp : public Operator { ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} - AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + C10_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); bool RunOnDevice() override { if (InputSize() == 4) { diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index e7f8919bb81c87..6149f077669d76 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -249,7 +249,7 @@ class SliceOp : public Operator { output, data, starts_host_, ends_host_, &context_); } - AT_DISABLE_COPY_AND_ASSIGN(SliceOp); + C10_DISABLE_COPY_AND_ASSIGN(SliceOp); protected: std::vector starts_; @@ -269,7 +269,7 @@ class SliceGradientOp : public Operator { ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} - AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + C10_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); bool RunOnDevice() override { if (InputSize() == 4) { diff --git a/caffe2/opt/fusion.h b/caffe2/opt/fusion.h index 33dc2e4c54b1a0..0973ade54b383a 100644 --- a/caffe2/opt/fusion.h +++ b/caffe2/opt/fusion.h @@ -37,7 +37,7 @@ CAFFE2_API void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); // \param postprocess Functor to postprocess the conv node, // attaching additional attributes if necessary template -CAFFE2_EXPORT void fuseActivation( +C10_EXPORT void fuseActivation( repr::NNModule* nn, std::function should_fuse, std::function postprocess) { diff --git a/caffe2/opt/sink.cc b/caffe2/opt/sink.cc index c4d73d7abb12dd..ed4cd8a3725375 100644 --- a/caffe2/opt/sink.cc +++ b/caffe2/opt/sink.cc @@ -8,7 +8,7 @@ namespace opt { using namespace nom; -CAFFE2_EXPORT void sinkMaxPool(nom::repr::NNModule* nn) { +C10_EXPORT void sinkMaxPool(nom::repr::NNModule* nn) { for (auto max_pool_node : repr::nn::nodeIterator(nn->dataFlow)) { if (repr::nn::getInputs(max_pool_node).size() != 1) { diff --git 
a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index dd5d3b9bc18ef9..4f81569e429369 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -43,7 +43,7 @@ void addObjectMethods(pybind11::module& m); // Get current workspace Workspace* GetCurrentWorkspace(); -class CAFFE2_EXPORT BlobFetcherBase { +class C10_EXPORT BlobFetcherBase { public: struct FetchedBlob { pybind11::object obj; @@ -60,7 +60,7 @@ class BlobFeederBase { Feed(const DeviceOption& option, PyArrayObject* array, Blob* blob) = 0; }; -CAFFE2_EXPORT CAFFE_DECLARE_TYPED_REGISTRY( +C10_EXPORT CAFFE_DECLARE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, diff --git a/caffe2/queue/blobs_queue_db.cc b/caffe2/queue/blobs_queue_db.cc index 06a6985848ce26..bd7795c94ad2ec 100644 --- a/caffe2/queue/blobs_queue_db.cc +++ b/caffe2/queue/blobs_queue_db.cc @@ -32,7 +32,7 @@ class CreateBlobsQueueDBOp : public Operator { } private: - AT_DISABLE_COPY_AND_ASSIGN(CreateBlobsQueueDBOp); + C10_DISABLE_COPY_AND_ASSIGN(CreateBlobsQueueDBOp); }; REGISTER_CPU_OPERATOR(CreateBlobsQueueDB, CreateBlobsQueueDBOp); diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 18e20e4fa41413..e770bcfd9afae5 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -76,7 +76,7 @@ namespace math { // (transpose) if the argument TransA or TransB is set to CblasNoTrans or // CblasTrans, respectively, for each of A and B. template <> -CAFFE2_EXPORT void Gemm( +C10_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -134,7 +134,7 @@ CAFFE2_EXPORT void Gemm( } template <> -CAFFE2_EXPORT void GemmEx( +C10_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -206,7 +206,7 @@ CAFFE2_EXPORT void GemmEx( } template <> -CAFFE2_EXPORT void Gemv( +C10_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -245,7 +245,7 @@ CAFFE2_EXPORT void Gemv( #define CAFFE2_SPECIALIZED_DOT(T) \ template <> \ - CAFFE2_EXPORT void Dot( \ + C10_EXPORT void Dot( \ const int N, const T* a, const T* b, T* y, CPUContext* context) { \ *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ } @@ -254,12 +254,12 @@ CAFFE2_SPECIALIZED_DOT(float) #define CAFFE2_SPECIALIZED_AXPY(T) \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \ EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * alpha; \ } \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \ EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * (*alpha); \ } @@ -268,7 +268,7 @@ CAFFE2_SPECIALIZED_AXPY(float) #define CAFFE2_SPECIALIZED_AXPBY(T) \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T alpha, \ const T* x, \ @@ -279,7 +279,7 @@ CAFFE2_SPECIALIZED_AXPY(float) y_arr = y_arr * beta + ConstEigenVectorArrayMap(x, N) * alpha; \ } \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T* alpha, \ const T* x, \ @@ -295,7 +295,7 @@ CAFFE2_SPECIALIZED_AXPBY(float) #else // CAFFE2_USE_EIGEN_FOR_BLAS template <> -CAFFE2_EXPORT void Gemm( +C10_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -328,7 +328,7 @@ CAFFE2_EXPORT void Gemm( } template <> -CAFFE2_EXPORT void GemmEx( +C10_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const 
CBLAS_TRANSPOSE trans_B, const int M, @@ -361,7 +361,7 @@ CAFFE2_EXPORT void GemmEx( } template <> -CAFFE2_EXPORT void Gemv( +C10_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -377,7 +377,7 @@ CAFFE2_EXPORT void Gemv( #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha alpha, \ const TData* x, \ @@ -391,7 +391,7 @@ CAFFE2_EXPORT void Gemv( } \ } \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha* alpha, \ const TData* x, \ @@ -411,7 +411,7 @@ CAFFE2_SPECIALIZED_SCALE(float, double, d) #define CAFFE2_SPECIALIZED_DOT(T, prefix) \ template <> \ - CAFFE2_EXPORT void Dot( \ + C10_EXPORT void Dot( \ const int N, const T* a, const T* b, T* y, CPUContext*) { \ *y = cblas_##prefix##dot(N, a, 1, b, 1); \ } @@ -420,12 +420,12 @@ CAFFE2_SPECIALIZED_DOT(float, s) #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T alpha, const T* x, T* y, CPUContext*) { \ cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ } \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T* alpha, const T* x, T* y, CPUContext*) { \ cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ } @@ -437,7 +437,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) #ifdef CAFFE2_USE_MKL #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T alpha, \ const T* x, \ @@ -447,7 +447,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ } \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T* alpha, \ const T* x, \ @@ -459,7 +459,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) #else // CAFFE2_USE_MKL #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T alpha, \ const T* x, \ @@ -470,7 +470,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ } \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T* alpha, \ const T* x, \ @@ -488,7 +488,7 @@ CAFFE2_SPECIALIZED_AXPBY(float, s) #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha alpha, \ const TData* x, \ @@ -498,7 +498,7 @@ CAFFE2_SPECIALIZED_AXPBY(float, s) ConstEigenVectorMap(x, n) * static_cast(alpha); \ } \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha* alpha, \ const TData* x, \ @@ -517,7 +517,7 @@ CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t) #undef CAFFE2_SPECIALIZED_SCALE template <> -CAFFE2_EXPORT void GemmBatched( +C10_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -563,7 +563,7 @@ CAFFE2_EXPORT void GemmBatched( } template <> -CAFFE2_EXPORT void GemmStridedBatched( +C10_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -632,10 +632,11 @@ CAFFE2_EXPORT void GemmStridedBatched( //////////////////////////////////////////////////////////////////////////////// #ifdef CAFFE2_USE_MKL -#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) 
\ - template <> \ - CAFFE2_EXPORT void Funcname(const int N, const T* x, T* y, CPUContext*) { \ - OriginalFunc(N, x, y, ##__VA_ARGS__); \ +#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \ + template <> \ + C10_EXPORT void Funcname( \ + const int N, const T* x, T* y, CPUContext*) { \ + OriginalFunc(N, x, y, ##__VA_ARGS__); \ } DELEGATE_SIMPLE_UNARY_FUNCTION( float, @@ -683,7 +684,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, vdInv) #define DELEGATE_SINCOS_FUNCTION(T, OriginalFunc) \ template <> \ - CAFFE2_EXPORT void SinCos( \ + C10_EXPORT void SinCos( \ const int N, const T* a, T* ys, T* yc, CPUContext*) { \ OriginalFunc(N, a, ys, yc); \ } @@ -691,10 +692,11 @@ DELEGATE_SINCOS_FUNCTION(float, vsSinCos) DELEGATE_SINCOS_FUNCTION(double, vdSinCos) #undef DELEGATE_SINCOS_FUNCTION -#define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \ - template <> \ - CAFFE2_EXPORT void Powx(const int N, const T* a, T b, T* y, CPUContext*) { \ - OriginalFunc(N, a, b, y); \ +#define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \ + template <> \ + C10_EXPORT void Powx( \ + const int N, const T* a, T b, T* y, CPUContext*) { \ + OriginalFunc(N, a, b, y); \ } DELEGATE_POWX_FUNCTION(float, vsPowx) DELEGATE_POWX_FUNCTION(double, vdPowx) @@ -702,7 +704,7 @@ DELEGATE_POWX_FUNCTION(double, vdPowx) #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, FuncImpl) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int N, const T* A, const T* B, T* C, CPUContext*) { \ FuncImpl(N, A, B, C); \ } @@ -718,10 +720,11 @@ DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) #else // CAFFE2_USE_MKL -#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ - template <> \ - CAFFE2_EXPORT void Funcname(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).expr(); \ +#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ + template <> \ + C10_EXPORT void Funcname( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).expr(); \ } DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp) @@ -750,7 +753,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, rsqrt) #define DELEGATE_SINCOS_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void SinCos( \ + C10_EXPORT void SinCos( \ const int N, const T* x, T* ys, T* yc, CPUContext*) { \ EigenVectorMap(ys, N) = ConstEigenVectorArrayMap(x, N).sin(); \ EigenVectorMap(yc, N) = ConstEigenVectorArrayMap(x, N).cos(); \ @@ -761,7 +764,8 @@ DELEGATE_SINCOS_FUNCTION(double) #define DELEGATE_TANH_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void Tanh(const int N, const T* X, T* Y, CPUContext*) { \ + C10_EXPORT void Tanh( \ + const int N, const T* X, T* Y, CPUContext*) { \ EigenVectorMap(Y, N) = T(1) - \ ((ConstEigenVectorArrayMap(X, N) * T(2)).exp() + T(1)).inverse() * \ T(2); \ @@ -770,10 +774,11 @@ DELEGATE_TANH_FUNCTION(float) DELEGATE_TANH_FUNCTION(double) #undef DELEGATE_TANH_FUNCTION -#define DELEGATE_CBRT_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Cbrt(const int N, const T* X, T* Y, CPUContext*) { \ - std::transform(X, X + N, Y, [](const T x) { return cbrt(x); }); \ +#define DELEGATE_CBRT_FUNCTION(T) \ + template <> \ + C10_EXPORT void Cbrt( \ + const int N, const T* X, T* Y, CPUContext*) { \ + std::transform(X, X + N, Y, [](const T x) { return cbrt(x); }); \ } DELEGATE_CBRT_FUNCTION(float) DELEGATE_CBRT_FUNCTION(double) @@ -781,28 +786,30 @@ DELEGATE_CBRT_FUNCTION(double) #define DELEGATE_POWX_FUNCTION(T) \ 
template <> \ - CAFFE2_EXPORT void Powx( \ + C10_EXPORT void Powx( \ const int N, const T* a, const T b, T* y, CPUContext*) { \ EigenVectorMap(y, N) = ConstEigenVectorArrayMap(a, N).pow(b); \ } DELEGATE_POWX_FUNCTION(float) #undef DELEGATE_POWX_FUNCTION -#define DELEGATE_SINH_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Sinh(const int N, const T* X, T* Y, CPUContext*) { \ - ConstEigenVectorArrayMap X_arr(X, N); \ - EigenVectorMap(Y, N) = (X_arr.exp() - (-X_arr).exp()) / 2; \ +#define DELEGATE_SINH_FUNCTION(T) \ + template <> \ + C10_EXPORT void Sinh( \ + const int N, const T* X, T* Y, CPUContext*) { \ + ConstEigenVectorArrayMap X_arr(X, N); \ + EigenVectorMap(Y, N) = (X_arr.exp() - (-X_arr).exp()) / 2; \ } DELEGATE_SINH_FUNCTION(float) DELEGATE_SINH_FUNCTION(double) #undef DELEGATE_SINH_FUNCTION -#define DELEGATE_COSH_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Cosh(const int N, const T* X, T* Y, CPUContext*) { \ - ConstEigenVectorArrayMap X_arr(X, N); \ - EigenVectorMap(Y, N) = (X_arr.exp() + (-X_arr).exp()) / 2; \ +#define DELEGATE_COSH_FUNCTION(T) \ + template <> \ + C10_EXPORT void Cosh( \ + const int N, const T* X, T* Y, CPUContext*) { \ + ConstEigenVectorArrayMap X_arr(X, N); \ + EigenVectorMap(Y, N) = (X_arr.exp() + (-X_arr).exp()) / 2; \ } DELEGATE_COSH_FUNCTION(float) DELEGATE_COSH_FUNCTION(double) @@ -810,7 +817,8 @@ DELEGATE_COSH_FUNCTION(double) #define DELEGATE_INV_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void Inv(const int N, const T* x, T* y, CPUContext*) { \ + C10_EXPORT void Inv( \ + const int N, const T* x, T* y, CPUContext*) { \ EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).inverse(); \ } DELEGATE_INV_FUNCTION(float) @@ -819,10 +827,11 @@ DELEGATE_INV_FUNCTION(double) #endif // CAFFE2_USE_MKL -#define DELEGATE_NEG_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Neg(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = -ConstEigenVectorMap(x, N); \ +#define DELEGATE_NEG_FUNCTION(T) \ + template <> \ + C10_EXPORT void Neg( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = -ConstEigenVectorMap(x, N); \ } DELEGATE_NEG_FUNCTION(float) DELEGATE_NEG_FUNCTION(double) @@ -830,10 +839,11 @@ DELEGATE_NEG_FUNCTION(std::int32_t) DELEGATE_NEG_FUNCTION(std::int64_t) #undef DELEGATE_NEG_FUNCTION -#define DELEGATE_SIGN_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Sign(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).sign(); \ +#define DELEGATE_SIGN_FUNCTION(T) \ + template <> \ + C10_EXPORT void Sign( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).sign(); \ } DELEGATE_SIGN_FUNCTION(float) DELEGATE_SIGN_FUNCTION(double) @@ -841,10 +851,11 @@ DELEGATE_SIGN_FUNCTION(std::int32_t) DELEGATE_SIGN_FUNCTION(std::int64_t) #undef DELEGATE_SIGN_FUNCTION -#define DELEGATE_ABS_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Abs(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).abs(); \ +#define DELEGATE_ABS_FUNCTION(T) \ + template <> \ + C10_EXPORT void Abs( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).abs(); \ } #ifndef CAFFE2_USE_MKL DELEGATE_ABS_FUNCTION(float) @@ -854,10 +865,11 @@ DELEGATE_ABS_FUNCTION(std::int32_t) DELEGATE_ABS_FUNCTION(std::int64_t) #undef DELEGATE_ABS_FUNCTION -#define DELEGATE_CUBE_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Cube(const int N, const T* X, T* 
Y, CPUContext*) { \ - EigenVectorMap(Y, N) = ConstEigenVectorArrayMap(X, N).cube(); \ +#define DELEGATE_CUBE_FUNCTION(T) \ + template <> \ + C10_EXPORT void Cube( \ + const int N, const T* X, T* Y, CPUContext*) { \ + EigenVectorMap(Y, N) = ConstEigenVectorArrayMap(X, N).cube(); \ } DELEGATE_CUBE_FUNCTION(float) DELEGATE_CUBE_FUNCTION(double) @@ -867,7 +879,7 @@ DELEGATE_CUBE_FUNCTION(std::int64_t) #define EIGEN_SIMPLE_BINARY_FUNCTION(T, Func, expr) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int N, const T* A, const T* B, T* C, CPUContext*) { \ EigenVectorMap(C, N) = ConstEigenVectorArrayMap(A, N) \ expr ConstEigenVectorArrayMap(B, N); \ @@ -903,19 +915,20 @@ DEFINE_SIMPLE_BINARY_FUNCTION(Div, /) // Eigen or via custom code. //////////////////////////////////////////////////////////////////////////////// -#define CAFFE2_SPECIALIZED_SET(T) \ - template <> \ - CAFFE2_EXPORT void Set(const size_t N, const T alpha, T* Y, CPUContext*) { \ - if (N == 0) { \ - return; \ - } \ - if (alpha == (T)0) { \ - if (Y != nullptr) { \ - std::memset(Y, 0, N * sizeof(T)); \ - } \ - } else { \ - EigenVectorMap(Y, N).setConstant(alpha); \ - } \ +#define CAFFE2_SPECIALIZED_SET(T) \ + template <> \ + C10_EXPORT void Set( \ + const size_t N, const T alpha, T* Y, CPUContext*) { \ + if (N == 0) { \ + return; \ + } \ + if (alpha == (T)0) { \ + if (Y != nullptr) { \ + std::memset(Y, 0, N * sizeof(T)); \ + } \ + } else { \ + EigenVectorMap(Y, N).setConstant(alpha); \ + } \ } CAFFE2_SPECIALIZED_SET(float); @@ -932,7 +945,7 @@ CAFFE2_SPECIALIZED_SET(uint16_t); #define CAFFE2_SPECIALIZED_REDUCEMIN(T) \ template <> \ - CAFFE2_EXPORT void ReduceMin( \ + C10_EXPORT void ReduceMin( \ const int N, \ const T* x, \ T* y, \ @@ -945,7 +958,7 @@ CAFFE2_SPECIALIZED_REDUCEMIN(float) #define CAFFE2_SPECIALIZED_REDUCEMAX(T) \ template <> \ - CAFFE2_EXPORT void ReduceMax( \ + C10_EXPORT void ReduceMax( \ const int N, \ const T* x, \ T* y, \ @@ -991,7 +1004,7 @@ struct SquaredL2NormFunctor { #define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \ template \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, const int cols, const T alpha, const T* X, T* Y) { \ EigenVectorMap(Y, rows) = \ ConstEigenMatrixMap(X, cols, rows).colwise().EigenOp() * alpha; \ @@ -1006,7 +1019,7 @@ DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm) #define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \ template \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, const int cols, const T alpha, const T* X, T* Y) { \ EigenVectorMap(Y, cols) = \ ConstEigenMatrixMap(X, cols, rows).rowwise().EigenOp() * alpha; \ @@ -1020,7 +1033,7 @@ DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm) #undef DELEGATE_COLWISE_REDUCE_FUNCTION template -CAFFE2_EXPORT void BothEndsReduceMin( +C10_EXPORT void BothEndsReduceMin( const int pre, const int mid, const int nxt, @@ -1044,7 +1057,7 @@ CAFFE2_EXPORT void BothEndsReduceMin( } template -CAFFE2_EXPORT void BothEndsReduceMax( +C10_EXPORT void BothEndsReduceMax( const int pre, const int mid, const int nxt, @@ -1066,7 +1079,7 @@ CAFFE2_EXPORT void BothEndsReduceMax( } template -CAFFE2_EXPORT void BothEndsReduceSum( +C10_EXPORT void BothEndsReduceSum( const int pre, const int mid, const int nxt, @@ -1087,7 +1100,7 @@ CAFFE2_EXPORT void BothEndsReduceSum( } template -CAFFE2_EXPORT void BothEndsReduceMean( +C10_EXPORT void BothEndsReduceMean( const int pre, const int mid, const int nxt, @@ -1108,7 +1121,7 @@ CAFFE2_EXPORT void 
BothEndsReduceMean( } template -CAFFE2_EXPORT void BothEndsReduceL1( +C10_EXPORT void BothEndsReduceL1( const int pre, const int mid, const int nxt, @@ -1135,7 +1148,7 @@ CAFFE2_EXPORT void BothEndsReduceL1( } template -CAFFE2_EXPORT void BothEndsReduceL2( +C10_EXPORT void BothEndsReduceL2( const int pre, const int mid, const int nxt, @@ -1155,7 +1168,7 @@ CAFFE2_EXPORT void BothEndsReduceL2( } template -CAFFE2_EXPORT void ReduceTensor( +C10_EXPORT void ReduceTensor( const int ndim, const int* X_dims, const int* Y_dims, @@ -1183,7 +1196,7 @@ CAFFE2_EXPORT void ReduceTensor( #define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1325,7 +1338,7 @@ DELEGATE_REDUCE_FUNCTION( #define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \ template <> \ - CAFFE2_EXPORT void ReduceMean( \ + C10_EXPORT void ReduceMean( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1392,7 +1405,7 @@ CAFFE2_SPECIALIZED_REDUCE_MEAN(double) #define CAFFE2_SPECIALIZED_REDUCE_L2(T) \ template <> \ - CAFFE2_EXPORT void ReduceL2( \ + C10_EXPORT void ReduceL2( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1462,7 +1475,7 @@ CAFFE2_SPECIALIZED_REDUCE_L2(double) namespace { template -CAFFE2_EXPORT void BroadcastImpl( +C10_EXPORT void BroadcastImpl( const int X_ndim, const int* X_dims, const int Y_ndim, @@ -1495,7 +1508,7 @@ CAFFE2_EXPORT void BroadcastImpl( #define CAFFE2_SPECIALIZED_BROADCAST(T) \ template <> \ - CAFFE2_EXPORT void Broadcast( \ + C10_EXPORT void Broadcast( \ const int X_ndim, \ const int* X_dims, \ const int Y_ndim, \ @@ -1515,7 +1528,7 @@ CAFFE2_SPECIALIZED_BROADCAST(double) namespace { template -CAFFE2_EXPORT void RowwiseMoments( +C10_EXPORT void RowwiseMoments( const int rows, const int cols, const T* X, @@ -1529,7 +1542,7 @@ CAFFE2_EXPORT void RowwiseMoments( } template -CAFFE2_EXPORT void ColwiseMoments( +C10_EXPORT void ColwiseMoments( const int rows, const int cols, const T* X, @@ -1551,7 +1564,7 @@ CAFFE2_EXPORT void ColwiseMoments( } template -CAFFE2_EXPORT void BothEndsMoments( +C10_EXPORT void BothEndsMoments( const int pre, const int mid, const int nxt, @@ -1576,7 +1589,7 @@ CAFFE2_EXPORT void BothEndsMoments( } template -CAFFE2_EXPORT void MomentsImpl( +C10_EXPORT void MomentsImpl( const int num_dims, const int* dims, const int num_axes, @@ -1643,7 +1656,7 @@ CAFFE2_EXPORT void MomentsImpl( #define CAFFE2_SPECIALIZED_MOMENTS(T) \ template <> \ - CAFFE2_EXPORT void Moments( \ + C10_EXPORT void Moments( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1674,7 +1687,7 @@ CAFFE2_SPECIALIZED_INV_STD(float) #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \ template <> \ - CAFFE2_EXPORT void RowwiseMax( \ + C10_EXPORT void RowwiseMax( \ const int N, const int D, const T* x, T* y, CPUContext*) { \ EigenVectorMap(y, N) = \ ConstEigenMatrixMap(x, D, N).colwise().maxCoeff(); \ @@ -1684,7 +1697,7 @@ CAFFE2_SPECIALIZED_ROWWISEMAX(float) #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \ template <> \ - CAFFE2_EXPORT void ColwiseMax( \ + C10_EXPORT void ColwiseMax( \ const int N, const int D, const T* x, T* y, CPUContext*) { \ EigenVectorMap(y, D) = \ ConstEigenMatrixMap(x, D, N).rowwise().maxCoeff(); \ @@ -1694,7 +1707,7 @@ CAFFE2_SPECIALIZED_COLWISEMAX(float) #define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \ template <> \ - CAFFE2_EXPORT void ElemwiseMax( \ + C10_EXPORT void ElemwiseMax( \ const int N, const T* x, const T* y, T* z, 
CPUContext* /*context*/) { \ std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \ return std::max(x_i, y_i); \ @@ -1705,7 +1718,7 @@ CAFFE2_SPECIALIZED_ELEMWISEMAX(float) #define CAFFE2_SPECIALIZED_MAXIMUM(T) \ template <> \ - CAFFE2_EXPORT void Maximum( \ + C10_EXPORT void Maximum( \ const int N, const float alpha, const T* x, T* y, CPUContext* context) { \ std::transform( \ x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \ @@ -1718,7 +1731,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) #define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1735,7 +1748,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) } \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1755,7 +1768,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) #define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1772,7 +1785,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) } \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1808,7 +1821,7 @@ DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) #define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void RowwiseSub( \ + C10_EXPORT void RowwiseSub( \ const int rows, \ const int cols, \ const T* A, \ @@ -1820,7 +1833,7 @@ DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) ConstEigenVectorArrayMap(A, cols); \ } \ template <> \ - CAFFE2_EXPORT void ColwiseSub( \ + C10_EXPORT void ColwiseSub( \ const int rows, \ const int cols, \ const T* A, \ @@ -1842,7 +1855,7 @@ DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) #define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void RowwiseDiv( \ + C10_EXPORT void RowwiseDiv( \ const int rows, \ const int cols, \ const T* A, \ @@ -1854,7 +1867,7 @@ DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) ConstEigenVectorArrayMap(A, cols); \ } \ template <> \ - CAFFE2_EXPORT void ColwiseDiv( \ + C10_EXPORT void ColwiseDiv( \ const int rows, \ const int cols, \ const T* A, \ @@ -1878,7 +1891,7 @@ DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /) #undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION template <> -CAFFE2_EXPORT void Not( +C10_EXPORT void Not( const int N, const bool* x, bool* y, @@ -1893,7 +1906,7 @@ CAFFE2_EXPORT void Not( #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \ template <> \ - CAFFE2_EXPORT void AddStripedBatch( \ + C10_EXPORT void AddStripedBatch( \ const int N, \ const T* first, \ T* y, \ @@ -1911,7 +1924,7 @@ CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); namespace { template -CAFFE2_EXPORT void RowwiseBinaryOp( +C10_EXPORT void RowwiseBinaryOp( const int rows, const int cols, const BinaryOperator& op, @@ -1929,7 +1942,7 @@ CAFFE2_EXPORT void RowwiseBinaryOp( } template -CAFFE2_EXPORT void ColwiseBinaryOp( +C10_EXPORT void ColwiseBinaryOp( const int rows, const int cols, const BinaryOperator& op, @@ -1947,7 +1960,7 @@ CAFFE2_EXPORT void ColwiseBinaryOp( } template -CAFFE2_EXPORT void BroadcastBinaryOpImpl( +C10_EXPORT void BroadcastBinaryOpImpl( const int ndim, const int* A_dims, const int* B_dims, @@ -1971,7 +1984,7 @@ CAFFE2_EXPORT void 
BroadcastBinaryOpImpl( #define DELEGATE_1D_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int N, const TIn* A, const TIn* B, TOut* C, CPUContext*) { \ std::transform(A, A + N, B, C, Op()); \ } @@ -2011,7 +2024,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2021,7 +2034,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) RowwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ } \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2032,7 +2045,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) rows, cols, Op(), A, B, C); \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2042,7 +2055,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) ColwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2086,28 +2099,28 @@ DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION -#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void RowwiseDiv( \ - const int rows, \ - const int cols, \ - const T* A, \ - const T* B, \ - T* C, \ - CPUContext*) { \ - RowwiseBinaryOp, true>( \ - rows, cols, std::divides(), A, B, C); \ - } \ - template <> \ - CAFFE2_EXPORT void ColwiseDiv( \ - const int rows, \ - const int cols, \ - const T* A, \ - const T* B, \ - T* C, \ - CPUContext*) { \ - ColwiseBinaryOp, true>( \ - rows, cols, std::divides(), A, B, C); \ +#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ + template <> \ + C10_EXPORT void RowwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + RowwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ + } \ + template <> \ + C10_EXPORT void ColwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + ColwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ } DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t) DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) @@ -2115,7 +2128,7 @@ DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) #define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int A_ndim, \ const int* A_dims, \ const int B_ndim, \ @@ -2258,7 +2271,7 @@ DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #define CAFFE2_RAND_UNIFORM_REAL(T) \ template <> \ - CAFFE2_EXPORT void RandUniform( \ + C10_EXPORT void RandUniform( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ std::uniform_real_distribution distribution(a, b); \ for (size_t i = 0; i < n; ++i) { \ @@ -2271,7 +2284,7 @@ CAFFE2_RAND_UNIFORM_REAL(double); #define CAFFE2_RAND_UNIFORM_CHAR(T) \ template <> \ - CAFFE2_EXPORT void RandUniform( \ + C10_EXPORT void RandUniform( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ std::uniform_int_distribution distribution((short)a, (short)b); \ for 
(size_t i = 0; i < n; ++i) { \ @@ -2284,7 +2297,7 @@ CAFFE2_RAND_UNIFORM_CHAR(uint8_t); #define CAFFE2_RAND_UNIFORM_INT(T) \ template <> \ - CAFFE2_EXPORT void RandUniform( \ + C10_EXPORT void RandUniform( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ std::uniform_int_distribution distribution(a, b); \ for (size_t i = 0; i < n; ++i) { \ @@ -2310,7 +2323,7 @@ CAFFE2_RAND_UNIFORM_INT(uint64_t); // each value. #define CAFFE2_RAND_FIXED_SUM(T) \ template <> \ - CAFFE2_EXPORT void RandFixedSum( \ + C10_EXPORT void RandFixedSum( \ const size_t n, \ const T a, \ const T b, \ @@ -2404,7 +2417,7 @@ Ind_t generate_stack_distance( } template -CAFFE2_EXPORT void generate_trace_lru( +C10_EXPORT void generate_trace_lru( std::vector& uni_ref, std::vector& cum_val, std::vector& cum_dis, @@ -2481,7 +2494,7 @@ CAFFE2_EXPORT void generate_trace_lru( // case we need to know the table id, to sample from the right distribution #define CAFFE2_RAND_SYNTHETIC_DATA(T) \ template <> \ - CAFFE2_EXPORT void RandSyntheticData( \ + C10_EXPORT void RandSyntheticData( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ /* unique memory references */ \ std::vector mem_ref = {1, 2, 3, 4, 5, 6}; \ @@ -2518,32 +2531,33 @@ CAFFE2_RAND_SYNTHETIC_DATA(uint32_t); CAFFE2_RAND_SYNTHETIC_DATA(uint64_t); #undef CAFFE2_RAND_SYNTHETIC_DATA -#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ - template <> \ - CAFFE2_EXPORT void RandUniformUnique( \ - const size_t n, \ - const T a, \ - const T b, \ - T* r, \ - const size_t m, \ - const T* avoid, \ - CPUContext* context) { \ - CAFFE_ENFORCE_LE( \ - n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ - std::unordered_set avoid_set(n); \ - if (m) { \ - avoid_set.insert(avoid, avoid + m); \ - CAFFE_ENFORCE_EQ(m, avoid_set.size(), "ACAFFE2_EXPORT void should be unique"); \ - } \ - std::uniform_int_distribution distribution(a, b); \ - T v = 0; \ - for (size_t i = 0; i < n; ++i) { \ - do { \ - v = distribution(context->RandGenerator()); \ - } while (avoid_set.count(v)); \ - r[i] = v; \ - avoid_set.insert(v); \ - } \ +#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ + template <> \ + C10_EXPORT void RandUniformUnique( \ + const size_t n, \ + const T a, \ + const T b, \ + T* r, \ + const size_t m, \ + const T* avoid, \ + CPUContext* context) { \ + CAFFE_ENFORCE_LE( \ + n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ + std::unordered_set avoid_set(n); \ + if (m) { \ + avoid_set.insert(avoid, avoid + m); \ + CAFFE_ENFORCE_EQ( \ + m, avoid_set.size(), "AC10_EXPORT void should be unique"); \ + } \ + std::uniform_int_distribution distribution(a, b); \ + T v = 0; \ + for (size_t i = 0; i < n; ++i) { \ + do { \ + v = distribution(context->RandGenerator()); \ + } while (avoid_set.count(v)); \ + r[i] = v; \ + avoid_set.insert(v); \ + } \ } CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t); @@ -2551,7 +2565,7 @@ CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t); #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE template <> -CAFFE2_EXPORT void RandGaussian( +C10_EXPORT void RandGaussian( const size_t n, const float mean, const float std, @@ -2565,7 +2579,7 @@ CAFFE2_EXPORT void RandGaussian( #define CAFFE2_SPECIALIZED_SUM(T) \ template <> \ - CAFFE2_EXPORT void Sum( \ + C10_EXPORT void Sum( \ const int N, \ const T* x, \ T* y, \ @@ -2581,7 +2595,7 @@ CAFFE2_SPECIALIZED_SUM(int64_t); #undef CAFFE2_SPECIALIZED_SUM template <> -CAFFE2_EXPORT void SumSqr( +C10_EXPORT void SumSqr( const int N, const float* x, float* y, @@ -2591,7 +2605,7 @@ 
CAFFE2_EXPORT void SumSqr( } template <> -CAFFE2_EXPORT void Select( +C10_EXPORT void Select( const int N, const int D, const float* x, @@ -2605,7 +2619,7 @@ CAFFE2_EXPORT void Select( } template <> -CAFFE2_EXPORT void CopyMatrix( +C10_EXPORT void CopyMatrix( const size_t itemsize, const int M, const int N, @@ -2648,7 +2662,7 @@ CAFFE2_EXPORT void CopyMatrix( #define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2659,7 +2673,7 @@ CAFFE2_EXPORT void CopyMatrix( Func('R', 'N', M, N, T(1), A, lda, B, ldb); \ } \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2690,7 +2704,7 @@ DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy) #define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2720,7 +2734,7 @@ DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy) } \ } \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2759,7 +2773,7 @@ CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t) namespace { template -CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW( +C10_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW( const int C, const int H, const int W, @@ -2806,7 +2820,7 @@ CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW( } template -CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW( +C10_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW( const int C, const int H, const int W, @@ -2842,7 +2856,7 @@ CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW( } template -CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC( +C10_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC( const int C, const int H, const int W, @@ -2867,7 +2881,7 @@ CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC( } template -CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC( +C10_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC( const int C, const int H, const int W, @@ -2894,7 +2908,7 @@ CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC( } template -CAFFE2_EXPORT void Im2ColNdNCHWImpl( +C10_EXPORT void Im2ColNdNCHWImpl( const int N, const int img_size, const int col_size, @@ -2950,7 +2964,7 @@ CAFFE2_EXPORT void Im2ColNdNCHWImpl( } // namespace template <> -CAFFE2_EXPORT void Im2ColNd( +C10_EXPORT void Im2ColNd( const int N, const int img_size, const int col_size, @@ -2978,7 +2992,7 @@ CAFFE2_EXPORT void Im2ColNd( } template <> -CAFFE2_EXPORT void Col2ImNd( +C10_EXPORT void Col2ImNd( const int N, const int img_size, const int col_size, @@ -3006,7 +3020,7 @@ CAFFE2_EXPORT void Col2ImNd( } template <> -CAFFE2_EXPORT void Im2Col( +C10_EXPORT void Im2Col( const int C, const int H, const int W, @@ -3072,7 +3086,7 @@ CAFFE2_EXPORT void Im2Col( } template <> -CAFFE2_EXPORT void Im2Col( +C10_EXPORT void Im2Col( const int C, const int H, const int W, @@ -3172,7 +3186,7 @@ CAFFE2_EXPORT void Im2Col( } template <> -CAFFE2_EXPORT void Col2Im( +C10_EXPORT void Col2Im( const int C, const int H, const int W, @@ -3239,7 +3253,7 @@ CAFFE2_EXPORT void Col2Im( } template <> -CAFFE2_EXPORT void Col2Im( +C10_EXPORT void Col2Im( const int C, const int H, const int W, @@ -3335,7 +3349,7 @@ CAFFE2_EXPORT void Col2Im( } template <> -CAFFE2_EXPORT void BiasCHW( +C10_EXPORT void BiasCHW( const float* bias, const float* /*bias_multiplier*/, const int 
bias_channels, @@ -3420,7 +3434,7 @@ CAFFE2_EXPORT void BiasCHW( #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \ template <> \ - CAFFE2_EXPORT void CopyVector( \ + C10_EXPORT void CopyVector( \ const int N, const T* src, T* dst, CPUContext* /*context*/) { \ if (src != dst && N > 0) { \ memcpy(dst, src, sizeof(T) * N); \ @@ -3633,7 +3647,7 @@ void TransposeCPUImpl( #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ template <> \ - CAFFE2_EXPORT void Transpose( \ + C10_EXPORT void Transpose( \ const int ndim, \ const int* dims, \ const int* axes, \ diff --git a/caffe2/utils/proto_convert.cc b/caffe2/utils/proto_convert.cc index 24984203bcb810..790bd274291dcb 100644 --- a/caffe2/utils/proto_convert.cc +++ b/caffe2/utils/proto_convert.cc @@ -3,7 +3,7 @@ namespace caffe2 { -CAFFE2_EXPORT void ArgumentToAttributeProto( +C10_EXPORT void ArgumentToAttributeProto( const Argument& arg, ::torch::AttributeProto* attr) { CAFFE_ENFORCE(arg.has_name()); @@ -29,7 +29,7 @@ CAFFE2_EXPORT void ArgumentToAttributeProto( } } -CAFFE2_EXPORT void AttributeProtoToArgument( +C10_EXPORT void AttributeProtoToArgument( const ::torch::AttributeProto& attr, Argument* arg) { CAFFE_ENFORCE(attr.has_name()); @@ -94,7 +94,7 @@ CAFFE2_EXPORT void AttributeProtoToArgument( } } -CAFFE2_EXPORT void OperatorDefToNodeProto( +C10_EXPORT void OperatorDefToNodeProto( const OperatorDef& def, ::torch::NodeProto* node) { node->mutable_input()->CopyFrom(def.input()); @@ -141,7 +141,7 @@ CAFFE2_EXPORT void OperatorDefToNodeProto( } } -CAFFE2_EXPORT void NodeProtoToOperatorDef( +C10_EXPORT void NodeProtoToOperatorDef( const ::torch::NodeProto& node, OperatorDef* def) { def->mutable_input()->CopyFrom(node.input()); diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc index 1daacff3eda2ff..dc8e088eba97c5 100644 --- a/caffe2/utils/proto_utils.cc +++ b/caffe2/utils/proto_utils.cc @@ -21,11 +21,11 @@ using ::google::protobuf::MessageLite; namespace caffe2 { -CAFFE2_EXPORT std::string DeviceTypeName(const int32_t& d) { +C10_EXPORT std::string DeviceTypeName(const int32_t& d) { return at::DeviceTypeName(static_cast(d)); } -CAFFE2_EXPORT int DeviceId(const DeviceOption& option) { +C10_EXPORT int DeviceId(const DeviceOption& option) { switch (option.device_type()) { case PROTO_CPU: return option.numa_node_id(); @@ -40,7 +40,7 @@ CAFFE2_EXPORT int DeviceId(const DeviceOption& option) { } } -CAFFE2_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { +C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { return ( lhs.device_type() == rhs.device_type() && lhs.cuda_gpu_id() == rhs.cuda_gpu_id() && @@ -49,7 +49,7 @@ CAFFE2_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs lhs.numa_node_id() == rhs.numa_node_id()); } -CAFFE2_EXPORT bool ReadStringFromFile(const char* filename, string* str) { +C10_EXPORT bool ReadStringFromFile(const char* filename, string* str) { std::ifstream ifs(filename, std::ios::in); if (!ifs) { VLOG(1) << "File cannot be opened: " << filename @@ -64,7 +64,7 @@ CAFFE2_EXPORT bool ReadStringFromFile(const char* filename, string* str) { return true; } -CAFFE2_EXPORT bool WriteStringToFile(const string& str, const char* filename) { +C10_EXPORT bool WriteStringToFile(const string& str, const char* filename) { std::ofstream ofs(filename, std::ios::out | std::ios::trunc); if (!ofs.is_open()) { VLOG(1) << "File cannot be created: " << filename @@ -102,11 +102,13 @@ class IfstreamInputStream : public ::google::protobuf::io::CopyingInputStream { }; } // 
namespace -CAFFE2_EXPORT string ProtoDebugString(const MessageLite& proto) { +C10_EXPORT string ProtoDebugString(const MessageLite& proto) { return proto.SerializeAsString(); } -CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, MessageLite* proto) { +C10_EXPORT bool ParseProtoFromLargeString( + const string& str, + MessageLite* proto) { ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size()); ::google::protobuf::io::CodedInputStream coded_stream(&input_stream); // Set PlanDef message size limit to 2G. @@ -114,7 +116,9 @@ CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, MessageLite* pro return proto->ParseFromCodedStream(&coded_stream); } -CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto) { +C10_EXPORT bool ReadProtoFromBinaryFile( + const char* filename, + MessageLite* proto) { ::google::protobuf::io::CopyingInputStreamAdaptor stream( new IfstreamInputStream(filename)); stream.SetOwnsCopyingStream(true); @@ -125,7 +129,7 @@ CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* pr return proto->ParseFromCodedStream(&coded_stream); } -CAFFE2_EXPORT void WriteProtoToBinaryFile( +C10_EXPORT void WriteProtoToBinaryFile( const MessageLite& /*proto*/, const char* /*filename*/) { LOG(FATAL) << "Not implemented yet."; @@ -144,16 +148,16 @@ using ::google::protobuf::io::CodedOutputStream; using ::google::protobuf::Message; namespace TextFormat { -CAFFE2_EXPORT bool ParseFromString(const string& spec, Message* proto) { +C10_EXPORT bool ParseFromString(const string& spec, Message* proto) { return ::google::protobuf::TextFormat::ParseFromString(spec, proto); } } // namespace TextFormat -CAFFE2_EXPORT string ProtoDebugString(const Message& proto) { +C10_EXPORT string ProtoDebugString(const Message& proto) { return proto.ShortDebugString(); } -CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) { +C10_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) { ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size()); ::google::protobuf::io::CodedInputStream coded_stream(&input_stream); // Set PlanDef message size limit to 2G. 
@@ -161,7 +165,7 @@ CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) return proto->ParseFromCodedStream(&coded_stream); } -CAFFE2_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) { +C10_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) { int fd = open(filename, O_RDONLY); CAFFE_ENFORCE_NE(fd, -1, "File not found: ", filename); FileInputStream* input = new FileInputStream(fd); @@ -171,7 +175,9 @@ CAFFE2_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) { return success; } -CAFFE2_EXPORT void WriteProtoToTextFile(const Message& proto, const char* filename) { +C10_EXPORT void WriteProtoToTextFile( + const Message& proto, + const char* filename) { int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); FileOutputStream* output = new FileOutputStream(fd); CAFFE_ENFORCE(google::protobuf::TextFormat::Print(proto, output)); @@ -179,7 +185,9 @@ CAFFE2_EXPORT void WriteProtoToTextFile(const Message& proto, const char* filena close(fd); } -CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto) { +C10_EXPORT bool ReadProtoFromBinaryFile( + const char* filename, + MessageLite* proto) { #if defined (_MSC_VER) // for MSC compiler binary flag needs to be specified int fd = open(filename, O_RDONLY | O_BINARY); #else @@ -198,7 +206,9 @@ CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* pr return success; } -CAFFE2_EXPORT void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename) { +C10_EXPORT void WriteProtoToBinaryFile( + const MessageLite& proto, + const char* filename) { int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); CAFFE_ENFORCE_NE( fd, -1, "File cannot be created: ", filename, " error number: ", errno); @@ -213,8 +223,7 @@ CAFFE2_EXPORT void WriteProtoToBinaryFile(const MessageLite& proto, const char* #endif // CAFFE2_USE_LITE_PROTO - -CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) { +C10_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) { for (auto& arg : def.arg()) { if (arg_map_.count(arg.name())) { if (arg.SerializeAsString() != arg_map_[arg.name()].SerializeAsString()) { @@ -235,7 +244,7 @@ CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) { } } -CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) { +C10_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) { for (auto& arg : netdef.arg()) { CAFFE_ENFORCE( arg_map_.count(arg.name()) == 0, @@ -245,7 +254,7 @@ CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) { } } -CAFFE2_EXPORT bool ArgumentHelper::HasArgument(const string& name) const { +C10_EXPORT bool ArgumentHelper::HasArgument(const string& name) const { return arg_map_.count(name); } @@ -267,41 +276,42 @@ std::ostream& operator<<(std::ostream& output, const NetDef& n) { return output; } -#define INSTANTIATE_GET_SINGLE_ARGUMENT( \ - T, fieldname, enforce_lossless_conversion) \ - template <> \ - CAFFE2_EXPORT T ArgumentHelper::GetSingleArgument( \ - const string& name, const T& default_value) const { \ - if (arg_map_.count(name) == 0) { \ - VLOG(1) << "Using default parameter value " << default_value \ - << " for parameter " << name; \ - return default_value; \ - } \ - CAFFE_ENFORCE( \ - arg_map_.at(name).has_##fieldname(), \ - "Argument ", \ - name, \ - " does not have the right field: expected field " #fieldname); \ - auto value = arg_map_.at(name).fieldname(); \ - if (enforce_lossless_conversion) { \ - auto 
supportsConversion = \ - SupportsLosslessConversion(value); \ - CAFFE_ENFORCE( \ - supportsConversion, \ - "Value", \ - value, \ - " of argument ", \ - name, \ - "cannot be represented correctly in a target type"); \ - } \ - return static_cast(value); \ - } \ - template <> \ - CAFFE2_EXPORT bool ArgumentHelper::HasSingleArgumentOfType(const string& name) const { \ - if (arg_map_.count(name) == 0) { \ - return false; \ - } \ - return arg_map_.at(name).has_##fieldname(); \ +#define INSTANTIATE_GET_SINGLE_ARGUMENT( \ + T, fieldname, enforce_lossless_conversion) \ + template <> \ + C10_EXPORT T ArgumentHelper::GetSingleArgument( \ + const string& name, const T& default_value) const { \ + if (arg_map_.count(name) == 0) { \ + VLOG(1) << "Using default parameter value " << default_value \ + << " for parameter " << name; \ + return default_value; \ + } \ + CAFFE_ENFORCE( \ + arg_map_.at(name).has_##fieldname(), \ + "Argument ", \ + name, \ + " does not have the right field: expected field " #fieldname); \ + auto value = arg_map_.at(name).fieldname(); \ + if (enforce_lossless_conversion) { \ + auto supportsConversion = \ + SupportsLosslessConversion(value); \ + CAFFE_ENFORCE( \ + supportsConversion, \ + "Value", \ + value, \ + " of argument ", \ + name, \ + "cannot be represented correctly in a target type"); \ + } \ + return static_cast(value); \ + } \ + template <> \ + C10_EXPORT bool ArgumentHelper::HasSingleArgumentOfType( \ + const string& name) const { \ + if (arg_map_.count(name) == 0) { \ + return false; \ + } \ + return arg_map_.at(name).has_##fieldname(); \ } INSTANTIATE_GET_SINGLE_ARGUMENT(float, f, false) @@ -321,7 +331,7 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(NetDef, n, false) #define INSTANTIATE_GET_REPEATED_ARGUMENT( \ T, fieldname, enforce_lossless_conversion) \ template <> \ - CAFFE2_EXPORT vector ArgumentHelper::GetRepeatedArgument( \ + C10_EXPORT vector ArgumentHelper::GetRepeatedArgument( \ const string& name, const std::vector& default_value) const { \ if (arg_map_.count(name) == 0) { \ return default_value; \ @@ -358,14 +368,14 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings, false) INSTANTIATE_GET_REPEATED_ARGUMENT(NetDef, nets, false) #undef INSTANTIATE_GET_REPEATED_ARGUMENT -#define CAFFE2_MAKE_SINGULAR_ARGUMENT(T, fieldname) \ -template <> \ -CAFFE2_EXPORT Argument MakeArgument(const string& name, const T& value) { \ - Argument arg; \ - arg.set_name(name); \ - arg.set_##fieldname(value); \ - return arg; \ -} +#define CAFFE2_MAKE_SINGULAR_ARGUMENT(T, fieldname) \ + template <> \ + C10_EXPORT Argument MakeArgument(const string& name, const T& value) { \ + Argument arg; \ + arg.set_name(name); \ + arg.set_##fieldname(value); \ + return arg; \ + } CAFFE2_MAKE_SINGULAR_ARGUMENT(bool, i) CAFFE2_MAKE_SINGULAR_ARGUMENT(float, f) @@ -375,28 +385,29 @@ CAFFE2_MAKE_SINGULAR_ARGUMENT(string, s) #undef CAFFE2_MAKE_SINGULAR_ARGUMENT template <> -CAFFE2_EXPORT bool ArgumentHelper::RemoveArgument(OperatorDef& def, int index); +C10_EXPORT bool ArgumentHelper::RemoveArgument(OperatorDef& def, int index); template <> bool ArgumentHelper::RemoveArgument(NetDef& def, int index); template <> -CAFFE2_EXPORT Argument MakeArgument(const string& name, const MessageLite& value) { +C10_EXPORT Argument MakeArgument(const string& name, const MessageLite& value) { Argument arg; arg.set_name(name); arg.set_s(value.SerializeAsString()); return arg; } -#define CAFFE2_MAKE_REPEATED_ARGUMENT(T, fieldname) \ -template <> \ -CAFFE2_EXPORT Argument MakeArgument(const string& name, const vector& value) {\ - 
Argument arg; \ - arg.set_name(name); \ - for (const auto& v : value) { \ - arg.add_##fieldname(v); \ - } \ - return arg; \ -} +#define CAFFE2_MAKE_REPEATED_ARGUMENT(T, fieldname) \ + template <> \ + C10_EXPORT Argument MakeArgument( \ + const string& name, const vector& value) { \ + Argument arg; \ + arg.set_name(name); \ + for (const auto& v : value) { \ + arg.add_##fieldname(v); \ + } \ + return arg; \ + } CAFFE2_MAKE_REPEATED_ARGUMENT(float, floats) CAFFE2_MAKE_REPEATED_ARGUMENT(int, ints) @@ -404,7 +415,7 @@ CAFFE2_MAKE_REPEATED_ARGUMENT(int64_t, ints) CAFFE2_MAKE_REPEATED_ARGUMENT(string, strings) #undef CAFFE2_MAKE_REPEATED_ARGUMENT -CAFFE2_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) { +C10_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) { for (const auto& outp : op.output()) { if (outp == output) { return true; @@ -413,7 +424,7 @@ CAFFE2_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) { return false; } -CAFFE2_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) { +C10_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) { for (const auto& inp : op.input()) { if (inp == input) { return true; @@ -423,7 +434,7 @@ CAFFE2_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) { } // Return the argument index or -1 if it does not exist. -CAFFE2_EXPORT int GetArgumentIndex( +C10_EXPORT int GetArgumentIndex( const google::protobuf::RepeatedPtrField& args, const string& name) { int index = 0; @@ -436,7 +447,9 @@ CAFFE2_EXPORT int GetArgumentIndex( return -1; } -CAFFE2_EXPORT const Argument& GetArgument(const OperatorDef& def, const string& name) { +C10_EXPORT const Argument& GetArgument( + const OperatorDef& def, + const string& name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return def.arg(index); @@ -449,7 +462,7 @@ CAFFE2_EXPORT const Argument& GetArgument(const OperatorDef& def, const string& } } -CAFFE2_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) { +C10_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return def.arg(index); @@ -462,7 +475,7 @@ CAFFE2_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) } } -CAFFE2_EXPORT bool GetFlagArgument( +C10_EXPORT bool GetFlagArgument( const google::protobuf::RepeatedPtrField& args, const string& name, bool default_value) { @@ -476,21 +489,19 @@ CAFFE2_EXPORT bool GetFlagArgument( return default_value; } -CAFFE2_EXPORT bool GetFlagArgument( +C10_EXPORT bool GetFlagArgument( const OperatorDef& def, const string& name, bool default_value) { return GetFlagArgument(def.arg(), name, default_value); } -CAFFE2_EXPORT bool GetFlagArgument( - const NetDef& def, - const string& name, - bool default_value) { +C10_EXPORT bool +GetFlagArgument(const NetDef& def, const string& name, bool default_value) { return GetFlagArgument(def.arg(), name, default_value); } -CAFFE2_EXPORT Argument* GetMutableArgument( +C10_EXPORT Argument* GetMutableArgument( const string& name, const bool create_if_missing, OperatorDef* def) { diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h index dc7c365e86c9de..500ddf73434ab9 100644 --- a/caffe2/utils/proto_utils.h +++ b/caffe2/utils/proto_utils.h @@ -194,7 +194,7 @@ CAFFE2_API bool HasInput(const OperatorDef& op, const std::string& input); * does not copy the operator def, so one would need to make sure that the * lifetime 
of the OperatorDef object outlives that of the ArgumentHelper. */ -class CAFFE2_EXPORT ArgumentHelper { +class C10_EXPORT ArgumentHelper { public: template static bool HasArgument(const Def& def, const string& name) { diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h index 27b75d8ccd3a65..b2fc9f03b07777 100644 --- a/caffe2/utils/threadpool/WorkersPool.h +++ b/caffe2/utils/threadpool/WorkersPool.h @@ -360,7 +360,7 @@ class WorkersPool { counter_to_decrement_when_ready_.Wait(); } - AT_DISABLE_COPY_AND_ASSIGN(WorkersPool); + C10_DISABLE_COPY_AND_ASSIGN(WorkersPool); std::vector>> workers_; // The BlockingCounter used to wait for the workers. BlockingCounter counter_to_decrement_when_ready_; diff --git a/caffe2/utils/zmq_helper.h b/caffe2/utils/zmq_helper.h index cfd1d53a98af64..bd45be9192dcad 100644 --- a/caffe2/utils/zmq_helper.h +++ b/caffe2/utils/zmq_helper.h @@ -26,7 +26,7 @@ class ZmqContext { private: void* ptr_; - AT_DISABLE_COPY_AND_ASSIGN(ZmqContext); + C10_DISABLE_COPY_AND_ASSIGN(ZmqContext); }; class ZmqMessage { @@ -48,7 +48,7 @@ class ZmqMessage { private: zmq_msg_t msg_; - AT_DISABLE_COPY_AND_ASSIGN(ZmqMessage); + C10_DISABLE_COPY_AND_ASSIGN(ZmqMessage); }; class ZmqSocket { diff --git a/modules/rocksdb/rocksdb.cc b/modules/rocksdb/rocksdb.cc index b4752b67ca569b..4f8918df41389f 100644 --- a/modules/rocksdb/rocksdb.cc +++ b/modules/rocksdb/rocksdb.cc @@ -67,7 +67,7 @@ class RocksDBTransaction : public Transaction { rocksdb::DB* db_; std::unique_ptr batch_; - AT_DISABLE_COPY_AND_ASSIGN(RocksDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(RocksDBTransaction); }; class RocksDB : public DB { From db2f7de5c3b91776ce4f46b1bddf1c2c17069153 Mon Sep 17 00:00:00 2001 From: "Cheng,Penghui" Date: Tue, 25 Sep 2018 17:36:31 -0700 Subject: [PATCH 40/51] Fallback CreateMutex/AtomicIter operators for mkl-dnn Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11685 Reviewed By: pjh5 Differential Revision: D9928058 Pulled By: wesolwsk fbshipit-source-id: 734e19c35a684481d9a4d4f0c596e4dceae51ad4 --- caffe2/operators/atomic_ops.cc | 9 +++++++++ caffe2/sgd/iter_op.cc | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/caffe2/operators/atomic_ops.cc b/caffe2/operators/atomic_ops.cc index 2ce97b0d58c5fe..2c8f17649f516a 100644 --- a/caffe2/operators/atomic_ops.cc +++ b/caffe2/operators/atomic_ops.cc @@ -2,6 +2,11 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif + namespace caffe2 { namespace fb { namespace { @@ -85,6 +90,10 @@ class CheckAtomicBoolOp final : public Operator { REGISTER_CPU_OPERATOR(CreateMutex, CreateMutexOp); REGISTER_CPU_OPERATOR(AtomicFetchAdd, AtomicFetchAddOp); +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR(CreateMutex, IDEEPFallbackOp>); +#endif + REGISTER_CPU_OPERATOR(CreateAtomicBool, CreateAtomicBoolOp); REGISTER_CPU_OPERATOR(ConditionalSetAtomicBool, ConditionalSetAtomicBoolOp); REGISTER_CPU_OPERATOR(CheckAtomicBool, CheckAtomicBoolOp); diff --git a/caffe2/sgd/iter_op.cc b/caffe2/sgd/iter_op.cc index df9e261f2ea7f1..ac964018b99e7f 100644 --- a/caffe2/sgd/iter_op.cc +++ b/caffe2/sgd/iter_op.cc @@ -1,5 +1,10 @@ #include "caffe2/sgd/iter_op.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif + namespace caffe2 { void MutexSerializer::Serialize( @@ -22,6 +27,10 @@ void MutexDeserializer::Deserialize(const BlobProto& /* unused */, Blob* blob) { REGISTER_CPU_OPERATOR(Iter, IterOp); REGISTER_CPU_OPERATOR(AtomicIter, AtomicIterOp); 
+#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR(AtomicIter, IDEEPFallbackOp>); +#endif + REGISTER_BLOB_SERIALIZER( (TypeMeta::Id>()), MutexSerializer); From 807de9a1e3cff49dd23fb7df83aea38269ccdd23 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 25 Sep 2018 19:08:00 -0700 Subject: [PATCH 41/51] fix segfault when grad to a hook fn is None (#12028) Summary: - fixes https://github.com/pytorch/pytorch/issues/11751 by checking if a grad is a Python None object before getting cdata from it - behaviors: pre-fix ``` >>> a = torch.randn(5, requires_grad=True) >>> a_list = a.unbind() >>> a0 = a_list[0] >>> a0.register_hook ...: def hook(grad): ...: print(grad) >>> a_list[0].backward() tensor(1.) >>> print('a_list[0]', a_list[0].grad, a.grad) ('a_list[0]', None, tensor([1., 0., 0., 0., 0.])) >>> a_list[1].backward() # segfault ``` post-fix ``` >>> a = torch.randn(5, requires_grad=True) >>> a_list = a.unbind() >>> a0 = a_list[0] >>> a0.register_hook ... : def hook(grad): ... : print(grad) >>> a_list[0].backward() tensor(1.) >>> print(a_list[0].grad, a.grad) (None, tensor([1., 0., 0., 0., 0.])) >>> a_list[1].backward() None >>> print(a_list[1].grad, a.grad) (None, tensor([1., 1., 0., 0., 0.])) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/12028 Differential Revision: D10034094 Pulled By: weiyangfb fbshipit-source-id: 3f2135325fa7d338b920f57752057e4f6a6c0b1d --- test/test_autograd.py | 27 +++++++++++++++++++++++++++ torch/csrc/autograd/python_hook.cpp | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 501e4a6ca7e629..f9ccfb6c958e99 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -299,6 +299,33 @@ def hook(*grads): self.assertFalse(hook_called[0]) self.assertIsNone(x.grad) + def test_grad_nonleaf_register_hook(self): + # This checks an edge case for register_hook. 
+ # We want to capture grad of a nonleaf tensor, + # but avoid segfault during backward of other nonleaf tensors + x = torch.randn(5, requires_grad=True) + x_list = x.unbind() + + x0 = x_list[0] + hook_results = [None] + + def hook(grad): + hook_results[0] = grad + x0.register_hook(hook) + + x_list[0].backward() + self.assertEqual(hook_results[0], torch.tensor(1.)) + expected_grad = torch.tensor([1., 0, 0, 0, 0]) + self.assertEqual(x.grad, expected_grad) + self.assertIsNone(x_list[0].grad) + + for i in range(1, 5, 1): + x_list[i].backward() + self.assertEqual(hook_results[0], None) + expected_grad[i] = 1.0 + self.assertEqual(x.grad, expected_grad) + self.assertIsNone(x_list[i].grad) + def test_sharded_grad(self): leaves = [torch.zeros(5, 5, requires_grad=True) for _ in range(10)] intermediates = [l * i + l * l for i, l in enumerate(leaves)] diff --git a/torch/csrc/autograd/python_hook.cpp b/torch/csrc/autograd/python_hook.cpp index 3ceb1f4aa201c6..af02e9e46997e5 100644 --- a/torch/csrc/autograd/python_hook.cpp +++ b/torch/csrc/autograd/python_hook.cpp @@ -51,7 +51,7 @@ auto PyFunctionPreHook::operator()(const variable_list& values) -> variable_list } variable_list results(values); - results[value_idx] = ((THPVariable*)value.get())->cdata; + if (value != Py_None) results[value_idx] = ((THPVariable*)value.get())->cdata; return results; } From 8ff435c8f60033d6f7795202d145e738a551ee84 Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Tue, 25 Sep 2018 20:49:35 -0700 Subject: [PATCH 42/51] Use tempfile during serialized test comparison (#12021) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12021 TestPilot runs stress tests in parallel. These fail for serialized tests because extracting (and subsequent deletion) of binary data during the process isn't threadsafe. Extract zips into tempfile to avoid this problem. Also remove some accidentally checked in zips of a test that we didn't end up including for now. 
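The gist of the fix is to extract each test's serialized data into a private temporary directory rather than a shared location under the source tree. A minimal sketch of the pattern, with illustrative names (`source_dir` / `test_name` stand in for the harness's real paths; this is not its actual API):

```
import os
import shutil
import tempfile
from zipfile import ZipFile

def load_serialized_test(source_dir, test_name):
    # Every caller gets its own scratch dir, so parallel stress runs
    # never extract into (or delete) the same path.
    temp_dir = tempfile.mkdtemp()
    try:
        with ZipFile(os.path.join(source_dir, test_name + '.zip')) as z:
            z.extractall(temp_dir)
        with open(os.path.join(temp_dir, 'op.pb'), 'rb') as f:
            return f.read()
    finally:
        # Remove only the private copy; other processes are unaffected.
        shutil.rmtree(temp_dir)
```

The diff below applies the same idea inside the serialized test comparison path.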
Reviewed By: houseroad Differential Revision: D10013682 fbshipit-source-id: 6e13b850b38dee4106d3c10a9372747d17b67c5a --- .../string_ops_test.test_string_prefix.zip | Bin 1030 -> 0 bytes .../inout.npz | Bin 781 -> 0 bytes .../string_ops_test.test_string_prefix/op.pb | Bin 47 -> 0 bytes .../string_ops_test.test_string_suffix.zip | Bin 1030 -> 0 bytes .../inout.npz | Bin 781 -> 0 bytes .../string_ops_test.test_string_suffix/op.pb | Bin 47 -> 0 bytes .../serialized_test/serialized_test_util.py | 34 +++++++++--------- 7 files changed, 17 insertions(+), 17 deletions(-) delete mode 100644 caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip delete mode 100644 caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz delete mode 100644 caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb delete mode 100644 caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip delete mode 100644 caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz delete mode 100644 caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip deleted file mode 100644 index e4019f68dfd0e0c0629a26227a82b2272a5bbcb9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1030 zcmWIWW@Zs#0D;x9hQ4~qv)1YZ*&xgc#Q6n!1xZ}&#U(|VdFjPM93VzPL28N=gAz|L zSTLX{H7&D3i;IgbCp9m461}{FDwr}3u*K8( zWB(rpO3N}bFmQpD6_l0~1JzY7?b*w9$Ux@Es?T;oi%hDz2}Y+4zb7>Ag2B4J$l2ywWlg7ffl(__NSM zsy-~&Fr--KZrH6d$+R8+m^Q8~axH0(?iO_BF?YD0YIgj&0!P|;xBO&b;k)L$rayS{ z{Yw7)fVm~_rG+_#f9j)p)oP<@nk&$Y9YD+v#3`v|naQc~C6xuKApg#OBmPQS;N*#b zV*zXDO^TWmvm|fng1pI@3)d}PFmGCXNYIoS;`8IDOyUwM_dcWP$J!;Ht|GRC>8I2z zE3U0vB8+-$8Vn4mHX;WuH_%343=FH#Wn>az2Bl%_NeL(c1`UlMDl-q}^#E^FedzH5 z(Z#^f(D)k2M2jVK&FDdl&>Ra)(MSOv;LXYgl4J(LbwJva3B*Gfhdm#H)PTSeuphv= i5?u>=MuTVoX$b-{k+L1cekKMchAf~&8!%roFaQAJi$B@` diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz deleted file mode 100644 index 0dfa5f9790c01f5ef1abddf02407855c5b708f45..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 781 zcmWIWW@Zs#U|`^2SRHHVJB>f~|6w3cmXU#h3rJ_?6_l0~>*W;Fk@y`;x+CFjrKpfFTgimMJe33to`1BV!d z|LMIqEDb9>IK0v_6BkTr%lNa*lo{gl zLwV9?MAq7Vryz#CN`db~h%F)%bVz6LVUVhLR{dJrQt#{yF{Qh*0|v$BCCnSpQ} JkoIH(@c>A94!-~Z diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb deleted file mode 100644 index b1f14dad9aefd8838b2bbba926bd8282f535fc80..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 47 wcmd;LFD@y{%u6p8;s7xU3Q|*~7?gN|!GZxrscD%NT3lRgIjMQ+B^eUT0BH0MuK)l5 diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip deleted file mode 100644 index cc60f7242ee693ffc9e23f9c6c2fae393238da50..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1030 
zcmWIWW@Zs#0D;x9hQ43R-2C)`Y!GGz;`{==f+Q~X;*z4wy!2ur4iKZDAT>pbL5U|A zEErsxmX=we#l^*zlbV-ak|DtyfMVj*rwMy`nHd;Bm=kDXW?p`2iC$ho6-*fi*y3sY zvHuSPrDYiz7`VX73Q9|gf$A!k_Uz?4WFT{7@-_Yw!6n|UK?TC%=IqU(Ej)ZdmrQ~j zZ$G;qqM@}zr0`qg*3g6OAKIFC|1RurI6pnRr|*SJa;V3GuSBhVlF*7Ui8h3<7 z`ybsG;2W=^6z*Hre(ylB-Z^`vaPMVL71!6yZ2ZLk^xhkmh7}$hUTK+$3#PPX{8{KB zRUejX7*Z^AH|$oKWZI5@OdD4gxt6p?cMCf6m^)lgH9P)Xfg|m_TYj>z@Lls=(;qzf zekFf?z}%Ae(!!j=KlM?)YPHcc%@ydy4j|?S;*`{~%;ePglFEWqkbh^t5q~8uaPmaJ zv4FMnCPmGOS(3MOLEhxdh3ghCm^UpxBxuSE@%iynCUFUsd!NztW9<@8R}ovn^iyh< z71vfS5k@^W4F(2O8<7K-8)zdi28LDWGBSxUgVHeeqy&@zgN8;Bm6-?gdVn{oKJ<8j z=we`KXnYN1qQw%rX7nINXpRM@XrurS@MdKLNiqZBIw0-I1mYo#!=4X8YCvEK*bm@b hiLM1bqd_!)v;+Z}NZAfzKNABJLl#h?4VW((7yzt0Kra9Q diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz deleted file mode 100644 index 0dfa5f9790c01f5ef1abddf02407855c5b708f45..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 781 zcmWIWW@Zs#U|`^2SRHHVJB>f~|6w3cmXU#h3rJ_?6_l0~>*W;Fk@y`;x+CFjrKpfFTgimMJe33to`1BV!d z|LMIqEDb9>IK0v_6BkTr%lNa*lo{gl zLwV9?MAq7Vryz#CN`db~h%F)%bVz6LVUVhLR{dJrQt#{yF{Qh*0|v$BCCnSpQ} JkoIH(@c>A94!-~Z diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb deleted file mode 100644 index d59c5130048038688e692d5d2305a3514929eeff..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 47 wcmd;LFD@y{%u6p8;s7xU3Q|*~7?gN|!Ggi1X=#}iT3lRgIjMQ+B^eUT0BLOwwg3PC diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index feb5d8e127cb88..67081fa77d025c 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -11,9 +11,9 @@ import inspect import numpy as np import os -import re import shutil import sys +import tempfile import threading from zipfile import ZipFile @@ -140,16 +140,15 @@ def parse_proto(x): source_dir = self.get_output_dir() test_name = self.get_output_filename() - full_dir = os.path.join(source_dir, test_name) - _prepare_dir(full_dir) + temp_dir = tempfile.mkdtemp() with ZipFile(os.path.join(source_dir, test_name + '.zip')) as z: - loaded = z.extractall(full_dir) + z.extractall(temp_dir) - op_path = os.path.join(full_dir, 'op.pb') - inout_path = os.path.join(full_dir, 'inout.npz') - loaded = np.load(inout_path, encoding='bytes') + op_path = os.path.join(temp_dir, 'op.pb') + inout_path = os.path.join(temp_dir, 'inout.npz') # load serialized input and output + loaded = np.load(inout_path, encoding='bytes') loaded_inputs = loaded['inputs'].tolist() inputs_equal = True for (x, y) in zip(inputs, loaded_inputs): @@ -157,16 +156,16 @@ def parse_proto(x): inputs_equal = False loaded_outputs = loaded['outputs'].tolist() - # load operator - with open(op_path, 'rb') as f: - loaded_op = f.read() - - op_proto = parse_proto(loaded_op) - device_type = loaded['device_type'] - device_option = caffe2_pb2.DeviceOption(device_type=int(device_type)) - # if inputs are not the same, run serialized input through serialized op if not inputs_equal: + # load operator + with open(op_path, 'rb') as f: + loaded_op = f.read() + + op_proto = parse_proto(loaded_op) + device_type = loaded['device_type'] + device_option = 
caffe2_pb2.DeviceOption(device_type=int(device_type)) + outputs = hu.runOpOnInput(device_option, op_proto, loaded_inputs) grad_ops = _getGradientOrNone(op_proto) @@ -176,12 +175,13 @@ def parse_proto(x): # assert gradient op is equal for i in range(len(grad_ops)): - with open(os.path.join(full_dir, 'grad_{}.pb'.format(i)), 'rb') as f: + grad_path = os.path.join(temp_dir, 'grad_{}.pb'.format(i)) + with open(grad_path, 'rb') as f: loaded_grad = f.read() grad_proto = parse_proto(loaded_grad) self.assertTrue(grad_proto == grad_ops[i]) - shutil.rmtree(full_dir) + shutil.rmtree(temp_dir) def assertSerializedOperatorChecks( self, From b7ebc00979acf93e2fed994ece37966409b0bf4d Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 25 Sep 2018 23:16:25 -0700 Subject: [PATCH 43/51] Move Blob to ATen/core (#11924) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11924 Previous diffs removed Blob -> caffe2 dependencies, now we can move it to ATen/core. This is pre-work for allowing storing Blob in IValue. Reviewed By: ezyang Differential Revision: D9980641 fbshipit-source-id: 32082a673ec94c42c20b2298adced8bb7ca94d07 --- aten/src/ATen/core/blob.cpp | 1 + aten/src/ATen/core/blob.h | 235 ++++++++++++++++++++++++++++++++++++ caffe2/core/blob.h | 213 +------------------------------- 3 files changed, 237 insertions(+), 212 deletions(-) create mode 100644 aten/src/ATen/core/blob.cpp create mode 100644 aten/src/ATen/core/blob.h diff --git a/aten/src/ATen/core/blob.cpp b/aten/src/ATen/core/blob.cpp new file mode 100644 index 00000000000000..930255194639b4 --- /dev/null +++ b/aten/src/ATen/core/blob.cpp @@ -0,0 +1 @@ +#include diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h new file mode 100644 index 00000000000000..efe0774ebb3e38 --- /dev/null +++ b/aten/src/ATen/core/blob.h @@ -0,0 +1,235 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace caffe2 { + +class Tensor; + +/** + * @brief Blob is a general container that hosts a typed pointer. + * + * A Blob hosts a pointer as well as its type, and takes charge of deleting it + * properly when the blob is deallocated or re-allocated with a new type. A blob + * could contain anything, although the most common case is to contain a Tensor. + */ +class CAFFE2_API Blob final { + public: + using DestroyCall = void(void*); + + /** + * Initializes an empty Blob. + */ + Blob() noexcept : meta_(), pointer_(nullptr), destroy_(nullptr) {} + ~Blob() { + Reset(); + } + + Blob(Blob&& other) noexcept : Blob() { + swap(other); + } + + Blob& operator=(Blob&& other) noexcept { + Blob(std::move(other)).swap(*this); + return *this; + } + + /** + * Checks if the content stored in the blob is of type T. + */ + template + bool IsType() const noexcept { + return meta_.Match(); + } + + /** + * Returns the meta info of the blob. + */ + inline const TypeMeta& meta() const noexcept { + return meta_; + } + + /** + * Returns a printable typename of the blob. + */ + inline const char* TypeName() const noexcept { + return meta_.name(); + } + + /** + * @brief Gets the const reference of the stored object. The code checks if + * the stored object is of the desired type. + */ + // TODO(jerryzh): add a Get(DeviceType) function? + template + const T& Get() const { + AT_ASSERTM( + IsType(), + "wrong type for the Blob instance. 
Blob contains ", + meta_.name(), + " while caller expects ", + TypeMeta::TypeName()); + // TODO: after we add Get(DeviceType) + // and changed all the callsites, we can add + // a static assert here to enforce T != Tensor + return *static_cast(pointer_); + } + + const void* GetRaw() const noexcept { + return pointer_; + } + void* GetRaw() noexcept { + return pointer_; + } + + /** + * @brief Gets a mutable pointer to the stored object. + * + * If the current object is not of the right type, a new object is created + * and the old object is freed. Note that type T should have a default + * constructor. Otherwise, create the object yourself first, and use + * Reset(). + */ + template + T* GetMutable() { + static_assert( + std::is_default_constructible::value, + "GetMutable can't be called with non-default-constructible types. " + "Try using specialized methods"); + if (IsType()) { + return static_cast(pointer_); + } else { + // TODO Re-enable logging + // VLOG(1) << "Create new mutable object " << TypeMeta::TypeName(); + return Reset(new T()); + } + } + + template + T* GetMutableOrNull() { + if (IsType()) { + return static_cast(pointer_); + } else { + return nullptr; + } + } + + /** + * Sets the underlying object to the allocated one. The Blob then takes over + * the ownership of the passed in pointer. If there is already an object in + * the Blob, the old object is freed. + * + * This is used when the underlying class T does not have a default ctor, or + * complex initializations needs to be done outside the blob. + */ + template + T* Reset(T* allocated) { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + meta_ = TypeMeta::Make(); + pointer_ = static_cast(allocated); + destroy_ = &Destroy; + return allocated; + } + + inline void* Reset( + void* allocated, + const TypeMeta& meta, + DestroyCall* destroy) { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + meta_ = meta; + pointer_ = static_cast(allocated); + destroy_ = destroy; + return allocated; + } + + /** + * Releases the ownership, if any, this Blob has on the underlying pointer. + * The user is then responsible for freeing the data if needed + */ + inline DestroyCall* Release() { + DestroyCall* d = destroy_; + destroy_ = nullptr; + return d; + } + + /** + * Sets the underlying object to the allocated one, but does not take over + * the ownership of the passed in pointer. If there is already an object in + * the Blob, the old object is freed. + * + * Unlike Reset, this does not take over the ownership of the pointer and the + * caller is responsible for making sure that the lifetime of the allocated + * blob outlasts the lifetime of any access to this blob, until another Reset + * call is made or the blob is destructed. + */ + template + typename std::remove_const::type* ShareExternal( + typename std::remove_const::type* allocated) { + return static_cast(ShareExternal( + static_cast(allocated), + TypeMeta::Make::type>())); + } + + void* ShareExternal(void* allocated, const TypeMeta& meta) { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + meta_ = meta; + pointer_ = static_cast(allocated); + destroy_ = nullptr; + return allocated; + } + + /** + * Resets the Blob to an empty one. + */ + inline void Reset() { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + pointer_ = nullptr; + meta_ = TypeMeta(); + destroy_ = nullptr; + } + + /** + * @brief Swaps the underlying storage of two blobs. 
+ */ + void swap(Blob& rhs) { + using std::swap; + swap(meta_, rhs.meta_); + swap(pointer_, rhs.pointer_); + swap(destroy_, rhs.destroy_); + } + + private: + /** + * @brief A destroy call that is used to properly deconstruct objects. + */ + template + static void Destroy(void* pointer) { + delete static_cast(pointer); + } + TypeMeta meta_; + void* pointer_ = nullptr; + DestroyCall* destroy_ = nullptr; + + C10_DISABLE_COPY_AND_ASSIGN(Blob); +}; + +inline void swap(Blob& lhs, Blob& rhs) { + lhs.swap(rhs); +} + +} // namespace caffe2 diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index def0f1b859e823..e09a54cbd2df56 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -8,224 +8,13 @@ #include #include "caffe2/core/common.h" +#include #include #include "caffe2/core/logging.h" #include "caffe2/core/tensor.h" namespace caffe2 { -class Tensor; - -/** - * @brief Blob is a general container that hosts a typed pointer. - * - * A Blob hosts a pointer as well as its type, and takes charge of deleting it - * properly when the blob is deallocated or re-allocated with a new type. A blob - * could contain anything, although the most common case is to contain a Tensor. - */ -class CAFFE2_API Blob final { - public: - using DestroyCall = void(void*); - - /** - * Initializes an empty Blob. - */ - Blob() noexcept : meta_(), pointer_(nullptr), destroy_(nullptr) {} - ~Blob() { Reset(); } - - Blob(Blob&& other) noexcept : Blob() { - swap(other); - } - - Blob& operator=(Blob&& other) noexcept { - Blob(std::move(other)).swap(*this); - return *this; - } - - /** - * Checks if the content stored in the blob is of type T. - */ - template - bool IsType() const noexcept { - return meta_.Match(); - } - - /** - * Returns the meta info of the blob. - */ - inline const TypeMeta& meta() const noexcept { return meta_; } - - /** - * Returns a printable typename of the blob. - */ - inline const char* TypeName() const noexcept { return meta_.name(); } - - /** - * @brief Gets the const reference of the stored object. The code checks if - * the stored object is of the desired type. - */ - // TODO(jerryzh): add a Get(DeviceType) function? - template - const T& Get() const { - CAFFE_ENFORCE( - IsType(), - "wrong type for the Blob instance. Blob contains ", - meta_.name(), - " while caller expects ", - TypeMeta::TypeName()); - // TODO: after we add Get(DeviceType) - // and changed all the callsites, we can add - // a static assert here to enforce T != Tensor - return *static_cast(pointer_); - } - - const void* GetRaw() const noexcept { - return pointer_; - } - void* GetRaw() noexcept { - return pointer_; - } - - /** - * @brief Gets a mutable pointer to the stored object. - * - * If the current object is not of the right type, a new object is created - * and the old object is freed. Note that type T should have a default - * constructor. Otherwise, create the object yourself first, and use - * Reset(). - */ - template - T* GetMutable() { - static_assert( - std::is_default_constructible::value, - "GetMutable can't be called with non-default-constructible types. " - "Try using specialized methods"); - if (IsType()) { - return static_cast(pointer_); - } else { - VLOG(1) << "Create new mutable object " << TypeMeta::TypeName(); - return Reset(new T()); - } - } - - template - T* GetMutableOrNull() { - if (IsType()) { - return static_cast(pointer_); - } else { - return nullptr; - } - } - - /** - * Sets the underlying object to the allocated one. The Blob then takes over - * the ownership of the passed in pointer. 
If there is already an object in - * the Blob, the old object is freed. - * - * This is used when the underlying class T does not have a default ctor, or - * complex initializations needs to be done outside the blob. - */ - template - T* Reset(T* allocated) { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = TypeMeta::Make(); - pointer_ = static_cast(allocated); - destroy_ = &Destroy; - return allocated; - } - - inline void* - Reset(void* allocated, const TypeMeta& meta, DestroyCall* destroy) { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = meta; - pointer_ = static_cast(allocated); - destroy_ = destroy; - return allocated; - } - - /** - * Releases the ownership, if any, this Blob has on the underlying pointer. - * The user is then responsible for freeing the data if needed - */ - inline DestroyCall* Release() { - DestroyCall* d = destroy_; - destroy_ = nullptr; - return d; - } - - /** - * Sets the underlying object to the allocated one, but does not take over - * the ownership of the passed in pointer. If there is already an object in - * the Blob, the old object is freed. - * - * Unlike Reset, this does not take over the ownership of the pointer and the - * caller is responsible for making sure that the lifetime of the allocated - * blob outlasts the lifetime of any access to this blob, until another Reset - * call is made or the blob is destructed. - */ - template - typename std::remove_const::type* ShareExternal( - typename std::remove_const::type* allocated) { - return static_cast(ShareExternal( - static_cast(allocated), - TypeMeta::Make::type>())); - } - - void* ShareExternal(void* allocated, const TypeMeta& meta) { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = meta; - pointer_ = static_cast(allocated); - destroy_ = nullptr; - return allocated; - } - - /** - * Resets the Blob to an empty one. - */ - inline void Reset() { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - pointer_ = nullptr; - meta_ = TypeMeta(); - destroy_ = nullptr; - } - - /** - * @brief Swaps the underlying storage of two blobs. - */ - void swap(Blob& rhs) { - using std::swap; - swap(meta_, rhs.meta_); - swap(pointer_, rhs.pointer_); - swap(destroy_, rhs.destroy_); - } - - private: - /** - * @brief A destroy call that is used to properly deconstruct objects. - */ - template - static void Destroy(void* pointer) { - delete static_cast(pointer); - } - TypeMeta meta_; - void* pointer_ = nullptr; - DestroyCall* destroy_ = nullptr; - - C10_DISABLE_COPY_AND_ASSIGN(Blob); -}; - -inline void swap(Blob& lhs, Blob& rhs) { - lhs.swap(rhs); -} - inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) { bool is_match = blob.meta().Match(); if (!is_match) { From 65cbb8226b91565de3fedfc3db3f8c85626f39a0 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 26 Sep 2018 01:02:27 -0700 Subject: [PATCH 44/51] IValue can store Blob (#11414) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11414 caffe2::Blob can be stored in an IValue. This is a precondition for caffe2 to switch from Blob to IValue. 
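A minimal usage sketch (illustration only, assuming the Blob constructor and the isBlob()/toBlob() accessors added in this diff, and the torch::jit namespace for IValue as of this patch series):

    #include <ATen/core/blob.h>
    #include <ATen/core/ivalue.h>

    void blob_in_ivalue_sketch() {
      caffe2::Blob blob;
      *blob.GetMutable<int>() = 42;               // Blob owns a default-constructed int
      torch::jit::IValue value(std::move(blob));  // move the Blob into the IValue (Tag::Blob)
      if (value.isBlob()) {
        caffe2::Blob& stored = value.toBlob();    // borrow the stored Blob back
        int out = stored.Get<int>();              // typed read; asserts the Blob holds an int
        (void)out;
      }
    }

The function name blob_in_ivalue_sketch is only for illustration. Note the design choice visible in the diff: the constructor takes the Blob by value and wraps it in an intrusive_ptr, so ownership moves into the IValue.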
Reviewed By: ezyang Differential Revision: D9731326 fbshipit-source-id: 462a39d2d9ab6f85b99b1670848c6976a3de417c --- aten/src/ATen/core/blob.h | 7 ++++++- aten/src/ATen/core/ivalue.cpp | 6 ++++-- aten/src/ATen/core/ivalue.h | 26 ++++++++++++++++++++++++-- binaries/tutorial_blob.cc | 4 ++-- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index efe0774ebb3e38..0c82c5b8001c6b 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -20,7 +21,7 @@ class Tensor; * properly when the blob is deallocated or re-allocated with a new type. A blob * could contain anything, although the most common case is to contain a Tensor. */ -class CAFFE2_API Blob final { +class CAFFE2_API Blob final : public c10::intrusive_ptr_target { public: using DestroyCall = void(void*); @@ -232,4 +233,8 @@ inline void swap(Blob& lhs, Blob& rhs) { lhs.swap(rhs); } +inline std::ostream& operator<<(std::ostream& out, const Blob& v) { + return out << "Blob[" << v.TypeName() << "]"; +} + } // namespace caffe2 diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 9e69f70d025861..8dfb1e8ebb75b6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,8 +1,10 @@ #include #include -#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ + _(TensorList) _(Blob) namespace torch { namespace jit { diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 5064f5114e7df9..513845d4c12af0 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -64,8 +65,10 @@ using DoubleList = ConstantList; // to mark whether that type is a subtype of c10::intrusive_ptr_target and needs // retain/release calls. -#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ + _(TensorList) _(Blob) struct CAFFE2_API IValue final { IValue() @@ -125,6 +128,25 @@ struct CAFFE2_API IValue final { return at::Tensor(toIntrusivePtr()); } + IValue(caffe2::Blob blob) : tag(Tag::Blob), is_intrusive_ptr(true) { + // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract + // and + // store it as a Tensor instead. + payload.as_intrusive_ptr = + c10::make_intrusive(std::move(blob)).release(); + } + bool isBlob() const { + return Tag::Blob == tag; + } + caffe2::Blob& toBlob() & { + AT_ASSERT(isBlob()); + return *static_cast(payload.as_intrusive_ptr); + } + const caffe2::Blob& toBlob() const& { + AT_ASSERT(isBlob()); + return *static_cast(payload.as_intrusive_ptr); + } + // Tuple IValue(c10::intrusive_ptr v); bool isTuple() const { return Tag::Tuple == tag; } diff --git a/binaries/tutorial_blob.cc b/binaries/tutorial_blob.cc index f379eac663cbe4..ac74ebb5ffb78b 100644 --- a/binaries/tutorial_blob.cc +++ b/binaries/tutorial_blob.cc @@ -47,7 +47,7 @@ int main(int argc, char** argv) { LOG(INFO) << "Is the blob type float? 
" << myblob.IsType(); - + const int& myint_const = myblob.Get(); LOG(INFO) << "The value of the int number stored in the blob is: " @@ -80,7 +80,7 @@ int main(int argc, char** argv) { std::string* pvec = new std::string(); myblob.Reset(pvec); // no need to release pvec, myblob takes ownership. - + LOG(INFO) << "Is the blob now of type string? " << myblob.IsType(); From 21ed7e51b606f9912b658ab63e016d7546f3b75c Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 26 Sep 2018 01:44:03 -0700 Subject: [PATCH 45/51] Blob doesn't allow access to destroyCall anymore (#11548) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11548 This removes getting/setting the DestroyCall of a Blob, paving the way to removing DestroyCall from Blob entirely and using the destructor stored in TypeMeta instead. Use sites have been fixed in diffs stacked below this. Reviewed By: dzhulgakov Differential Revision: D9775191 fbshipit-source-id: 97d72d0c62843849057f295c27f391e63c99c521 --- aten/src/ATen/core/blob.h | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 0c82c5b8001c6b..17a09f33616e7f 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -141,29 +141,6 @@ class CAFFE2_API Blob final : public c10::intrusive_ptr_target { return allocated; } - inline void* Reset( - void* allocated, - const TypeMeta& meta, - DestroyCall* destroy) { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = meta; - pointer_ = static_cast(allocated); - destroy_ = destroy; - return allocated; - } - - /** - * Releases the ownership, if any, this Blob has on the underlying pointer. - * The user is then responsible for freeing the data if needed - */ - inline DestroyCall* Release() { - DestroyCall* d = destroy_; - destroy_ = nullptr; - return d; - } - /** * Sets the underlying object to the allocated one, but does not take over * the ownership of the passed in pointer. If there is already an object in From c8a0b11b7f609d5a6dabcb6fa482a384dff2ffab Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 26 Sep 2018 07:56:45 -0700 Subject: [PATCH 46/51] add autodiff expressions for common operations (#11832) Summary: This PR does a few things: Previously test_jit.py only tested autograd on backward graphs. This is because we borrow from test_autograd and construct graphs with a small number of nodes. Because the number of nodes is small (typically 1-2), those graph do not end up containing autodiff subgraphs, so autodiff never gets tested. This PR enables autodiff testing by doing the following: - added disableDebugAutodiffSubgraphInlining fn to graph_executor to disable autodiff subgraph inlining. - (implementation) added autodiffSubgraphNodeThreshold and autodiffSubgraphInlineThreshold. These are set to their default values (2, 5) but disableDebugAutodiffSubgraphInlining() sets both to 1, disabling subgraph inlining and allowing 1-node autodiff subgraphs. - The relevant backward jit tests disable autodiff subgraph inlining so they will test the autodiff versions of the operators instead of autograd whenever an autodiff variant exists. - We don't run the tests that do inline autodiff subgraphs anymore. This has no impact on testing correctness because the assumption is that autograd functions are correct and are tested in test_autograd.py This allows the graph fuser to work better because a lot of these ops were previously not autodiff-compatible but fusible. 
On a more concrete example, lstm backward contains a lot of tensor-scalar operations; these autodiff formulas help its double backward pass. Included: - arithmetic overloads - abs, acos, asin, atan, ceil, cos, cosh, exp, expm1, floor, fmod, frac, log, log10, log1p, log2 reciprocal, remainder, round, sin, sinh, tan, trunc, rsqrt TestJitGenerated tests autodiff for all of the added operations. cc apaszke zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/11832 Differential Revision: D10031256 Pulled By: zou3519 fbshipit-source-id: 9daf9900a5ad187743609cd0fbbd10b15411ad93 --- test/test_jit.py | 53 ++++++++--- torch/csrc/jit/autodiff.cpp | 141 ++++++++++++++++++++++++++++- torch/csrc/jit/graph_executor.cpp | 20 +++- torch/csrc/jit/graph_executor.h | 1 + torch/csrc/jit/script/init.cpp | 7 ++ torch/csrc/jit/script/module.h | 4 + torch/csrc/jit/symbolic_variable.h | 27 ++++++ 7 files changed, 239 insertions(+), 14 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 708533ae737760..7e342a9fe7a434 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -23,7 +23,8 @@ import tempfile import shutil import warnings -from test_autograd import method_tests, create_input, unpack_variables, \ +from test_autograd import method_tests as autograd_method_tests +from test_autograd import create_input, unpack_variables, \ exclude_tensor_method, non_differentiable, EXCLUDE_GRADCHECK, EXCLUDE_FUNCTIONAL from copy import deepcopy import random @@ -7638,6 +7639,18 @@ def forward(self, x, y): EXCLUDE_TRACED = { 'test_split_dim', 'test_split_dim_neg0', + + # The following fail due to #12024. + # A prim::ListConstruct is involved and the indices get traced as DynamicType, + # which always require_grad. This causes a crash in autodiff. + 'test___getitem___adv_index', + 'test___getitem___adv_index_beg', + 'test___getitem___adv_index_comb', + 'test___getitem___adv_index_dup', + 'test___getitem___adv_index_sub', + 'test___getitem___adv_index_sub_2', + 'test___getitem___adv_index_sub_3', + 'test___getitem___adv_index_var', } EXCLUDE_TYPE_CHECK = { @@ -7748,11 +7761,17 @@ def new_fn(*tensors_): # create a trace function from input fn -def create_traced_fn(self, fn): +# +# disable_autodiff_subgraph_inlining: +# Don't inline autodiff subgraphs so we can test autodiff +def create_traced_fn(self, fn, + disable_autodiff_subgraph_inlining=False): def traced_fn(*inputs, **kwargs): fn_tensors, inputs_tensors = partial_apply_nontensors(fn, inputs, **kwargs) traced = torch.jit.trace(fn_tensors, inputs_tensors) self.assertExportImport(traced.graph, inputs_tensors) + if disable_autodiff_subgraph_inlining: + traced.debug_disable_autodiff_subgraph_inlining() output = traced(*inputs_tensors) traced_fn.last_graph = traced.graph_for(*inputs_tensors) return output @@ -7773,7 +7792,8 @@ def get_constant(x): # create a script function from (name, func_type, output_process_fn), # returns a function takes in (args, kwargs) and runs the compiled function and # then applies the post process fn to the outputs -def create_script_fn(self, method_name, func_type, output_process_fn): +def create_script_fn(self, method_name, func_type, output_process_fn, + disable_autodiff_subgraph_inlining=False): def script_fn(*args, **kwargs): formals = [] tensors = [] @@ -7804,6 +7824,8 @@ def script_fn(*args, **kwargs): import math CU = torch.jit.CompilationUnit(script) + if disable_autodiff_subgraph_inlining: + CU.the_method.debug_disable_autodiff_subgraph_inlining() self.assertExportImport(CU.the_method.graph, tensors) output = 
output_process_fn(CU.the_method(*tensors)) script_fn.last_graph = CU.the_method.graph_for(*tensors) @@ -8141,7 +8163,7 @@ def func(x): ]) -def add_test( +def add_autograd_test( name, self_size, args, @@ -8184,14 +8206,20 @@ def fn(*inputs, **kwargs): check_types = test_name not in EXCLUDE_TYPE_CHECK if not is_inplace and name not in EXCLUDE_GRADCHECK and not exclude_tensor_method(name, test_name): + # Test with disable_autodiff_subgraph_inlining, which forces the graph + # to contain DifferentiableGraph nodes whenever possible. This allows us + # to test autodiff; we assume that autograd is correct and use autodiff for backprop if test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(self, fn), + check_against_reference(self, + create_traced_fn(self, fn, + disable_autodiff_subgraph_inlining=True), fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(self, name, 'method', output_process_fn), + create_script_fn(self, name, 'method', output_process_fn, + disable_autodiff_subgraph_inlining=True), fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) @@ -8205,12 +8233,15 @@ def fn(*inputs, **kwargs): f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(self, fn), fn, - f_args_variable, kwargs_variable, check_types=check_types) + check_against_reference(self, + create_traced_fn(self, fn, + disable_autodiff_subgraph_inlining=True), + fn, f_args_variable, kwargs_variable, check_types=check_types) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(self, name, 'functional', output_process_fn), + create_script_fn(self, name, 'functional', output_process_fn, + disable_autodiff_subgraph_inlining=True), fn, f_args_variable, kwargs_variable, check_types=check_types) @@ -8265,8 +8296,8 @@ def post_add_test(test_name, skipTestIf, do_test): setattr(TestJitGenerated, test_name, do_test) -for test in method_tests: - add_test(*test) +for test in autograd_method_tests: + add_autograd_test(*test) for test in nn_functional_tests: add_nn_test(*test) diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 251a6466ee3a4f..009bf68ae3f6da 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -23,10 +23,20 @@ void wrapDim(int64_t & dim, const std::vector & sizes) { } bool isDifferentiable(Node * n) { + // TODO: scalar-tensor ops should be canonicalized static OperatorSet differentiable_ops = { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::add(Scalar other, Tensor self) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::sub(Scalar other, Tensor self) -> Tensor", "aten::mul(Tensor self, Tensor other) -> Tensor", + "aten::mul(Tensor self, Scalar other) -> Tensor", + "aten::mul(Scalar other, Tensor self) -> Tensor", + "aten::div(Scalar other, Tensor self) -> Tensor", + "aten::div(Tensor self, Tensor other) -> Tensor", + "aten::div(Tensor self, Scalar other) -> Tensor", "aten::sigmoid(Tensor self) -> Tensor", "aten::tanh(Tensor self) -> Tensor", "aten::relu(Tensor self) -> Tensor", @@ -43,9 +53,39 @@ bool isDifferentiable(Node * n) { "aten::gt(Tensor 
self, Tensor other) -> Tensor", "aten::ge(Tensor self, Tensor other) -> Tensor", "aten::eq(Tensor self, Tensor other) -> Tensor", - "aten::ne(Tensor self, Tensor other) -> Tensor" + "aten::ne(Tensor self, Tensor other) -> Tensor", + "aten::abs(Tensor self) -> Tensor", + "aten::acos(Tensor self) -> Tensor", + "aten::asin(Tensor self) -> Tensor", + "aten::atan(Tensor self) -> Tensor", + "aten::ceil(Tensor self) -> Tensor", + "aten::cos(Tensor self) -> Tensor", + "aten::cosh(Tensor self) -> Tensor", + "aten::exp(Tensor self) -> Tensor", + "aten::expm1(Tensor self) -> Tensor", + "aten::floor(Tensor self) -> Tensor", + "aten::fmod(Tensor self, Scalar other) -> Tensor", + "aten::frac(Tensor self) -> Tensor", + "aten::log(Tensor self) -> Tensor", + "aten::log10(Tensor self) -> Tensor", + "aten::log1p(Tensor self) -> Tensor", + "aten::log2(Tensor self) -> Tensor", + "aten::reciprocal(Tensor self) -> Tensor", + "aten::remainder(Tensor self, Scalar other) -> Tensor", + "aten::round(Tensor self) -> Tensor", + "aten::rsqrt(Tensor self) -> Tensor", + "aten::sin(Tensor self) -> Tensor", + "aten::sinh(Tensor self) -> Tensor", + "aten::tan(Tensor self) -> Tensor", + "aten::trunc(Tensor self) -> Tensor", }; + // TODO: add support for the following fusible operators. + // They're a little tricky to implement; max/min require mutability for best perf + // "aten::atan2(Tensor self) -> Tensor", + // "aten::max(Tensor self) -> Tensor", + // "aten::min(Tensor self) -> Tensor" + if (n->kind() == prim::Constant || n->kind() == prim::AutogradAdd || n->kind() == prim::ConstantChunk) @@ -89,15 +129,42 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val if (node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor")) { return {grads.at(0), grads.at(0) * node->namedInput(attr::alpha), nullptr}; + } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { + return {grads.at(0), nullptr, nullptr}; + + } else if (node->matches("aten::add(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, grads.at(0)}; + } else if (node->kind() == prim::AutogradAdd) { return {grads.at(0), grads.at(0)}; } else if (node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor")) { return {grads.at(0), -grads.at(0) * node->namedInput(attr::alpha), nullptr}; + } else if (node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { + return {grads.at(0), nullptr, nullptr}; + + } else if (node->matches("aten::sub(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, -grads.at(0)}; + } else if (node->matches("aten::mul(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; + } else if (node->matches("aten::mul(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0) * inputs.at(1), nullptr}; + + } else if (node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, grads.at(0) * inputs.at(0)}; + + } else if (node->matches("aten::div(Tensor self, Tensor other) -> Tensor")) { + return {grads.at(0) / inputs.at(1), -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; + + } else if (node->matches("aten::div(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0) / inputs.at(1), nullptr}; + + } else if (node->matches("aten::div(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; + } else if (node->matches("aten::sigmoid(Tensor self) -> Tensor")) { return 
{grads.at(0) * outputs.at(0) * (1 - outputs.at(0))}; @@ -130,6 +197,78 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->matches("aten::neg(Tensor self) -> Tensor")) { return {-grads.at(0)}; + } else if (node->matches("aten::abs(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).sign()}; + + } else if (node->matches("aten::acos(Tensor self) -> Tensor")) { + return {grads.at(0) * -((-inputs.at(0) * inputs.at(0) + at::Scalar(1)).rsqrt())}; + + } else if (node->matches("aten::asin(Tensor self) -> Tensor")) { + return {grads.at(0) * (-inputs.at(0) * inputs.at(0) + at::Scalar(1)).rsqrt()}; + + } else if (node->matches("aten::atan(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) * inputs.at(0) + at::Scalar(1))}; + + } else if (node->matches("aten::ceil(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + + } else if (node->matches("aten::cos(Tensor self) -> Tensor")) { + return {grads.at(0) * -inputs.at(0).sin()}; + + } else if (node->matches("aten::cosh(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).sinh()}; + + } else if (node->matches("aten::exp(Tensor self) -> Tensor")) { + return {grads.at(0) * outputs.at(0)}; + + } else if (node->matches("aten::expm1(Tensor self) -> Tensor")) { + return {grads.at(0) * (outputs.at(0) + at::Scalar(1))}; + + } else if (node->matches("aten::floor(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + + } else if (node->matches("aten::fmod(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0), nullptr}; + + } else if (node->matches("aten::frac(Tensor self) -> Tensor")) { + return {grads.at(0)}; + + } else if (node->matches("aten::log(Tensor self) -> Tensor")) { + return {grads.at(0) / inputs.at(0)}; + + } else if (node->matches("aten::log10(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) * 2.3025850929940456)}; + + } else if (node->matches("aten::log1p(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) + at::Scalar(1))}; + + } else if (node->matches("aten::log2(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) * 0.6931471805599453)}; + + } else if (node->matches("aten::reciprocal(Tensor self) -> Tensor")) { + return {-grads.at(0) * outputs.at(0) * outputs.at(0)}; + + } else if (node->matches("aten::remainder(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0), nullptr}; + + } else if (node->matches("aten::round(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + + } else if (node->matches("aten::rsqrt(Tensor self) -> Tensor")) { + return {grads.at(0) * outputs.at(0).pow(3.) * -0.5}; + + } else if (node->matches("aten::sin(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).cos()}; + + } else if (node->matches("aten::sinh(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).cosh()}; + + } else if (node->matches("aten::tan(Tensor self) -> Tensor")) { + return {grads.at(0) * (1. 
+ outputs.at(0) * outputs.at(0))}; + + } else if (node->matches("aten::trunc(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + } else if (node->kind() == prim::ConstantChunk) { return {SymbolicVariable::cat(grads, node->i(attr::dim))}; diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 492faade8de612..d071c464721559 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -361,6 +361,14 @@ struct GraphExecutorImpl { return state; } + // This function should be used only for testing purposes + void debugDisableAutodiffSubgraphInlining() { + // Allow single-node autodiff subgraphs + autodiffSubgraphNodeThreshold = 1; + // Don't inline autodiff subgraphs into autograd functions + autodiffSubgraphInlineThreshold = 1; + } + private: friend struct GraphExecutor; @@ -416,14 +424,14 @@ struct GraphExecutorImpl { // Phase 5. Apply non-differentiable optimizations to the graphs we've found // (or the whole grpah if we know we won't need its derivative). if (needsGradient(opt_graph)) { - auto diff_nodes = CreateAutodiffSubgraphs(*opt_graph); + auto diff_nodes = CreateAutodiffSubgraphs(*opt_graph, autodiffSubgraphNodeThreshold); for (Node * dnode : diff_nodes) { auto diff_graph = std::move(dnode->g(attr::Subgraph)); Gradient gradient = differentiate(diff_graph); runNondiffOptimization(gradient.f); packGradient(gradient, dnode); } - InlineAutodiffSubgraphs(opt_graph); + InlineAutodiffSubgraphs(opt_graph, autodiffSubgraphInlineThreshold); } else { runNondiffOptimization(opt_graph); } @@ -523,6 +531,10 @@ struct GraphExecutorImpl { // GraphExecutors can be accessed from multiple threads, so this thread needs to be // held every time we access the fallback or plan_cache. 
std::mutex compile_mutex; + + // Some tunable parameters + size_t autodiffSubgraphNodeThreshold = 2; + size_t autodiffSubgraphInlineThreshold = 5; }; GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize) @@ -544,6 +556,10 @@ GraphExecutorState GraphExecutor::getDebugState() { return pImpl->getDebugState(); } +void GraphExecutor::debugDisableAutodiffSubgraphInlining() { + return pImpl->debugDisableAutodiffSubgraphInlining(); +} + void runRequiredPasses(const std::shared_ptr& g) { specializeUndef(*g); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 7e644273a5b07c..08688a8c8cab37 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -36,6 +36,7 @@ struct TORCH_API GraphExecutor { std::shared_ptr graph() const; std::shared_ptr graphFor(const Stack& inputs) const; GraphExecutorState getDebugState(); + void debugDisableAutodiffSubgraphInlining(); private: std::shared_ptr pImpl; }; diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index c09caf4c3702f9..f0dfda81cc0926 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -487,6 +487,12 @@ void initJitScriptBindings(PyObject* module) { } throw std::runtime_error("Attempted to call get_debug_state on a Module without a compiled forward()"); }) + .def("debug_disable_autodiff_subgraph_inlining", [](Module& self) { + if (self.find_method("forward")) { + Method & m = self.get_method("forward"); + m.debugDisableAutodiffSubgraphInlining(); + } + }) .def("forward", [](Module& self, py::args args, py::kwargs kwargs) { // We implement this in C++ to avoid incurring the pybind11 dispatch // overhead twice: once to call into the method lookup for "forward" @@ -515,6 +521,7 @@ void initJitScriptBindings(PyObject* module) { auto schema = extractSchemaFromDef(def, is_method); self.setSchema(schema); }) + .def("debug_disable_autodiff_subgraph_inlining", &Method::debugDisableAutodiffSubgraphInlining) .def("pretty_print_schema", &Method::pretty_print_schema); m.def("_jit_script_compile", [](const Def &def, ResolutionCallback rcb) { diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index caf084d074ba91..50ae9f48fb3c93 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -165,6 +165,10 @@ struct Method { return get_executor().getDebugState(); } + void debugDisableAutodiffSubgraphInlining() { + return get_executor().debugDisableAutodiffSubgraphInlining(); + } + bool is_optimized() { return optimize; } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index 3e38b4323da329..daac5d48d1d895 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -56,6 +56,9 @@ struct SymbolicVariable { SymbolicVariable operator*(const SymbolicVariable rhs) const { return create(aten::mul, {*this, rhs})[0].typeLike(*this); } + SymbolicVariable operator/(const SymbolicVariable rhs) const { + return create(aten::div, {*this, rhs})[0].typeLike(*this); + } SymbolicVariable operator*(at::Scalar rhs) const { if (isConstInt(rhs, 1)) return *this; @@ -170,6 +173,30 @@ struct SymbolicVariable { Node * unpack = g->insertNode(g->create(prim::ListUnpack, {output_list}, inputs.size())); return fmap(unpack->outputs()); } + static SymbolicVariable zeros_like(const SymbolicVariable input) { + return create(t("zeros_like"), {input})[0]; + } + SymbolicVariable cos() const { + return create(t("cos"), {*this})[0]; + } + SymbolicVariable cosh() 
const { + return create(t("cosh"), {*this})[0]; + } + SymbolicVariable pow(at::Scalar other) const { + return create(t("pow"), {*this, insertConstant(other)})[0]; + } + SymbolicVariable rsqrt() const { + return create(t("rsqrt"), {*this})[0]; + } + SymbolicVariable sign() const { + return create(t("sign"), {*this})[0]; + } + SymbolicVariable sin() const { + return create(t("sin"), {*this})[0]; + } + SymbolicVariable sinh() const { + return create(t("sinh"), {*this})[0]; + } SymbolicVariable sum() const { return create(t("sum"), {*this})[0]; } From 02d7c88fa489f57bf1266f1db49b089cc7895d70 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Wed, 26 Sep 2018 08:43:38 -0700 Subject: [PATCH 47/51] Unify versions across setup.py, libtorch, and libcaffe2 (#12053) Summary: This unifies our versions across setup.py, libtorch, and libcaffe2. CMake has a default version (bumped to 1.0.0) that can be overridden by setup.py. The versions are also printed as a part of cmake/Summary.cmake to make sure they are correct. cc Yangqing ezyang soumith goldsborough pjh5 Pull Request resolved: https://github.com/pytorch/pytorch/pull/12053 Differential Revision: D10041878 Pulled By: orionr fbshipit-source-id: a98a01771f6c008d1016ab63ab785c3a88c3ddb0 --- CMakeLists.txt | 27 ++++++++++++++++----------- cmake/Summary.cmake | 2 ++ cmake/Utils.cmake | 15 +++++++++++++++ setup.py | 1 + tools/build_pytorch_libs.bat | 1 + tools/build_pytorch_libs.sh | 1 + torch/CMakeLists.txt | 6 ------ 7 files changed, 36 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bf587dc30fc97..488605d5ea459e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,12 +10,6 @@ if (NOT MSVC) set(CMAKE_C_STANDARD 11) endif() -set(CAFFE2_VERSION_MAJOR 0) -set(CAFFE2_VERSION_MINOR 8) -set(CAFFE2_VERSION_PATCH 2) -set(CAFFE2_VERSION - "${CAFFE2_VERSION_MAJOR}.${CAFFE2_VERSION_MINOR}.${CAFFE2_VERSION_PATCH}") - # One variable that determines whether the current cmake process is being run # with the main Caffe2 library. This is useful for building modules - if # modules are built with the main Caffe2 library then one does not need to do @@ -139,6 +133,22 @@ if (ANDROID OR IOS) set(BUILD_ATEN_MOBILE ON) endif() +# ---[ Utils +# TODO: merge the following 3 files into cmake/public/utils.cmake. +include(cmake/Utils.cmake) +include(cmake/public/utils.cmake) + +# ---[ Version numbers for generated libraries +set(TORCH_DEFAULT_VERSION "1.0.0") +set(TORCH_BUILD_VERSION "${TORCH_DEFAULT_VERSION}" CACHE STRING "Torch build version") +if (NOT TORCH_BUILD_VERSION) + # An empty string was specified so force version to the default + set(TORCH_BUILD_VERSION "${TORCH_DEFAULT_VERSION}" + CACHE STRING "Torch build version" FORCE) +endif() +caffe2_parse_version_str(TORCH ${TORCH_BUILD_VERSION}) +caffe2_parse_version_str(CAFFE2 ${TORCH_BUILD_VERSION}) + # ---[ CMake scripts + modules list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) @@ -165,11 +175,6 @@ include(cmake/MiscCheck.cmake) # External projects include(ExternalProject) -# ---[ Utils -# TODO: merge the following 3 files into cmake/public/utils.cmake. 
-include(cmake/Utils.cmake) -include(cmake/public/utils.cmake) - # ---[ Dependencies include(cmake/Dependencies.cmake) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3df260f3b49aad..58eae123dd1378 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -17,6 +17,8 @@ function (caffe2_print_configuration_summary) message(STATUS " CMAKE_INSTALL_PREFIX : ${CMAKE_INSTALL_PREFIX}") message(STATUS "") + message(STATUS " TORCH_VERSION : ${TORCH_VERSION}") + message(STATUS " CAFFE2_VERSION : ${CAFFE2_VERSION}") message(STATUS " BUILD_ATEN_MOBILE : ${BUILD_ATEN_MOBILE}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index c212805a7b0dc7..5505ae1f5c71bf 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -113,6 +113,21 @@ function(caffe_parse_header_single_define LIBNAME HDR_PATH VARNAME) endif() endfunction() +################################################################################################ +# Parses a version string that might have values beyond major, minor, and patch +# and set version variables for the library. +# Usage: +# caffe2_parse_version_str( ) +function(caffe2_parse_version_str LIBNAME VERSIONSTR) + string(REGEX REPLACE "^([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${VERSIONSTR}") + string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${VERSIONSTR}") + string(REGEX REPLACE "[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${VERSIONSTR}") + set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) +endfunction() + ############################################################################## # Helper function to automatically generate __init__.py files where python # sources reside but there are no __init__.py present. diff --git a/setup.py b/setup.py index 5132d357bdc8bb..94455ed1cf7be7 100644 --- a/setup.py +++ b/setup.py @@ -346,6 +346,7 @@ def build_libs(libs): build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')] my_env = os.environ.copy() my_env["PYTORCH_PYTHON"] = sys.executable + my_env["PYTORCH_BUILD_VERSION"] = version my_env["CMAKE_PREFIX_PATH"] = full_site_packages my_env["NUM_JOBS"] = str(NUM_JOBS) my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 123ba5f303e097..c924b593efc23f 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -172,6 +172,7 @@ goto:eof cd build cmake .. 
%CMAKE_GENERATOR_COMMAND% ^ -DCMAKE_BUILD_TYPE=%BUILD_TYPE% ^ + -DTORCH_BUILD_VERSION="%PYTORCH_BUILD_VERSION%" ^ -DBUILD_TORCH="%BUILD_TORCH%" ^ -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ -DNO_API=ON ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 01cb82f49c5967..184c60b7c444f2 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -273,6 +273,7 @@ function build_caffe2() { -DCMAKE_INSTALL_MESSAGE="LAZY" \ -DPYTHON_EXECUTABLE=$PYTORCH_PYTHON \ -DBUILDING_WITH_TORCH_LIBS=ON \ + -DTORCH_BUILD_VERSION="$PYTORCH_BUILD_VERSION" \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_TORCH=$BUILD_TORCH \ -DBUILD_PYTHON=$BUILD_PYTHON \ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index af5bfc0fdc8ef5..ce337e93c85463 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -12,12 +12,6 @@ endif() option(BUILD_TEST "Build torch test binaries" ON) option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) -# TODO: Unify with version from setup.py -set(TORCH_VERSION_MAJOR 0) -set(TORCH_VERSION_MINOR 4) -set(TORCH_VERSION_PATCH 1) -set(TORCH_VERSION "${TORCH_VERSION_MAJOR}.${TORCH_VERSION_MINOR}.${TORCH_VERSION_PATCH}") - set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(TORCH_ROOT "${TORCH_SRC_DIR}/..") From b535aecd7c84aead583f4e1339e8b0e15a5a92c4 Mon Sep 17 00:00:00 2001 From: vishwakftw Date: Wed, 26 Sep 2018 09:24:15 -0700 Subject: [PATCH 48/51] Fix warnings emitted when testing distributions (#12038) Summary: The earlier tests had around 80 warnings, and now there are 6 warnings: these are due to JIT The changes remove the wrapping of a Tensor by a Tensor constructor, which emits warnings due to the changes in https://github.com/pytorch/pytorch/pull/11061 . Pull Request resolved: https://github.com/pytorch/pytorch/pull/12038 Differential Revision: D10033392 Pulled By: apaszke fbshipit-source-id: b1faf368e650d062d7983f9932511bee4702a893 --- test/test_distributions.py | 240 +++++++++++++++--------------- torch/distributions/gumbel.py | 3 +- torch/distributions/transforms.py | 4 +- torch/distributions/weibull.py | 2 +- 4 files changed, 125 insertions(+), 124 deletions(-) diff --git a/test/test_distributions.py b/test/test_distributions.py index 8264337a681782..5fbc2003be27e4 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -102,12 +102,12 @@ def is_all_nan(tensor): ]), Example(Beta, [ { - 'concentration1': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), - 'concentration0': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), + 'concentration1': torch.randn(2, 3).exp().requires_grad_(), + 'concentration0': torch.randn(2, 3).exp().requires_grad_(), }, { - 'concentration1': torch.tensor(torch.exp(torch.randn(4)), requires_grad=True), - 'concentration0': torch.tensor(torch.exp(torch.randn(4)), requires_grad=True), + 'concentration1': torch.randn(4).exp().requires_grad_(), + 'concentration0': torch.randn(4).exp().requires_grad_(), }, ]), Example(Categorical, [ @@ -146,29 +146,29 @@ def is_all_nan(tensor): 'scale': torch.tensor([[1.0], [1.0]])} ]), Example(Chi2, [ - {'df': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True)}, - {'df': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True)}, + {'df': torch.randn(2, 3).exp().requires_grad_()}, + {'df': torch.randn(1).exp().requires_grad_()}, ]), Example(StudentT, [ - {'df': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True)}, - {'df': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True)}, + {'df': 
torch.randn(2, 3).exp().requires_grad_()}, + {'df': torch.randn(1).exp().requires_grad_()}, ]), Example(Dirichlet, [ - {'concentration': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True)}, - {'concentration': torch.tensor(torch.exp(torch.randn(4)), requires_grad=True)}, + {'concentration': torch.randn(2, 3).exp().requires_grad_()}, + {'concentration': torch.randn(4).exp().requires_grad_()}, ]), Example(Exponential, [ - {'rate': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True)}, - {'rate': torch.tensor(torch.randn(1).abs(), requires_grad=True)}, + {'rate': torch.randn(5, 5).abs().requires_grad_()}, + {'rate': torch.randn(1).abs().requires_grad_()}, ]), Example(FisherSnedecor, [ { - 'df1': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), - 'df2': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'df1': torch.randn(5, 5).abs().requires_grad_(), + 'df2': torch.randn(5, 5).abs().requires_grad_(), }, { - 'df1': torch.tensor(torch.randn(1).abs(), requires_grad=True), - 'df2': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'df1': torch.randn(1).abs().requires_grad_(), + 'df2': torch.randn(1).abs().requires_grad_(), }, { 'df1': torch.tensor([1.0]), @@ -177,22 +177,22 @@ def is_all_nan(tensor): ]), Example(Gamma, [ { - 'concentration': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), - 'rate': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), + 'concentration': torch.randn(2, 3).exp().requires_grad_(), + 'rate': torch.randn(2, 3).exp().requires_grad_(), }, { - 'concentration': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True), - 'rate': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True), + 'concentration': torch.randn(1).exp().requires_grad_(), + 'rate': torch.randn(1).exp().requires_grad_(), }, ]), Example(Gumbel, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, ]), Example(HalfCauchy, [ @@ -200,45 +200,45 @@ def is_all_nan(tensor): {'scale': torch.tensor([[1.0], [1.0]])} ]), Example(HalfNormal, [ - {'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True)}, - {'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True)}, + {'scale': torch.randn(5, 5).abs().requires_grad_()}, + {'scale': torch.randn(1).abs().requires_grad_()}, {'scale': torch.tensor([1e-5, 1e-5], requires_grad=True)} ]), Example(Independent, [ { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'reinterpreted_batch_ndims': 0, }, { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'reinterpreted_batch_ndims': 1, }, { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'reinterpreted_batch_ndims': 2, }, { 'base_distribution': Normal(torch.randn(2, 3, 5, requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'reinterpreted_batch_ndims': 2, }, { 'base_distribution': Normal(torch.randn(2, 3, 5, 
requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'reinterpreted_batch_ndims': 3, }, ]), Example(Laplace, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, { 'loc': torch.tensor([1.0, 0.0], requires_grad=True), @@ -248,11 +248,11 @@ def is_all_nan(tensor): Example(LogNormal, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, { 'loc': torch.tensor([1.0, 0.0], requires_grad=True), @@ -310,11 +310,11 @@ def is_all_nan(tensor): Example(Normal, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, { 'loc': torch.tensor([1.0, 0.0], requires_grad=True), @@ -332,8 +332,8 @@ def is_all_nan(tensor): 'alpha': 1.0 }, { - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), - 'alpha': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + 'scale': torch.randn(5, 5).abs().requires_grad_(), + 'alpha': torch.randn(5, 5).abs().requires_grad_() }, { 'scale': torch.tensor([1.0]), @@ -342,10 +342,10 @@ def is_all_nan(tensor): ]), Example(Poisson, [ { - 'rate': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'rate': torch.randn(5, 5).abs().requires_grad_(), }, { - 'rate': torch.tensor(torch.randn(3).abs(), requires_grad=True), + 'rate': torch.randn(3).abs().requires_grad_(), }, { 'rate': 0.2, @@ -382,23 +382,23 @@ def is_all_nan(tensor): Example(TransformedDistribution, [ { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'transforms': [], }, { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'transforms': ExpTransform(), }, { 'base_distribution': Normal(torch.randn(2, 3, 5, requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'transforms': [AffineTransform(torch.randn(3, 5), torch.randn(3, 5)), ExpTransform()], }, { 'base_distribution': Normal(torch.randn(2, 3, 5, requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'transforms': AffineTransform(1, 2), }, ]), @@ -418,8 +418,8 @@ def is_all_nan(tensor): ]), Example(Weibull, [ { - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), - 'concentration': torch.tensor(torch.randn(1).abs(), requires_grad=True) + 'scale': torch.randn(5, 5).abs().requires_grad_(), + 'concentration': torch.randn(1).abs().requires_grad_() } ]) ] @@ -922,7 +922,7 @@ def test_geometric_sample(self): 
'Geometric(prob={})'.format(prob)) def test_binomial(self): - p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) + p = torch.arange(0.05, 1, 0.1).requires_grad_() for total_count in [1, 2, 10]: self._gradcheck_log_prob(lambda p: Binomial(total_count, p), [p]) self._gradcheck_log_prob(lambda p: Binomial(total_count, None, p.log()), [p]) @@ -931,7 +931,7 @@ def test_binomial(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_binomial_log_prob(self): - probs = torch.tensor(torch.arange(0.05, 1, 0.1)) + probs = torch.arange(0.05, 1, 0.1) for total_count in [1, 2, 10]: def ref_log_prob(idx, x, log_prob): @@ -987,7 +987,7 @@ def test_binomial_vectorized_count(self): self.assertEqual(samples.var(dim=0), bin1.variance, prec=0.02) def test_negative_binomial(self): - p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) + p = torch.arange(0.05, 1, 0.1).requires_grad_() for total_count in [1, 2, 10]: self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, p), [p]) self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, None, p.log()), [p]) @@ -996,7 +996,7 @@ def test_negative_binomial(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_negative_binomial_log_prob(self): - probs = torch.tensor(torch.arange(0.05, 1, 0.1)) + probs = torch.arange(0.05, 1, 0.1) for total_count in [1, 2, 10]: def ref_log_prob(idx, x, log_prob): @@ -1142,8 +1142,8 @@ def test_one_hot_categorical_enumerate_support(self): self._check_enumerate_support(OneHotCategorical, examples) def test_poisson_shape(self): - rate = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - rate_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + rate = torch.randn(2, 3).abs().requires_grad_() + rate_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Poisson(rate).sample().size(), (2, 3)) self.assertEqual(Poisson(rate).sample((7,)).size(), (7, 2, 3)) self.assertEqual(Poisson(rate_1d).sample().size(), (1,)) @@ -1152,8 +1152,8 @@ def test_poisson_shape(self): @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_poisson_log_prob(self): - rate = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - rate_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + rate = torch.randn(2, 3).abs().requires_grad_() + rate_1d = torch.randn(1).abs().requires_grad_() def ref_log_prob(idx, x, log_prob): l = rate.view(-1)[idx].detach() @@ -1285,9 +1285,9 @@ def pmf(self, samples): def test_uniform(self): low = torch.zeros(5, 5, requires_grad=True) - high = torch.tensor(torch.ones(5, 5) * 3, requires_grad=True) + high = (torch.ones(5, 5) * 3).requires_grad_() low_1d = torch.zeros(1, requires_grad=True) - high_1d = torch.tensor(torch.ones(1) * 3, requires_grad=True) + high_1d = (torch.ones(1) * 3).requires_grad_() self.assertEqual(Uniform(low, high).sample().size(), (5, 5)) self.assertEqual(Uniform(low, high).sample((7,)).size(), (7, 5, 5)) self.assertEqual(Uniform(low_1d, high_1d).sample().size(), (1,)) @@ -1372,7 +1372,7 @@ def test_halfcauchy(self): scale.grad.zero_() def test_halfnormal(self): - std = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + std = torch.randn(5, 5).abs().requires_grad_() std_1d = torch.randn(1, requires_grad=True) std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(HalfNormal(std).sample().size(), (5, 5)) @@ -1398,7 +1398,7 @@ def test_halfnormal(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_halfnormal_logprob(self): - std = torch.tensor(torch.randn(5, 1).abs(), requires_grad=True) 
+ std = torch.randn(5, 1).abs().requires_grad_() def ref_log_prob(idx, x, log_prob): s = std.view(-1)[idx].detach() @@ -1417,9 +1417,9 @@ def test_halfnormal_sample(self): def test_lognormal(self): mean = torch.randn(5, 5, requires_grad=True) - std = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + std = torch.randn(5, 5).abs().requires_grad_() mean_1d = torch.randn(1, requires_grad=True) - std_1d = torch.randn(1, requires_grad=True) + std_1d = torch.randn(1).abs().requires_grad_() mean_delta = torch.tensor([1.0, 0.0]) std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(LogNormal(mean, std).sample().size(), (5, 5)) @@ -1447,7 +1447,7 @@ def test_lognormal(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_lognormal_logprob(self): mean = torch.randn(5, 1, requires_grad=True) - std = torch.tensor(torch.randn(5, 1).abs(), requires_grad=True) + std = torch.randn(5, 1).abs().requires_grad_() def ref_log_prob(idx, x, log_prob): m = mean.view(-1)[idx].detach() @@ -1533,9 +1533,9 @@ def test_logisticnormal_sample(self): def test_normal(self): loc = torch.randn(5, 5, requires_grad=True) - scale = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + scale = torch.randn(5, 5).abs().requires_grad_() loc_1d = torch.randn(1, requires_grad=True) - scale_1d = torch.randn(1, requires_grad=True) + scale_1d = torch.randn(1).abs().requires_grad_() loc_delta = torch.tensor([1.0, 0.0]) scale_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(Normal(loc, scale).sample().size(), (5, 5)) @@ -1590,11 +1590,11 @@ def test_lowrank_multivariate_normal_shape(self): # construct PSD covariance cov_factor = torch.randn(3, 1, requires_grad=True) - cov_diag = torch.tensor(torch.randn(3).abs(), requires_grad=True) + cov_diag = torch.randn(3).abs().requires_grad_() # construct batch of PSD covariances cov_factor_batched = torch.randn(6, 5, 3, 2, requires_grad=True) - cov_diag_batched = torch.tensor(torch.randn(6, 5, 3).abs(), requires_grad=True) + cov_diag_batched = torch.randn(6, 5, 3).abs().requires_grad_() # ensure that sample, batch, event shapes all handled correctly self.assertEqual(LowRankMultivariateNormal(mean, cov_factor, cov_diag) @@ -1634,7 +1634,7 @@ def test_lowrank_multivariate_normal_shape(self): def test_lowrank_multivariate_normal_log_prob(self): mean = torch.randn(3, requires_grad=True) cov_factor = torch.randn(3, 1, requires_grad=True) - cov_diag = torch.tensor(torch.randn(3).abs(), requires_grad=True) + cov_diag = torch.randn(3).abs().requires_grad_() cov = cov_factor.matmul(cov_factor.t()) + cov_diag.diag() # check that logprob values match scipy logpdf, @@ -1650,7 +1650,7 @@ def test_lowrank_multivariate_normal_log_prob(self): # Double-check that batched versions behave the same as unbatched mean = torch.randn(5, 3, requires_grad=True) cov_factor = torch.randn(5, 3, 2, requires_grad=True) - cov_diag = torch.tensor(torch.randn(5, 3).abs(), requires_grad=True) + cov_diag = torch.randn(5, 3).abs().requires_grad_() dist_batched = LowRankMultivariateNormal(mean, cov_factor, cov_diag) dist_unbatched = [LowRankMultivariateNormal(mean[i], cov_factor[i], cov_diag[i]) @@ -1668,7 +1668,7 @@ def test_lowrank_multivariate_normal_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] mean = torch.randn(5, requires_grad=True) cov_factor = torch.randn(5, 1, requires_grad=True) - cov_diag = torch.tensor(torch.randn(5).abs(), requires_grad=True) + cov_diag = torch.randn(5).abs().requires_grad_() cov = cov_factor.matmul(cov_factor.t()) + cov_diag.diag() 
self._check_sampler_sampler(LowRankMultivariateNormal(mean, cov_factor, cov_diag), @@ -1679,7 +1679,7 @@ def test_lowrank_multivariate_normal_sample(self): def test_lowrank_multivariate_normal_properties(self): loc = torch.randn(5) cov_factor = torch.randn(5, 2) - cov_diag = torch.tensor(torch.randn(5).abs()) + cov_diag = torch.randn(5).abs() cov = cov_factor.matmul(cov_factor.t()) + cov_diag.diag() m1 = LowRankMultivariateNormal(loc, cov_factor, cov_diag) m2 = MultivariateNormal(loc=loc, covariance_matrix=cov) @@ -1694,7 +1694,7 @@ def test_lowrank_multivariate_normal_moments(self): set_rng_seed(0) # see Note [Randomized statistical tests] mean = torch.randn(5) cov_factor = torch.randn(5, 2) - cov_diag = torch.tensor(torch.randn(5).abs()) + cov_diag = torch.randn(5).abs() d = LowRankMultivariateNormal(mean, cov_factor, cov_diag) samples = d.rsample((100000,)) empirical_mean = samples.mean(0) @@ -1709,13 +1709,13 @@ def test_multivariate_normal_shape(self): # construct PSD covariance tmp = torch.randn(3, 10) - cov = torch.tensor(torch.matmul(tmp, tmp.t()) / tmp.shape[-1], requires_grad=True) - prec = torch.tensor(cov.inverse(), requires_grad=True) - scale_tril = torch.tensor(torch.potrf(cov, upper=False), requires_grad=True) + cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() + prec = cov.inverse().requires_grad_() + scale_tril = torch.potrf(cov, upper=False).requires_grad_() # construct batch of PSD covariances tmp = torch.randn(6, 5, 3, 10) - cov_batched = torch.tensor((tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1), requires_grad=True) + cov_batched = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() prec_batched = [C.inverse() for C in cov_batched.view((-1, 3, 3))] prec_batched = torch.stack(prec_batched).view(cov_batched.shape) scale_tril_batched = [torch.potrf(C, upper=False) for C in cov_batched.view((-1, 3, 3))] @@ -1752,9 +1752,9 @@ def test_multivariate_normal_shape(self): def test_multivariate_normal_log_prob(self): mean = torch.randn(3, requires_grad=True) tmp = torch.randn(3, 10) - cov = torch.tensor(torch.matmul(tmp, tmp.t()) / tmp.shape[-1], requires_grad=True) - prec = torch.tensor(cov.inverse(), requires_grad=True) - scale_tril = torch.tensor(torch.potrf(cov, upper=False), requires_grad=True) + cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() + prec = cov.inverse().requires_grad_() + scale_tril = torch.potrf(cov, upper=False).requires_grad_() # check that logprob values match scipy logpdf, # and that covariance and scale_tril parameters are equivalent @@ -1773,7 +1773,7 @@ def test_multivariate_normal_log_prob(self): # Double-check that batched versions behave the same as unbatched mean = torch.randn(5, 3, requires_grad=True) tmp = torch.randn(5, 3, 10) - cov = torch.tensor((tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1), requires_grad=True) + cov = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() dist_batched = MultivariateNormal(mean, cov) dist_unbatched = [MultivariateNormal(mean[i], cov[i]) for i in range(mean.size(0))] @@ -1790,9 +1790,9 @@ def test_multivariate_normal_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] mean = torch.randn(3, requires_grad=True) tmp = torch.randn(3, 10) - cov = torch.tensor(torch.matmul(tmp, tmp.t()) / tmp.shape[-1], requires_grad=True) - prec = torch.tensor(cov.inverse(), requires_grad=True) - scale_tril = torch.tensor(torch.potrf(cov, upper=False), requires_grad=True) + cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() + prec = 
cov.inverse().requires_grad_() + scale_tril = torch.potrf(cov, upper=False).requires_grad_() self._check_sampler_sampler(MultivariateNormal(mean, cov), scipy.stats.multivariate_normal(mean.detach().numpy(), cov.detach().numpy()), @@ -1827,8 +1827,8 @@ def test_multivariate_normal_moments(self): self.assertEqual(d.variance, empirical_var, prec=0.05) def test_exponential(self): - rate = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) - rate_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + rate = torch.randn(5, 5).abs().requires_grad_() + rate_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Exponential(rate).sample().size(), (5, 5)) self.assertEqual(Exponential(rate).sample((7,)).size(), (7, 5, 5)) self.assertEqual(Exponential(rate_1d).sample((1,)).size(), (1, 1)) @@ -1863,7 +1863,7 @@ def test_exponential_sample(self): def test_laplace(self): loc = torch.randn(5, 5, requires_grad=True) - scale = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + scale = torch.randn(5, 5).abs().requires_grad_() loc_1d = torch.randn(1, requires_grad=True) scale_1d = torch.randn(1, requires_grad=True) loc_delta = torch.tensor([1.0, 0.0]) @@ -1914,10 +1914,10 @@ def test_laplace_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_gamma_shape(self): - alpha = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - beta = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - alpha_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) - beta_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) + alpha = torch.randn(2, 3).exp().requires_grad_() + beta = torch.randn(2, 3).exp().requires_grad_() + alpha_1d = torch.randn(1).exp().requires_grad_() + beta_1d = torch.randn(1).exp().requires_grad_() self.assertEqual(Gamma(alpha, beta).sample().size(), (2, 3)) self.assertEqual(Gamma(alpha, beta).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Gamma(alpha_1d, beta_1d).sample((1,)).size(), (1, 1)) @@ -1936,10 +1936,10 @@ def ref_log_prob(idx, x, log_prob): @unittest.skipIf(not TEST_CUDA, "CUDA not found") @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_gamma_gpu_shape(self): - alpha = torch.tensor(torch.exp(torch.randn(2, 3).cuda()), requires_grad=True) - beta = torch.tensor(torch.exp(torch.randn(2, 3).cuda()), requires_grad=True) - alpha_1d = torch.tensor(torch.exp(torch.randn(1).cuda()), requires_grad=True) - beta_1d = torch.tensor(torch.exp(torch.randn(1).cuda()), requires_grad=True) + alpha = torch.randn(2, 3).cuda().exp().requires_grad_() + beta = torch.randn(2, 3).cuda().exp().requires_grad_() + alpha_1d = torch.randn(1).cuda().exp().requires_grad_() + beta_1d = torch.randn(1).cuda().exp().requires_grad_() self.assertEqual(Gamma(alpha, beta).sample().size(), (2, 3)) self.assertEqual(Gamma(alpha, beta).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Gamma(alpha_1d, beta_1d).sample((1,)).size(), (1, 1)) @@ -1976,10 +1976,10 @@ def test_gamma_gpu_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_pareto(self): - scale = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - alpha = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - scale_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) - alpha_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + scale = torch.randn(2, 3).abs().requires_grad_() + alpha = torch.randn(2, 3).abs().requires_grad_() + scale_1d = torch.randn(1).abs().requires_grad_() + alpha_1d = torch.randn(1).abs().requires_grad_() 
self.assertEqual(Pareto(scale_1d, 0.5).mean, inf, allow_inf=True) self.assertEqual(Pareto(scale_1d, 0.5).variance, inf, allow_inf=True) self.assertEqual(Pareto(scale, alpha).sample().size(), (2, 3)) @@ -2008,9 +2008,9 @@ def test_pareto_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_gumbel(self): loc = torch.randn(2, 3, requires_grad=True) - scale = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + scale = torch.randn(2, 3).abs().requires_grad_() loc_1d = torch.randn(1, requires_grad=True) - scale_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + scale_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Gumbel(loc, scale).sample().size(), (2, 3)) self.assertEqual(Gumbel(loc, scale).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Gumbel(loc_1d, scale_1d).sample().size(), (1,)) @@ -2036,8 +2036,8 @@ def test_gumbel_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_fishersnedecor(self): - df1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - df2 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + df1 = torch.randn(2, 3).abs().requires_grad_() + df2 = torch.randn(2, 3).abs().requires_grad_() df1_1d = torch.randn(1).abs() df2_1d = torch.randn(1).abs() self.assertTrue(is_all_nan(FisherSnedecor(1, 2).mean)) @@ -2067,8 +2067,8 @@ def test_fishersnedecor_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_chi2_shape(self): - df = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - df_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) + df = torch.randn(2, 3).exp().requires_grad_() + df_1d = torch.randn(1).exp().requires_grad_() self.assertEqual(Chi2(df).sample().size(), (2, 3)) self.assertEqual(Chi2(df).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Chi2(df_1d).sample((1,)).size(), (1, 1)) @@ -2094,8 +2094,8 @@ def test_chi2_sample(self): @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_studentT(self): - df = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - df_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) + df = torch.randn(2, 3).exp().requires_grad_() + df_1d = torch.randn(1).exp().requires_grad_() self.assertTrue(is_all_nan(StudentT(1).mean)) self.assertTrue(is_all_nan(StudentT(1).variance)) self.assertEqual(StudentT(2).variance, inf, allow_inf=True) @@ -2135,8 +2135,8 @@ def test_studentT_log_prob(self): self.assertAlmostEqual(float(actual_log_prob[i]), float(expected_log_prob), places=3) def test_dirichlet_shape(self): - alpha = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - alpha_1d = torch.tensor(torch.exp(torch.randn(4)), requires_grad=True) + alpha = torch.randn(2, 3).exp().requires_grad_() + alpha_1d = torch.randn(4).exp().requires_grad_() self.assertEqual(Dirichlet(alpha).sample().size(), (2, 3)) self.assertEqual(Dirichlet(alpha).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Dirichlet(alpha_1d).sample().size(), (4,)) @@ -2163,10 +2163,10 @@ def test_dirichlet_sample(self): multivariate=True) def test_beta_shape(self): - con1 = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - con0 = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - con1_1d = torch.tensor(torch.exp(torch.randn(4)), requires_grad=True) - con0_1d = torch.tensor(torch.exp(torch.randn(4)), requires_grad=True) + con1 = torch.randn(2, 3).exp().requires_grad_() + con0 = torch.randn(2, 3).exp().requires_grad_() + con1_1d = torch.randn(4).exp().requires_grad_() + con0_1d = 
torch.randn(4).exp().requires_grad_() self.assertEqual(Beta(con1, con0).sample().size(), (2, 3)) self.assertEqual(Beta(con1, con0).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Beta(con1_1d, con0_1d).sample().size(), (4,)) @@ -2266,7 +2266,7 @@ def test_cdf_log_prob(self): for Dist, params in EXAMPLES: for i, param in enumerate(params): dist = Dist(**param) - samples = torch.tensor(dist.sample()) + samples = dist.sample() if samples.dtype.is_floating_point: samples.requires_grad_() try: @@ -3824,7 +3824,7 @@ def test_equality(self): def test_forward_inverse_cache(self): for transform in self.transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() try: y = transform(x) except NotImplementedError: @@ -3851,7 +3851,7 @@ def test_forward_inverse_cache(self): def test_forward_inverse_no_cache(self): for transform in self.transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() try: y = transform(x) x2 = transform.inv(y.clone()) # bypass cache @@ -3880,7 +3880,7 @@ def test_univariate_forward_jacobian(self): for transform in self.transforms: if transform.event_dim > 0: continue - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() try: y = transform(x) actual = transform.log_abs_det_jacobian(x, y) @@ -3897,7 +3897,7 @@ def test_univariate_inverse_jacobian(self): for transform in self.transforms: if transform.event_dim > 0: continue - y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + y = self._generate_data(transform.inv).requires_grad_() try: x = transform.inv(y) actual = transform.log_abs_det_jacobian(x, y) @@ -3977,7 +3977,7 @@ def test_transformed_distribution_shapes(self): def test_jit_fwd(self): for transform in self.unique_transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() def f(x): return transform(x) @@ -3988,12 +3988,12 @@ def f(x): continue # check on different inputs - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() self.assertEqual(f(x), traced_f(x)) def test_jit_inv(self): for transform in self.unique_transforms: - y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + y = self._generate_data(transform.inv).requires_grad_() def f(y): return transform.inv(y) @@ -4004,12 +4004,12 @@ def f(y): continue # check on different inputs - y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + y = self._generate_data(transform.inv).requires_grad_() self.assertEqual(f(y), traced_f(y)) def test_jit_jacobian(self): for transform in self.unique_transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() def f(x): y = transform(x) @@ -4021,7 +4021,7 @@ def f(x): continue # check on different inputs - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() self.assertEqual(f(x), traced_f(x)) diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index b489c8754aa441..7b9deaa8c1a8a8 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -33,7 +33,8 @@ def __init__(self, loc, scale, validate_args=None): if isinstance(loc, Number) and isinstance(scale, Number): 
base_dist = Uniform(finfo.tiny, 1 - finfo.eps) else: - base_dist = Uniform(self.loc.new(self.loc.size()).fill_(finfo.tiny), 1 - finfo.eps) + base_dist = Uniform(torch.full_like(self.loc, finfo.tiny), + torch.full_like(self.loc, 1 - finfo.eps)) transforms = [ExpTransform().inv, AffineTransform(loc=0, scale=-torch.ones_like(self.scale)), ExpTransform().inv, AffineTransform(loc=loc, scale=-self.scale)] super(Gumbel, self).__init__(base_dist, transforms, validate_args=validate_args) diff --git a/torch/distributions/transforms.py b/torch/distributions/transforms.py index a90cceefa5d6ca..00a52164f9780e 100644 --- a/torch/distributions/transforms.py +++ b/torch/distributions/transforms.py @@ -534,8 +534,8 @@ def _inverse_on_event(self, y): def _call(self, x): flat_x = x.contiguous().view((-1,) + x.shape[-2:]) - return torch.stack([self._call_on_event(z) for z in flat_x]).view(x.shape) + return torch.stack([self._call_on_event(flat_x[i]) for i in range(flat_x.size(0))]).view(x.shape) def _inverse(self, y): flat_y = y.contiguous().view((-1,) + y.shape[-2:]) - return torch.stack([self._inverse_on_event(z) for z in flat_y]).view(y.shape) + return torch.stack([self._inverse_on_event(flat_y[i]) for i in range(flat_y.size(0))]).view(y.shape) diff --git a/torch/distributions/weibull.py b/torch/distributions/weibull.py index 8b5afee400b780..97a50fdd6e4afa 100644 --- a/torch/distributions/weibull.py +++ b/torch/distributions/weibull.py @@ -27,7 +27,7 @@ class Weibull(TransformedDistribution): def __init__(self, scale, concentration, validate_args=None): self.scale, self.concentration = broadcast_all(scale, concentration) self.concentration_reciprocal = self.concentration.reciprocal() - base_dist = Exponential(self.scale.new(self.scale.size()).fill_(1.0)) + base_dist = Exponential(torch.ones_like(self.scale)) transforms = [PowerTransform(exponent=self.concentration_reciprocal), AffineTransform(loc=0, scale=self.scale)] super(Weibull, self).__init__(base_dist, From 18f9c07b183a4b6bd81ecf85e85a305fc28c55fb Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Wed, 26 Sep 2018 09:27:23 -0700 Subject: [PATCH 49/51] Enable tracing of tensor factories with an out argument Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12051 Differential Revision: D10044890 Pulled By: apaszke fbshipit-source-id: 2d794bf408875600bc71f354f0b4961d6b715094 --- test/test_jit.py | 7 ++++- tools/autograd/gen_variable_type.py | 40 ++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 7e342a9fe7a434..a448362b470bbf 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1224,13 +1224,18 @@ def run(**kwargs): def fn(x): return x + torch.ones(2, 3, **kwargs) - input = torch.ones(2, 3, **kwargs) + + input_kwargs = kwargs.copy() + if 'out' in input_kwargs: + del input_kwargs['out'] + input = torch.ones(2, 3, **input_kwargs) self.checkTrace(fn, (input,), inputs_require_grads=inputs_require_grads) # check we recorded 'ones' and did not just record a constant tfn = torch.jit.trace(fn, input) self.assertTrue("ones" in str(tfn.graph)) run() run(dtype=torch.int, inputs_require_grads=False) + run(out=torch.tensor([])) if RUN_CUDA: run(device="cuda:0") if RUN_CUDA_MULTI_GPU: diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index d09a07a7b550c4..9f65e7bd366b94 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -144,7 +144,7 @@ jit::tracer::ensureUnique("${name}", 
${mutable_input}); """) -ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${input}", ${input});""") +ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${name}", ${input});""") POST_RECORD_TRACE = CodeTemplate("""\ if (tracer_state) { @@ -154,6 +154,18 @@ """) +FACTORY_FUNCTION_NAMES = None + + +def find_factory_functions(declarations): + global FACTORY_FUNCTION_NAMES + FACTORY_FUNCTION_NAMES = set() + + for declaration in declarations: + if any(arg['simple_type'] == 'TensorOptions' for arg in declaration['arguments']): + FACTORY_FUNCTION_NAMES.add(declaration['api_name']) + + def should_trace(declaration): # Operations involving Storage or Type are not traceable at the moment if any(arg['simple_type'] in {'Storage', 'Type'} for arg in declaration['arguments']): @@ -185,17 +197,30 @@ def record_trace_outputs(declaration): def format_trace(declaration): local = {} + local['trace_name'] = trace_name = uninplace_api_name(declaration['api_name']) + + # *_out functions take the result as a first argument, but since we're + # going to de-inplace the call, we need to remove it from the argument list + trace_inputs = declaration['arguments'] + if declaration['name'].endswith('_out'): + trace_inputs = trace_inputs[1:] + trace_input_spec = [(i['name'], i['name']) for i in trace_inputs] + + # factories are a bit special because their out-of-place overloads + # take an extra TensorOptions argument, which is missing in the _out function + has_factory_name = trace_name in FACTORY_FUNCTION_NAMES + is_out_overload = any(arg['name'] == 'result' for arg in declaration['arguments']) + if has_factory_name and is_out_overload: + trace_input_spec.append(('result', 'result.options()')) + + local['add_trace_inputs'] = \ + '\n'.join(ADD_TRACE_INPUT.substitute(name=name, input=value) for name, value in trace_input_spec) - add_trace_inputs = [] - for argument in declaration['arguments']: - add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) - local['add_trace_inputs'] = '\n'.join(add_trace_inputs) - local['inplace_guard'] = '' # Record inplace operations as out-of-place operations (e.g., # not add_ but add) # TODO: Add a proper concept of side effects to the IR, and # properly record inplace operations. - local['trace_name'] = uninplace_api_name(declaration['api_name']) + local['inplace_guard'] = '' if local['trace_name'] != declaration['api_name']: local['inplace_guard'] = INPLACE_GUARD.substitute(name=declaration['api_name'], mutable_input=declaration['arguments'][0]['name']) @@ -214,6 +239,7 @@ def gen_variable_type(out, aten_declarations, template_path): implementation of each function dispatches to the base tensor type to compute the output. The grad_fn is attached to differentiable functions. """ + find_factory_functions(aten_declarations) VARIABLE_TYPE_H = CodeTemplate.from_file(template_path + '/VariableType.h') VARIABLE_TYPE_CPP = CodeTemplate.from_file(template_path + '/VariableType.cpp') From 44a17a038be26b3b7a82260ea2176494a4a2d485 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Wed, 26 Sep 2018 13:49:33 -0500 Subject: [PATCH 50/51] I believe we need to export these API for ROCm. 
--- aten/src/ATen/cuda/CUDAContext.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp index 58248acfe17951..0a4649d9c41ad4 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -54,15 +54,13 @@ Allocator* getCUDADeviceAllocator() { } /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle() { - return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); - } +cusparseHandle_t getCurrentCUDASparseHandle() { + return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); +} - cublasHandle_t getCurrentCUDABlasHandle() { - return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); - } -#endif +cublasHandle_t getCurrentCUDABlasHandle() { + return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); +} } // namespace cuda From 83cf9eb689842e1b56cca399db43b6042ef91080 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Wed, 26 Sep 2018 13:59:50 -0500 Subject: [PATCH 51/51] Do not ifdef this out either. --- aten/src/ATen/cuda/CUDAContext.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h index 83a890da4d535e..3a480d2ca4e4e3 100644 --- a/aten/src/ATen/cuda/CUDAContext.h +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -59,10 +59,8 @@ CAFFE2_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); CAFFE2_API Allocator* getCUDADeviceAllocator(); /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle(); CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle(); -#endif } // namespace cuda
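
Note on the recurring pattern in the test_distributions.py and torch/distributions hunks above: wrapping a freshly built tensor in torch.tensor(..., requires_grad=True) copies data that was only just created, so those hunks flip requires_grad in place instead, and they swap tensor.new(size).fill_(v) for the *_like factories. A minimal sketch of the equivalent patterns, using illustrative names that do not appear in the patch:

    import torch

    # Old style: copy the freshly built tensor just to get a gradient-tracking leaf.
    rate_old = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)

    # New style: the chain already yields a fresh tensor with no grad history,
    # so requires_grad_() turns it into an equivalent leaf without the copy.
    rate_new = torch.randn(2, 3).abs().requires_grad_()
    assert rate_new.is_leaf and rate_new.requires_grad

    # Old style: allocate "like scale" and fill manually.
    scale = torch.randn(5).abs()
    ones_old = scale.new(scale.size()).fill_(1.0)

    # New style: *_like factories keep dtype and device and are a single call.
    ones_new = torch.ones_like(scale)
    tiny_new = torch.full_like(scale, 1e-6)

Both forms produce leaf tensors that require grad; the *_like factories also carry over the source tensor's dtype and device, which is what the gumbel.py and weibull.py hunks rely on.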
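
Note on the gen_variable_type.py change in PATCH 49: when tracing a *_out overload, the generated code now drops the leading result argument, records the de-inplaced (out-of-place) name, and, for factory functions, supplies the missing TensorOptions by reading result.options(). A minimal sketch of the user-visible effect, mirroring the new test_jit.py case (out_buf is an illustrative name, not from the patch):

    import torch

    out_buf = torch.tensor([])

    def fn(x):
        # The tracer records this as the out-of-place aten::ones, taking
        # dtype/device from out_buf.options(), rather than baking in a constant.
        return x + torch.ones(2, 3, out=out_buf)

    traced = torch.jit.trace(fn, torch.ones(2, 3))
    assert "ones" in str(traced.graph)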