From bdbbcf068a45495765ea14a722fa46ed26f2bd84 Mon Sep 17 00:00:00 2001
From: Junjie Bai <bai@in.tum.de>
Date: Thu, 26 Jul 2018 08:21:28 -0700
Subject: [PATCH 01/10] Temporarily disable test_unique on rocm since it keeps
 running into segfault (#9872)

Summary:
petrex

https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-clang3.8-rocm1.7.1-ubuntu16.04-test/3758/console
https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-clang3.8-rocm1.7.1-ubuntu16.04-test/3757/console
https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-clang3.8-rocm1.7.1-ubuntu16.04-test/3752/console
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9872

Reviewed By: ezyang

Differential Revision: D9013335

Pulled By: bddppq

fbshipit-source-id: 80490a0fd4a86aa9c8454378c0edddc57d135c4e
---
 caffe2/python/hypothesis_test.py              |  3 ++-
 caffe2/python/hypothesis_test_util.py         |  7 ++++++-
 .../operator_test/spatial_bn_op_test.py       | 21 ++++++-------------
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py
index 81bedce653612..cb9932bc4542a 100644
--- a/caffe2/python/hypothesis_test.py
+++ b/caffe2/python/hypothesis_test.py
@@ -763,12 +763,13 @@ def ftrl(w, nz, i, g, alpha):
         self.assertReferenceChecks(gc, op, [var, nz, indices, grad, alpha],
                                    ftrl)
 
+    # TODO: (bddppq) test_unique keeps running into segfault on rocm 1.8.2
     @given(input=hu.tensor(max_value=20,
                            max_dim=1,
                            dtype=np.int32,
                            elements=st.integers(min_value=0, max_value=10)),
            with_remapping=st.booleans(),
-           **hu.gcs)
+           **hu.gcs_no_hip)
     def test_unique(self, input, with_remapping, gc, dc):
         op = core.CreateOperator(
             "Unique",
diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py
index e501a7d41d3ec..f640f6db20eff 100644
--- a/caffe2/python/hypothesis_test_util.py
+++ b/caffe2/python/hypothesis_test_util.py
@@ -252,7 +252,11 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None):
 cpu_do = caffe2_pb2.DeviceOption()
 gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA)
 hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP)
-device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) + ([hip_do] if workspace.has_hip_support else [])
+# (bddppq) Do not rely on this no_hip option! It's just used to
+# temporarily skip some flaky tests on ROCM before it's getting more mature.
+_device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else [])
+device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else [])
+
 # Include device option for each GPU
 expanded_device_options = [cpu_do] + (
     [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i)
@@ -275,6 +279,7 @@ def gradient_checker_device_option():
 
 gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do]))
 gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do]))
+gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip))
 
 
 @contextlib.contextmanager
diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py
index cbc83bed116c4..6854be44164b4 100644
--- a/caffe2/python/operator_test/spatial_bn_op_test.py
+++ b/caffe2/python/operator_test/spatial_bn_op_test.py
@@ -28,14 +28,11 @@ class TestSpatialBN(hu.HypothesisTestCase):
            order=st.sampled_from(["NCHW", "NHWC"]),
            epsilon=st.floats(min_value=1e-5, max_value=1e-2),
            inplace=st.sampled_from([True, False]),
-           **hu.gcs)
+           # Currently HIP SpatialBN only supports 2D
+           **hu.gcs_no_hip)
     def test_spatialbn_test_mode_3d(
             self, size, input_channels, batch_size, seed, order, epsilon,
             inplace, gc, dc):
-        # Currently HIP SpatialBN only supports 2D
-        if _run_in_hip(gc, dc):
-            return
-
         op = core.CreateOperator(
             "SpatialBN",
             ["X", "scale", "bias", "mean", "var"],
@@ -77,14 +74,11 @@ def reference_spatialbn_test(X, scale, bias, mean, var):
            order=st.sampled_from(["NCHW", "NHWC"]),
            epsilon=st.floats(min_value=1e-5, max_value=1e-2),
            inplace=st.sampled_from([True, False]),
-           **hu.gcs)
+           # Currently HIP SpatialBN only supports 2D
+           **hu.gcs_no_hip)
     def test_spatialbn_test_mode_1d(
             self, size, input_channels, batch_size, seed, order, epsilon,
             inplace, gc, dc):
-        # Currently HIP SpatialBN only supports 2D
-        if _run_in_hip(gc, dc):
-            return
-
         op = core.CreateOperator(
             "SpatialBN",
             ["X", "scale", "bias", "mean", "var"],
@@ -249,14 +243,11 @@ def test_spatialbn_train_mode_gradient_check(
            seed=st.integers(0, 65535),
            order=st.sampled_from(["NCHW", "NHWC"]),
            epsilon=st.floats(min_value=1e-5, max_value=1e-2),
-           **hu.gcs)
+           # Currently HIP SpatialBN only supports 2D
+           **hu.gcs_no_hip)
     def test_spatialbn_train_mode_gradient_check_1d(
             self, size, input_channels, batch_size, seed, order, epsilon,
             gc, dc):
-        # Currently HIP SpatialBN only supports 2D
-        if _run_in_hip(gc, dc):
-            return
-
         op = core.CreateOperator(
             "SpatialBN",
             ["X", "scale", "bias", "mean", "var"],

From 2c7e7e37a60a294e1a0583b7d92bff6b1e61cf55 Mon Sep 17 00:00:00 2001
From: Wei Yang <weiyang@fb.com>
Date: Thu, 26 Jul 2018 09:23:39 -0700
Subject: [PATCH 02/10] Corrected doc in class RNNCell (#9866)

Summary:
fixes #9642
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9866

Differential Revision: D9012131

Pulled By: weiyangfb

fbshipit-source-id: d2849b1a50234dbdb335dffab4835c9de85183c3
---
 torch/nn/modules/rnn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py
index c780807791407..9f726d12e7a72 100644
--- a/torch/nn/modules/rnn.py
+++ b/torch/nn/modules/rnn.py
@@ -579,7 +579,7 @@ class RNNCell(RNNCellBase):
 
     Attributes:
         weight_ih: the learnable input-hidden weights, of shape
-            `(input_size x hidden_size)`
+            `(hidden_size x input_size)`
         weight_hh: the learnable hidden-hidden weights, of shape
             `(hidden_size x hidden_size)`
         bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`

From cd5adc7b5f7a5d4415a936c5ceca219f14dbb319 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Thu, 26 Jul 2018 10:11:53 -0700
Subject: [PATCH 03/10] Remove template parameter from Tensor (#13)

Summary:
Pull Request resolved: https://github.com/facebookresearch/weakly-supervised-action-detection/pull/13

Pull Request resolved: https://github.com/pytorch/translate/pull/166

Pull Request resolved: https://github.com/pytorch/pytorch/pull/9125

Closes https://github.com/pytorch/pytorch/pull/9125

Use inheritance for polymorphism, and remove template parameter
This is to change the templating in call sites, the core implementations will change later

Before Caffe2 Tensor class was compile-time fixed to bind to a particular device/context. With this change, we're making it a runtime property (stored inside the tensor), but preserve the same semantics. For example, one has to specify device type in order to create a Tensor - there are no uninitialized tensors. More specifically the changes are:

1. We added an extra argument *DeviceType* to most of the constructors of the tensor, e.g. (Tensor(DeviceType type)),
2. Semantics of constructor Tensor(const Tensor<SrcContext>& src, ContextForCopy* context); is changed, in this constructor, the second context is passed in to enable us to call the templated Copy function, it could be in a different context as source and target previously, now we'll enforce that the context should have same device type as src, if it is provided.
3. To preserve 'get-or-construct' semantics of Blob, we added specialized getter Blob::GetMutableTensor that verifies both that Blob contains a Tensor and that it's of a correct type
4. Specifically, Tensor type is not default-constructible any more (as we don't have unknown device tensors) and thus some of the code handling STL containers needs to change

Note: Some changes are postponed just to keep this diff a bit smaller. Please see `TODO`s.

Reviewed By: xw285cornell

Differential Revision: D8121878

fbshipit-source-id: 4a5e9a677ba4ac82095df959851a054c81eccf81
---
 binaries/benchmark_helper.cc                  |   6 +-
 binaries/benchmark_helper.h                   |   2 +-
 binaries/core_overhead_benchmark.cc           |   6 +-
 binaries/print_core_object_sizes.cc           |   3 +-
 binaries/speed_benchmark.cc                   |   2 +-
 caffe2/contrib/aten/aten_op_template.h        |  14 +-
 caffe2/contrib/gloo/common.cc                 |   2 +-
 caffe2/contrib/nccl/cuda_nccl_op_gpu.cc       |  10 +-
 .../contrib/nervana/nervana_fc_op_gpu_test.cc |   6 +-
 caffe2/contrib/warpctc/ctc_op.h               |  14 +-
 caffe2/core/allocator.cc                      |   2 +-
 caffe2/core/blob.h                            |  29 +-
 caffe2/core/blob_gpu_test.cc                  |  34 +-
 caffe2/core/blob_serialization.cc             | 416 +++++++-
 caffe2/core/blob_serialization.h              | 445 +-------
 caffe2/core/blob_serialization_gpu.cc         |  15 +-
 caffe2/core/blob_test.cc                      | 105 +-
 caffe2/core/context.cc                        |  13 +
 caffe2/core/context.h                         | 150 ++-
 caffe2/core/context_base.cc                   |   5 +
 caffe2/core/context_base.h                    | 187 ++++
 caffe2/core/context_gpu.cu                    |  48 +-
 caffe2/core/context_gpu.h                     |  89 +-
 caffe2/core/context_test.cc                   |   2 +-
 caffe2/core/dispatch/CMakeLists.txt           |   1 +
 caffe2/core/dispatch/OpSchema.h               |  38 +-
 caffe2/core/dispatch/OpSchema_test.cpp        |  13 +-
 caffe2/core/hip/blob_serialization_hip.cc     |  12 +-
 caffe2/core/hip/context_hip.cc                |  79 +-
 caffe2/core/hip/context_hip.h                 |  88 +-
 caffe2/core/int8_serialization.cc             |   2 +-
 caffe2/core/operator.h                        |  75 +-
 caffe2/core/plan_executor.cc                  |   3 +-
 caffe2/core/predictor.cc                      |   8 +-
 caffe2/core/predictor_test.cc                 |  10 +-
 caffe2/core/tensor.cc                         |  64 +-
 caffe2/core/tensor.h                          | 226 ++--
 caffe2/core/tensor_int8.h                     |   3 +-
 caffe2/core/typeid.cc                         |   3 +-
 caffe2/core/typeid.h                          |  50 +-
 caffe2/core/workspace.h                       |   6 +-
 .../fully_connected_op_decomposition.h        |  12 +-
 .../operators/fully_connected_op_prune.h      |   8 +-
 .../operators/fully_connected_op_sparse.h     |   4 +-
 .../operators/sparse_matrix_reshape_op.h      |   1 -
 caffe2/ideep/operators/concat_split_op.cc     |   4 +-
 .../ideep/operators/operator_fallback_ideep.h |   9 +-
 caffe2/ideep/operators/utility_ops.cc         |   8 +-
 caffe2/ideep/utils/ideep_context.h            |  72 +-
 caffe2/ideep/utils/ideep_register.cc          |  12 +-
 caffe2/image/image_input_op.h                 |  45 +-
 caffe2/image/transform_gpu.cu                 |  33 +-
 caffe2/image/transform_gpu.h                  |   9 +-
 caffe2/mkl/mkl_utils_test.cc                  |  14 +-
 caffe2/mkl/operators/conv_op.cc               |   6 +-
 caffe2/mkl/operators/conv_op_mkldnn.cc        |   2 +-
 caffe2/mkl/operators/operator_fallback_mkl.h  |   4 +-
 caffe2/mkl/operators/packed_fc_op.cc          |   4 +-
 caffe2/mkl/operators/pool_op.cc               |   4 +-
 caffe2/mkl/operators/utility_ops.cc           |   4 +-
 caffe2/mkl/utils/mkl_context.cc               |   8 +
 caffe2/mkl/utils/mkl_context.h                |  93 +-
 caffe2/mobile/contrib/CMakeLists.txt          |   9 +-
 .../contrib/arm-compute/operators/copy_op.cc  |   4 +-
 .../arm-compute/test/gl_operator_test.h       |   2 +-
 caffe2/mobile/contrib/ios/ios_caffe.cc        |   2 +-
 .../mobile/contrib/ios/ios_caffe_predictor.cc |   2 +-
 caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm    |   6 +-
 caffe2/mobile/contrib/ios/pool_test.cc        |   2 +-
 caffe2/mobile/contrib/ios/resize_test.cc      |   2 +-
 caffe2/mobile/contrib/nnapi/nnapi.cc          |   2 +-
 .../mobile/contrib/nnapi/nnapi_benchmark.cc   |  22 +-
 caffe2/mobile/contrib/nnapi/nnapi_test.cc     |  28 +-
 caffe2/mobile/contrib/opengl/CMakeLists.txt   |  17 +-
 .../mobile/contrib/opengl/test/opengl_test.cc |  94 +-
 caffe2/mobile/contrib/snpe/snpe_op.cc         |   3 +-
 .../mobile/contrib/snpe/snpe_op_benchmark.cc  |  34 +-
 caffe2/mobile/contrib/ulp2/ulp.cc             |  16 +-
 caffe2/mobile/contrib/ulp2/ulp_neon.cc        |   2 +-
 caffe2/mobile/contrib/ulp2/ulp_test.cc        |  28 +-
 caffe2/mpi/mpi_gpu_test.cc                    |  17 +-
 caffe2/mpi/mpi_ops.h                          |  15 +-
 caffe2/observers/profile_observer_gpu.cc      |  23 +-
 caffe2/operators/accuracy_op.cc               |  37 +-
 caffe2/operators/accuracy_op.cu               |   2 +-
 caffe2/operators/affine_channel_op.cc         |  17 +-
 caffe2/operators/affine_channel_op.cu         |  17 +-
 caffe2/operators/apmeter_op.cc                |   4 +-
 caffe2/operators/assert_op.h                  |   2 +-
 caffe2/operators/atomic_ops.cc                |   6 +-
 caffe2/operators/batch_gather_ops.cu          |   6 +-
 caffe2/operators/batch_gather_ops.h           |   7 +-
 caffe2/operators/batch_matmul_op.h            |   4 +-
 caffe2/operators/batch_matmul_op_gpu_test.cc  |   8 +-
 caffe2/operators/batch_matmul_op_test.cc      |   4 +-
 caffe2/operators/bbox_transform_op.cc         |   6 +-
 caffe2/operators/boolean_mask_ops.cc          |  23 +-
 caffe2/operators/boolean_mask_ops.cu          |  29 +-
 caffe2/operators/boolean_unmask_ops.cu        |  22 +-
 caffe2/operators/boolean_unmask_ops_test.cc   |   6 +-
 caffe2/operators/box_with_nms_limit_op.cc     |  18 +-
 caffe2/operators/ceil_op.cu                   |   2 +-
 caffe2/operators/channel_backprop_stats_op.cc |   6 +-
 caffe2/operators/channel_backprop_stats_op.cu |   4 +-
 caffe2/operators/channel_backprop_stats_op.h  |   4 +-
 caffe2/operators/channel_shuffle_op_gpu.cu    |  14 +-
 caffe2/operators/channel_stats_op.cc          |   6 +-
 caffe2/operators/channel_stats_op.cu          |   4 +-
 caffe2/operators/channel_stats_op.h           |   4 +-
 caffe2/operators/clip_op.cc                   |   4 +-
 caffe2/operators/clip_op.cu                   |  24 +-
 ...ect_and_distribute_fpn_rpn_proposals_op.cc |  16 +-
 caffe2/operators/concat_split_op.h            |  10 +-
 caffe2/operators/conditional_op.cc            |   4 +-
 caffe2/operators/conv_op.h                    |  16 +-
 caffe2/operators/conv_op_impl.h               |  12 +-
 caffe2/operators/conv_op_shared.cc            |   6 +-
 caffe2/operators/conv_op_shared.h             |   4 +-
 caffe2/operators/conv_op_shared_gpu.cc        |   8 +-
 caffe2/operators/conv_pool_op_base.h          |  20 +-
 caffe2/operators/conv_transpose_op.h          |   8 +-
 caffe2/operators/conv_transpose_op_impl.h     |  12 +-
 caffe2/operators/conv_transpose_op_mobile.h   |   2 +-
 .../operators/conv_transpose_op_mobile_impl.h |   6 +-
 .../conv_transpose_op_mobile_test.cc          |  18 +-
 .../operators/conv_transpose_unpool_op_base.h |   5 +-
 .../cosine_embedding_criterion_op.cc          |   4 +-
 .../cosine_embedding_criterion_op.cu          |   6 +-
 caffe2/operators/counter_ops.h                |  10 +-
 caffe2/operators/cross_entropy_op.cc          |  26 +-
 caffe2/operators/cross_entropy_op.cu          |  59 +-
 .../operators/ctc_beam_search_decoder_op.cc   |   3 +-
 caffe2/operators/ctc_greedy_decoder_op.cc     |   5 +-
 caffe2/operators/dataset_ops.cc               |  62 +-
 caffe2/operators/dataset_ops.h                |   3 +-
 caffe2/operators/deform_conv_op.h             |  16 +-
 caffe2/operators/deform_conv_op_impl.h        |   8 +-
 .../operators/depthwise_3x3_conv_op_cudnn.cu  |   4 +-
 caffe2/operators/distance_op.cc               |  24 +-
 caffe2/operators/distance_op.cu               |  60 +-
 caffe2/operators/distance_op.h                |   4 +-
 caffe2/operators/dropout_op.cc                |  20 +-
 caffe2/operators/dropout_op.cu                |  18 +-
 caffe2/operators/dropout_op_cudnn.cc          |  11 +-
 caffe2/operators/elementwise_linear_op.cc     |  10 +-
 caffe2/operators/elementwise_linear_op.cu     |  23 +-
 caffe2/operators/elementwise_logical_ops.cc   |   4 +-
 caffe2/operators/elementwise_logical_ops.h    |   4 +-
 caffe2/operators/elementwise_op_test.h        |  32 +-
 caffe2/operators/elementwise_ops.cu           |   6 +-
 caffe2/operators/elementwise_ops.h            |   4 +-
 caffe2/operators/elementwise_ops_utils.cc     |  39 +
 caffe2/operators/elementwise_ops_utils.h      |  44 +-
 caffe2/operators/enforce_finite_op.cu         |   2 +-
 caffe2/operators/enforce_finite_op.h          |   4 +-
 caffe2/operators/ensure_cpu_output_op.h       |  10 +-
 caffe2/operators/expand_op.h                  |   2 +-
 caffe2/operators/feature_maps_ops.h           |  18 +-
 caffe2/operators/filler_op.cc                 |   7 +-
 caffe2/operators/filler_op.cu                 |   6 +-
 caffe2/operators/filler_op.h                  |  38 +-
 caffe2/operators/find_op.cu                   |   2 +-
 caffe2/operators/flatten_op.h                 |   2 +-
 caffe2/operators/floor_op.cu                  |   2 +-
 caffe2/operators/fully_connected_op.h         |   5 +-
 .../operators/gather_fused_8bit_rowwise_op.h  |   2 +-
 caffe2/operators/gather_ranges_to_dense_op.h  |   4 +-
 caffe2/operators/generate_proposals_op.cc     |   4 +-
 .../operators/generate_proposals_op_test.cc   |  16 +-
 caffe2/operators/given_tensor_fill_op.h       |  12 +-
 caffe2/operators/group_norm_op.h              |   4 +-
 caffe2/operators/gru_unit_op.h                |  10 +-
 caffe2/operators/h_softmax_op.cc              |  72 +-
 caffe2/operators/h_softmax_op.h               |   6 +-
 caffe2/operators/half_float_ops.cu            |   4 +-
 caffe2/operators/if_op.h                      |   2 +-
 caffe2/operators/index_ops.cc                 |  24 +-
 caffe2/operators/instance_norm_op.cu          |  24 +-
 caffe2/operators/instance_norm_op.h           |   8 +-
 caffe2/operators/integral_image_op.cu         |  14 +-
 caffe2/operators/integral_image_op.h          |   2 +-
 caffe2/operators/jsd_op.cc                    |   4 +-
 caffe2/operators/last_n_window_collector.cc   |   6 +-
 caffe2/operators/layer_norm_op.cu             |  14 +-
 caffe2/operators/layer_norm_op.h              |  14 +-
 caffe2/operators/leaky_relu_op.cc             |   2 +-
 caffe2/operators/leaky_relu_op.cu             |   4 +-
 caffe2/operators/lengths_pad_op.h             |  11 +-
 caffe2/operators/lengths_tile_op.h            |   9 +-
 caffe2/operators/listwise_l2r_op.cc           |   6 +-
 caffe2/operators/listwise_l2r_op.h            |  18 +-
 caffe2/operators/load_save_op.h               |   2 +-
 .../local_response_normalization_op.cc        |  43 +-
 .../local_response_normalization_op.cu        |  39 +-
 .../local_response_normalization_op.h         |   8 +-
 caffe2/operators/locally_connected_op.h       |  40 +-
 caffe2/operators/locally_connected_op_impl.h  |  24 +-
 caffe2/operators/logit_op.cu                  |   6 +-
 caffe2/operators/lp_pool_op.cc                |  12 +-
 caffe2/operators/lp_pool_op.cu                | 176 ++--
 caffe2/operators/lpnorm_op.cc                 |  14 +-
 caffe2/operators/lstm_unit_op.h               |   4 +-
 caffe2/operators/map_ops.h                    |  10 +-
 .../operators/margin_ranking_criterion_op.cc  |  12 +-
 .../operators/margin_ranking_criterion_op.cu  |   6 +-
 caffe2/operators/max_pool_with_index.cu       |  42 +-
 caffe2/operators/mem_query_op.cu              |  10 +-
 caffe2/operators/multi_class_accuracy_op.cc   |   4 +-
 caffe2/operators/multi_class_accuracy_op.cu   |   4 +-
 caffe2/operators/norm_planar_yuv_op.cc        |   2 +-
 caffe2/operators/normalize_ops.cu             |   7 +-
 caffe2/operators/numpy_tile_op.h              |   7 +-
 caffe2/operators/one_hot_ops.cc               |   6 +-
 caffe2/operators/one_hot_ops.cu               |   6 +-
 caffe2/operators/one_hot_ops.h                |   6 +-
 caffe2/operators/onnx_while_op.h              |  32 +-
 caffe2/operators/onnxifi_op.cc                |   2 +-
 caffe2/operators/operator_fallback_gpu.h      |   9 +-
 .../operators/operator_fallback_gpu_test.cc   |  10 +-
 caffe2/operators/order_switch_ops.cc          |  22 +-
 caffe2/operators/order_switch_ops.cu          |  18 +-
 caffe2/operators/pack_rnn_sequence_op.h       |   2 +-
 caffe2/operators/pack_segments.cc             |   6 +-
 caffe2/operators/pack_segments.cu             |  20 +-
 caffe2/operators/pack_segments.h              |  20 +-
 caffe2/operators/pad_op.cc                    |   8 +-
 caffe2/operators/pad_op_gpu.cu                |   8 +-
 caffe2/operators/partition_ops.h              |   4 +-
 caffe2/operators/percentile_op.h              |   4 +-
 caffe2/operators/perplexity_op.cc             |   2 +-
 caffe2/operators/perplexity_op.cu             |   2 +-
 .../piecewise_linear_transform_op.cc          |   6 +-
 .../piecewise_linear_transform_op.cu          |  24 +-
 .../operators/piecewise_linear_transform_op.h |   6 +-
 caffe2/operators/pool_op.cu                   | 972 +++++++++---------
 caffe2/operators/pool_op_cudnn.cu             |  14 +-
 caffe2/operators/prelu_op.cc                  |   4 +-
 caffe2/operators/prelu_op.cu                  |   6 +-
 caffe2/operators/prepend_dim_op.h             |   4 +-
 caffe2/operators/quant_decode_op.h            |  38 +-
 caffe2/operators/reducer_functors.h           |  26 +-
 caffe2/operators/reduction_front_back_ops.h   |   2 +-
 caffe2/operators/reduction_ops.cc             |   4 +-
 caffe2/operators/reduction_ops.cu             |  15 +-
 caffe2/operators/reduction_ops.h              |   6 +-
 caffe2/operators/relu_n_op.cc                 |   4 +-
 caffe2/operators/remove_data_blocks_op.h      |   2 +-
 caffe2/operators/reservoir_sampling.cc        |   2 +-
 caffe2/operators/reshape_op.h                 |   9 +-
 caffe2/operators/reshape_op_gpu_test.cc       |   6 +-
 caffe2/operators/resize_op.cc                 |  10 +-
 caffe2/operators/resize_op.cu                 |   6 +-
 caffe2/operators/reverse_packed_segs_op.h     |   6 +-
 caffe2/operators/rmac_regions_op.cc           |   4 +-
 caffe2/operators/rmac_regions_op.cu           |   5 +-
 caffe2/operators/rmac_regions_op.h            |   2 +-
 .../rnn/recurrent_network_blob_fetcher_op.h   |  11 +-
 .../rnn/recurrent_network_executor.h          |   4 +-
 caffe2/operators/rnn/recurrent_network_op.h   |  36 +-
 caffe2/operators/rnn/recurrent_op_cudnn.cc    |  18 +-
 caffe2/operators/rnn/recurrent_op_cudnn.h     |  10 +-
 caffe2/operators/roi_align_gradient_op.cc     |   4 +-
 caffe2/operators/roi_align_gradient_op.cu     |   8 +-
 caffe2/operators/roi_align_op.cc              |   6 +-
 caffe2/operators/roi_align_op.cu              |   2 +-
 caffe2/operators/roi_align_op_gpu_test.cc     |  18 +-
 caffe2/operators/roi_pool_op.cc               |   4 +-
 caffe2/operators/roi_pool_op.cu               |  74 +-
 caffe2/operators/scale_op.cc                  |  16 +-
 caffe2/operators/segment_reduction_op.h       |   4 +-
 caffe2/operators/segment_reduction_op_gpu.cu  |  67 +-
 caffe2/operators/selu_op.cc                   |   5 +-
 caffe2/operators/selu_op.cu                   |   8 +-
 caffe2/operators/sequence_ops.cc              |  10 +-
 caffe2/operators/sequence_ops.cu              |  18 +-
 caffe2/operators/sequence_ops.h               |  12 +-
 caffe2/operators/shape_op.h                   |   6 +-
 .../operators/sinusoid_position_encoding_op.h |   2 +-
 caffe2/operators/slice_op.cu                  |  16 +-
 caffe2/operators/slice_op.h                   |  34 +-
 caffe2/operators/softmax_op.cc                |   4 +-
 caffe2/operators/softmax_op.h                 |  10 +-
 caffe2/operators/softmax_ops.cu               |  38 +-
 caffe2/operators/softmax_shared.cc            |   2 +-
 caffe2/operators/softmax_with_loss_op.cc      |   8 +-
 caffe2/operators/softmax_with_loss_op.h       |  21 +-
 caffe2/operators/softplus_op.cc               |   4 +-
 caffe2/operators/softplus_op.cu               |   7 +-
 caffe2/operators/space_batch_op.h             |   8 +-
 caffe2/operators/space_batch_op_gpu.cu        |  14 +-
 caffe2/operators/sparse_to_dense_mask_op.h    |   8 +-
 caffe2/operators/sparse_to_dense_op.h         |   6 +-
 .../spatial_batch_norm_gradient_op.cc         |  10 +-
 caffe2/operators/spatial_batch_norm_op.cc     |   4 +-
 .../operators/spatial_softmax_with_loss_op.cc |  48 +-
 .../operators/spatial_softmax_with_loss_op.h  |  21 +-
 caffe2/operators/stats_ops.cc                 |  10 +-
 caffe2/operators/string_ops.cc                |   4 +-
 caffe2/operators/string_ops_test.cc           |  24 +-
 caffe2/operators/stump_func_op.cu             |   2 +-
 caffe2/operators/stylizer_ops.cc              |  12 +-
 caffe2/operators/summarize_op.cc              |   8 +-
 caffe2/operators/summarize_op.cu              |   6 +-
 caffe2/operators/swish_op.cc                  |   4 +-
 caffe2/operators/tensor_protos_db_input.h     |  21 +-
 caffe2/operators/thresholded_relu_op.cc       |   6 +-
 caffe2/operators/thresholded_relu_op.cu       |   7 +-
 caffe2/operators/tile_op.h                    |  14 +-
 caffe2/operators/top_k.cu                     |  18 +-
 caffe2/operators/tt_linear_op.h               |   8 +-
 caffe2/operators/unique_ops.cu                |   6 +-
 caffe2/operators/unique_ops.h                 |   6 +-
 caffe2/operators/utility_ops.cc               |   2 +-
 caffe2/operators/utility_ops.cu               |  17 +-
 caffe2/operators/utility_ops.h                |  60 +-
 caffe2/operators/utility_ops_gpu_test.cc      |   6 +-
 caffe2/operators/utility_ops_test.cc          |   6 +-
 .../operators/weighted_multi_sampling_op.cc   |   6 +-
 caffe2/operators/weighted_sample_op.cc        |  12 +-
 caffe2/operators/weighted_sample_op.cu        |   8 +-
 caffe2/operators/weighted_sample_op.h         |   2 +-
 caffe2/operators/while_op.h                   |   2 +-
 caffe2/operators/workspace_ops.cc             |   3 +-
 caffe2/opt/fusion.cc                          |   8 +-
 caffe2/opt/onnxifi_transformer.cc             |   4 +-
 caffe2/python/pybind_state.cc                 |  44 +-
 caffe2/python/pybind_state.h                  |  58 +-
 caffe2/python/pybind_state_dlpack.h           |   4 +-
 caffe2/python/pybind_state_gpu.cc             |   1 -
 caffe2/python/pybind_state_hip.cc             |   1 -
 caffe2/python/pybind_state_int8.cc            |   3 +-
 caffe2/queue/blobs_queue_db.h                 |   4 +-
 caffe2/queue/queue_ops.h                      |   2 +-
 caffe2/queue/rebatching_queue.cc              |   8 +-
 caffe2/queue/rebatching_queue_ops.h           |   4 +-
 caffe2/sgd/adam_op.h                          |   8 +-
 caffe2/sgd/adam_op_gpu.cu                     |   2 +-
 caffe2/sgd/fp16_momentum_sgd_op.h             |   5 +-
 caffe2/sgd/fp32_momentum_sgd_op.h             |   5 +-
 caffe2/sgd/iter_op.h                          |   9 +-
 caffe2/sgd/learning_rate_op.h                 |   4 +-
 caffe2/sgd/momentum_sgd_op.h                  |  10 +-
 caffe2/sgd/yellowfin_op.h                     |  32 +-
 .../contrib/depthwise/depthwise3x3_conv_op.cc |   6 +-
 .../depthwise/depthwise3x3_conv_op_test.cc    |   2 +-
 caffe2/share/contrib/nnpack/conv_op.cc        |  12 +-
 caffe2/share/contrib/nnpack/nnpack_test.cc    |   2 +-
 caffe2/utils/filler.h                         |   4 +-
 caffe2/utils/hip/math_blas_hip_test.cc        |  56 +-
 caffe2/utils/hip/math_hip.cc                  |  16 +-
 caffe2/utils/math.h                           |   9 +-
 caffe2/utils/math_cpu.cc                      |  40 +-
 caffe2/utils/math_gpu.cu                      |  16 +-
 caffe2/utils/math_gpu_test.cc                 | 105 +-
 caffe2/utils/math_test.cc                     |  56 +-
 caffe2/utils/smart_tensor_printer.cc          |   6 +-
 caffe2/utils/smart_tensor_printer.h           |  10 +-
 caffe2/utils/smart_tensor_printer_test.cc     |   2 +-
 caffe2/video/video_input_op.h                 |  28 +-
 modules/detectron/group_spatial_softmax_op.h  |   2 +-
 modules/detectron/select_smooth_l1_loss_op.h  |   4 +-
 .../detectron/sigmoid_cross_entropy_loss_op.h |  10 +-
 modules/detectron/sigmoid_focal_loss_op.h     |   8 +-
 modules/detectron/smooth_l1_loss_op.h         |   4 +-
 modules/detectron/softmax_focal_loss_op.h     |   4 +-
 365 files changed, 4167 insertions(+), 3491 deletions(-)
 create mode 100644 caffe2/core/context_base.cc
 create mode 100644 caffe2/core/context_base.h

diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc
index 52b51174cf34d..f240ea45f26f7 100644
--- a/binaries/benchmark_helper.cc
+++ b/binaries/benchmark_helper.cc
@@ -160,7 +160,7 @@ void loadInput(
           CAFFE_THROW("Not support GPU on mobile.");
 #endif
         } else {
-          caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+          caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
           CHECK_NOTNULL(tensor);
           tensor->Resize(input_dims);
           if (input_type_list[i] == "uint8_t") {
@@ -197,7 +197,7 @@ void fillInputBlob(
     int protos_size = tensor_kv.second.protos_size();
     caffe2::TensorProto* tensor_proto =
         tensor_kv.second.mutable_protos(iteration % protos_size);
-    caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+    caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
     tensor->Resize(std::vector<caffe2::TIndex>());
     if (tensor_proto->data_type() == caffe2::TensorProto::STRING) {
       (tensor->mutable_data<std::string>())[0] = tensor_proto->string_data(0);
@@ -286,7 +286,7 @@ void writeOutput(
 #endif
         } else {
           writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
-              workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>(),
+              workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU),
               output_prefix,
               name);
         }
diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h
index 0a52e16a50079..7e75f557f88ad 100644
--- a/binaries/benchmark_helper.h
+++ b/binaries/benchmark_helper.h
@@ -35,7 +35,7 @@ void writeTextOutput(
     const string& output_prefix,
     const string& name) {
   string output_name = output_prefix + "/" + name + ".txt";
-  caffe2::TensorSerializer<ContextType> ser;
+  caffe2::TensorSerializer ser;
   caffe2::BlobProto blob_proto;
   ser.Serialize(
       *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());
diff --git a/binaries/core_overhead_benchmark.cc b/binaries/core_overhead_benchmark.cc
index 74f19d58e32cd..5cb0a62797553 100644
--- a/binaries/core_overhead_benchmark.cc
+++ b/binaries/core_overhead_benchmark.cc
@@ -139,7 +139,7 @@ BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
 
 static void BM_CudaPointerAffinity(benchmark::State& state) {
   CAFFE2_SKIP_IF_NO_GPU;
-  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
+  Tensor tensor(vector<TIndex>{1, 2, 3, 4}, CUDA);
   float* ptr = tensor.mutable_data<float>();
   while (state.KeepRunning()) {
     volatile int id = GetGPUIDForPointer(ptr);
@@ -198,7 +198,7 @@ static void BM_RawAllocDeallocCPU(benchmark::State& state) {
 BENCHMARK(BM_RawAllocDeallocCPU);
 
 static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
-  Tensor<CPUContext> tensor;
+  Tensor tensor(CPU);
   // small allocation
   tensor.Resize(32, 32);
   while (state.KeepRunning()) {
@@ -210,7 +210,7 @@ BENCHMARK(BM_TensorAllocDeallocCPU);
 
 static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
   CAFFE2_SKIP_IF_NO_GPU;
-  Tensor<CUDAContext> tensor;
+  Tensor tensor(CUDA);
   // small allocation
   tensor.Resize(32, 32);
   while (state.KeepRunning()) {
diff --git a/binaries/print_core_object_sizes.cc b/binaries/print_core_object_sizes.cc
index 2000c349fb8b9..f99ef09ca4e97 100644
--- a/binaries/print_core_object_sizes.cc
+++ b/binaries/print_core_object_sizes.cc
@@ -28,8 +28,7 @@
 
 int main(int /* unused */, char** /* unused */) {
   PRINT_SIZE(caffe2::Blob);
-  PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
-  PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
+  PRINT_SIZE(caffe2::Tensor);
   PRINT_SIZE(caffe2::CPUContext);
   PRINT_SIZE(caffe2::CUDAContext);
   PRINT_SIZE(caffe2::OperatorBase);
diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc
index 196be4a77946c..cf6d400fe1e00 100644
--- a/binaries/speed_benchmark.cc
+++ b/binaries/speed_benchmark.cc
@@ -136,7 +136,7 @@ int main(int argc, char** argv) {
         if (blob == nullptr) {
           blob = workspace->CreateBlob(input_names[i]);
         }
-        caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+        caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
         CHECK_NOTNULL(tensor);
         tensor->Resize(input_dims);
         if (input_type_list[i] == "uint8_t") {
diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h
index feccafd514cbb..9d646d04bf71b 100644
--- a/caffe2/contrib/aten/aten_op_template.h
+++ b/caffe2/contrib/aten/aten_op_template.h
@@ -54,11 +54,11 @@ class ATenOp : public Operator<Context> {
     #undef DEFINE_CASE
   }
 
-  at::Type & typeFor(const Tensor<Context> & ten) {
+  at::Type& typeFor(const Tensor& ten) {
     return at::getType(backend(), atScalarTypeFor(ten.meta()));
   }
-  at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
-    auto& ten = const_cast<Tensor<Context>&>(ten_);
+  at::Tensor tensorWrapping(const Tensor& ten_) {
+    auto& ten = const_cast<Tensor&>(ten_);
     return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
   }
 
@@ -88,7 +88,7 @@ class ATenOp : public Operator<Context> {
     }
     CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
   }
-  void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
+  void assignTo(Tensor* dst, const at::Tensor& src_) {
     at::Tensor src = src_.contiguous();
     auto at_sizes = src.sizes();
     std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
@@ -121,7 +121,7 @@ class ATenOp : public Operator<Context> {
     return s.toLong();
   }
 
-  void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
+  void assignTo(Tensor* dst, at::Type& inferred_type, at::Scalar scalar) {
     switch(inferred_type.scalarType()) {
       #define DEFINE_CASE(ctype,aten_name,native) \
         case at::k##aten_name: { \
@@ -134,8 +134,8 @@ class ATenOp : public Operator<Context> {
         CAFFE_THROW("Unknown ATen Type");
     }
   }
-  template<typename T>
-  void assignToValue(Tensor<Context> * dst, T v) {
+  template <typename T>
+  void assignToValue(Tensor* dst, T v) {
     dst->Resize(std::vector<TIndex>());
     math::Set(1, v, dst->template mutable_data<T>(), &context_);
   }
diff --git a/caffe2/contrib/gloo/common.cc b/caffe2/contrib/gloo/common.cc
index a3f20b301c0d3..21ce0343d8181 100644
--- a/caffe2/contrib/gloo/common.cc
+++ b/caffe2/contrib/gloo/common.cc
@@ -12,7 +12,7 @@ namespace caffe2 {
 namespace gloo {
 
 void signalFailure(Blob* status_blob, std::exception& /* unused */) {
-  auto* res = status_blob->GetMutable<TensorCPU>();
+  auto* res = status_blob->GetMutableTensor(CPU);
   res->Resize(1);
   res->template mutable_data<int32_t>()[0] = 1;
 }
diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
index 102c854736815..9722d5891334d 100644
--- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
+++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
@@ -17,17 +17,17 @@ nccl::NCCLExecution getNCCLElements(
   ex.elements.resize(op->InputSize());
   for (auto i = 0; i < op->InputSize(); ++i) {
     auto& el = ex.elements[i];
-    el.src = &(op->Input<TensorCUDA>(i));
+    el.src = &(op->Input<Tensor>(i, CUDA));
     if (op->OutputSize() == 1) {
       // Reduce op
       if (i == ex.root) {
-        el.dst = op->Output<TensorCUDA>(0);
+        el.dst = op->Output<Tensor>(0, CUDA);
       }
     } else if (i < op->OutputSize()) {
-      el.dst = op->Output<TensorCUDA>(i);
+      el.dst = op->Output<Tensor>(i, CUDA);
     }
     // TODO - expensive (>1ms) - cache these.
-    el.device = GetGPUIDForPointer(op->Input<TensorCUDA>(i).raw_data());
+    el.device = GetGPUIDForPointer(op->Input<Tensor>(i, CUDA).raw_data());
   }
 
   return ex;
@@ -38,7 +38,7 @@ namespace {
 template <typename T>
 bool AllInputsAre(OperatorBase* op) {
   for (auto i = 0; i < op->InputSize(); ++i) {
-    if (op->Input<TensorCUDA>(i).IsType<T>()) {
+    if (op->Input<Tensor>(i, CUDA).IsType<T>()) {
       continue;
     } else {
       return false;
diff --git a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
index 3eb0fc3ace3f4..012eea69c9dc6 100644
--- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
+++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
@@ -22,7 +22,7 @@ static void AddConstInput(const std::vector<int>& shape, const float value,
   option.set_device_type(CUDA);
   CUDAContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
+  auto* tensor = blob->GetMutableTensor(CUDA);
   tensor->Resize(shape);
   math::Set<float, CUDAContext>(tensor->size(), value,
                                 tensor->mutable_data<float>(),
@@ -54,8 +54,8 @@ TEST(NervanaFullyConnectedTest, Test) {
   EXPECT_TRUE(op->Run());
   Blob* Yblob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, Yblob);
-  auto& Y = Yblob->Get<Tensor<CUDAContext>>();
-  TensorCPU Y_cpu(Y);
+  auto& Y = Yblob->Get<Tensor>();
+  Tensor Y_cpu(Y, CPU);
   EXPECT_EQ(Y.size(), 5 * 6);
   for (int i = 0; i < Y.size(); ++i) {
     CHECK_LT(Y_cpu.data<float>()[i], 10.11);
diff --git a/caffe2/contrib/warpctc/ctc_op.h b/caffe2/contrib/warpctc/ctc_op.h
index 748e3a595206f..6c27c907726b8 100644
--- a/caffe2/contrib/warpctc/ctc_op.h
+++ b/caffe2/contrib/warpctc/ctc_op.h
@@ -47,26 +47,26 @@ class CTCOp final : public Operator<Context> {
     const auto& inputs = Input(INPUTS);
     const auto minibatchSize = inputs.dim(1);
     const auto alphabetSize = inputs.dim(2);
-    const auto& labels = OperatorBase::template Input<TensorCPU>(LABELS);
+    const auto& labels = OperatorBase::template Input<Tensor>(LABELS, CPU);
     const auto& labelLengths =
-        OperatorBase::template Input<TensorCPU>(LABEL_LENGTHS);
+        OperatorBase::template Input<Tensor>(LABEL_LENGTHS, CPU);
     const auto& inputLengths =
-        OperatorBase::template Input<TensorCPU>(INPUT_LENGTHS);
+        OperatorBase::template Input<Tensor>(INPUT_LENGTHS, CPU);
 
     // outputs
-    Tensor<Context>* gradients = nullptr;
+    Tensor* gradients = nullptr;
     TensorCPU* costs;
-    Tensor<Context>* workspace;
+    Tensor* workspace;
     if (!is_test_) {
       // [grads, costs, workspace] to maintain backward compatibility
       gradients = Output(0);
       gradients->ResizeLike(inputs);
-      costs = OperatorBase::template Output<TensorCPU>(1);
+      costs = OperatorBase::template Output<Tensor>(1, CPU);
       costs->ResizeLike(labelLengths);
       workspace = Output(2);
     } else {
       // [costs, workspace]
-      costs = OperatorBase::template Output<TensorCPU>(0);
+      costs = OperatorBase::template Output<Tensor>(0, CPU);
       costs->ResizeLike(labelLengths);
       workspace = Output(1);
     }
diff --git a/caffe2/core/allocator.cc b/caffe2/core/allocator.cc
index 4edc4915ea69b..10fa078cf4b82 100644
--- a/caffe2/core/allocator.cc
+++ b/caffe2/core/allocator.cc
@@ -26,7 +26,7 @@ void SetCPUAllocator(CPUAllocator* alloc) {
   g_cpu_allocator.reset(alloc);
 }
 
-MemoryAllocationReporter CPUContext::reporter_;
+MemoryAllocationReporter CPUStaticContext::reporter_;
 
 void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
   std::lock_guard<std::mutex> guard(mutex_);
diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h
index c7c020e7a7cc0..93659de70c9c1 100644
--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@@ -9,8 +9,9 @@
 
 #include "caffe2/core/blob_serializer_base.h"
 #include "caffe2/core/common.h"
-#include "caffe2/core/typeid.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/typeid.h"
 #include "caffe2/proto/caffe2.pb.h"
 
 namespace caffe2 {
@@ -60,6 +61,20 @@ class Blob {
   template <class T>
   bool IsType() const { return meta_.Match<T>(); }
 
+  // TODO(jerryzh): Remove template
+  template <class T>
+  bool IsType(DeviceType device_type) const {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "IsType(DeviceType) only available on "
+        "Tensor types.");
+    auto* tensor = static_cast<Tensor*>(pointer_);
+    if (tensor && tensor->GetDeviceType() == device_type) {
+      return true;
+    }
+    return false;
+  }
+
   /**
    * Returns the meta info of the blob.
    */
@@ -74,6 +89,7 @@ class Blob {
    * @brief Gets the const reference of the stored object. The code checks if
    * the stored object is of the desired type.
    */
+  // TODO(jerryzh): add a Get(DeviceType) function?
   template <class T>
   const T& Get() const {
     CAFFE_ENFORCE(
@@ -123,6 +139,17 @@ class Blob {
     }
   }
 
+  inline Tensor* GetMutableTensor(DeviceType device_type) {
+    if (IsType<Tensor>() &&
+        static_cast<Tensor*>(pointer_)->GetDeviceType() == device_type) {
+      return static_cast<Tensor*>(pointer_);
+    } else {
+      VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
+              << " DeviceType:" << device_type;
+      return Reset<Tensor>(new Tensor(device_type));
+    }
+  }
+
   /**
    * Sets the underlying object to the allocated one. The Blob then takes over
    * the ownership of the passed in pointer. If there is already an object in
diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc
index 498f0b5deb55a..536ad02f4ea0c 100644
--- a/caffe2/core/blob_gpu_test.cc
+++ b/caffe2/core/blob_gpu_test.cc
@@ -17,7 +17,7 @@ TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);
 
 TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
   if (!caffe2::HasCudaGPU()) return;
-  TensorCUDA tensor;
+  Tensor tensor(CUDA);
   EXPECT_EQ(tensor.ndim(), 0);
   vector<int> dims(3);
   dims[0] = 2;
@@ -38,7 +38,7 @@ TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCUDA tensor(dims);
+  Tensor tensor(dims, CUDA);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 3);
@@ -65,8 +65,8 @@ TYPED_TEST(TensorGPUTest, TensorShareData) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCUDA tensor(dims);
-  TensorCUDA other_tensor(dims);
+  Tensor tensor(dims, CUDA);
+  Tensor other_tensor(dims, CUDA);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@@ -82,8 +82,8 @@ TYPED_TEST(TensorGPUTest, TensorShareDataCanUseDifferentShapes) {
   dims[2] = 5;
   vector<int> alternate_dims(1);
   alternate_dims[0] = 2 * 3 * 5;
-  TensorCUDA tensor(dims);
-  TensorCUDA other_tensor(alternate_dims);
+  Tensor tensor(dims, CUDA);
+  Tensor other_tensor(alternate_dims, CUDA);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(other_tensor.ndim(), 1);
@@ -99,8 +99,8 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCUDA tensor(dims);
-  TensorCUDA other_tensor(dims);
+  Tensor tensor(dims, CUDA);
+  Tensor other_tensor(dims, CUDA);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@@ -115,7 +115,7 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
 TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
   if (!HasCudaGPU()) return;
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
-  TensorCUDA tensor;
+  Tensor tensor(CUDA);
   EXPECT_EQ(tensor.ndim(), 0);
   EXPECT_THROW(tensor.data<TypeParam>(), EnforceNotMet);
 }
@@ -126,12 +126,12 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
       return;                                                              \
     }                                                                      \
     Blob blob;                                                             \
-    TensorCPU cpu_tensor;                                                  \
+    Tensor cpu_tensor(CPU);                                                \
     cpu_tensor.Resize(2, 3);                                               \
     for (int i = 0; i < 6; ++i) {                                          \
       cpu_tensor.mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
     }                                                                      \
-    blob.GetMutable<TensorCUDA>()->CopyFrom(cpu_tensor);                   \
+    blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor);                     \
     string serialized = blob.Serialize("test");                            \
     BlobProto proto;                                                       \
     CAFFE_ENFORCE(proto.ParseFromString(serialized));                      \
@@ -148,8 +148,8 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
     }                                                                      \
     Blob new_blob;                                                         \
     EXPECT_NO_THROW(new_blob.Deserialize(serialized));                     \
-    EXPECT_TRUE(new_blob.IsType<TensorCUDA>());                            \
-    TensorCPU new_cpu_tensor(blob.Get<TensorCUDA>());                      \
+    EXPECT_TRUE(new_blob.IsType<Tensor>(CUDA));                            \
+    Tensor new_cpu_tensor(blob.Get<Tensor>(), CPU);                        \
     EXPECT_EQ(new_cpu_tensor.ndim(), 2);                                   \
     EXPECT_EQ(new_cpu_tensor.dim(0), 2);                                   \
     EXPECT_EQ(new_cpu_tensor.dim(1), 3);                                   \
@@ -172,7 +172,7 @@ TEST_SERIALIZATION_GPU_WITH_TYPE(int64_t, int64_data)
 
 TEST(TensorTest, TensorSerializationMultiDevices) {
   Blob blob;
-  TensorCPU tensor;
+  Tensor tensor(CPU);
   tensor.Resize(2, 3);
   for (int i = 0; i < 6; ++i) {
     tensor.mutable_data<float>()[i] = i;
@@ -180,7 +180,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
   for (int gpu_id = 0; gpu_id < NumCudaDevices(); ++gpu_id) {
     DeviceGuard guard(gpu_id);
     CUDAContext context(gpu_id);
-    blob.Reset(new TensorCUDA(tensor, &context));
+    blob.Reset(new Tensor(tensor, &context, CUDA));
     string serialized = blob.Serialize("test");
     BlobProto proto;
     CAFFE_ENFORCE(proto.ParseFromString(serialized));
@@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
     // Test if the restored blob is still of the same device.
     blob.Reset();
     EXPECT_NO_THROW(blob.Deserialize(serialized));
-    EXPECT_TRUE(blob.IsType<TensorCUDA>());
+    EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
     EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()),
               gpu_id);
     // Test if we force the restored blob on a different device, we
@@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
     blob.Reset();
     proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0);
     EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString()));
-    EXPECT_TRUE(blob.IsType<TensorCUDA>());
+    EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
     EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()), 0);
   }
 }
diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index a7cbb4186a68c..b870aa39067e6 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -33,7 +33,7 @@ class StringSerializer : public BlobSerializerBase {
   StringSerializer() {}
   ~StringSerializer() {}
   /**
-   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
+   * Serializes a Blob. Note that this blob has to contain Tensor,
    * otherwise this function produces a fatal error.
    */
   void Serialize(
@@ -83,12 +83,242 @@ std::string Blob::Serialize(const string& name) const {
   return data;
 }
 
-// Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
-template <>
-void TensorSerializer<CPUContext>::StoreDeviceDetail(
-    const Tensor<CPUContext>& /*input*/,
-    TensorProto* /*proto*/) {}
+void TensorSerializer::Serialize(
+    const Blob& blob,
+    const string& name,
+    BlobSerializerBase::SerializationAcceptor acceptor) {
+  this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
+}
+
+void TensorSerializer::SerializeWithChunkSize(
+    const Blob& blob,
+    const string& name,
+    BlobSerializerBase::SerializationAcceptor acceptor,
+    int chunk_size) {
+  CAFFE_ENFORCE(blob.IsType<Tensor>());
+  const auto& tensor = blob.template Get<Tensor>();
+  if (chunk_size == kNoChunking) {
+    chunk_size = tensor.size() + 1; // to account for empty tensors
+  } else if (chunk_size == kDefaultChunkSize) {
+    chunk_size = FLAGS_caffe2_tensor_chunk_size;
+  }
+
+  auto processChunk = [&](int64_t chunkStart) {
+    BlobProto blob_proto;
+    blob_proto.set_name(name);
+    blob_proto.set_type(kTensorBlobType);
+    TensorProto& proto = *blob_proto.mutable_tensor();
+    proto.set_name(name);
+    this->Serialize(
+        tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
+    acceptor(
+        MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
+        blob_proto.SerializeAsString());
+  };
+
+#ifndef __ANDROID__
+  std::vector<std::future<void>> futures;
+  // Poorman's IOBound ThreadPool
+  SimpleQueue<size_t> chunkQueue;
+  auto task = [&]() {
+    size_t chunkStart;
+    while (chunkQueue.Pop(&chunkStart)) {
+      processChunk(chunkStart);
+    }
+  };
+  if (tensor.size() > chunk_size) {
+    for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
+      futures.emplace_back(std::async(std::launch::async, task));
+    }
+  }
+#endif
+
+  VLOG(1) << "Serializing blob " << name;
+  // Serialize whole vector. If vector is empty, it's shape still needs to be
+  // serialized in empty proto
+  for (size_t chunkBegin = 0;
+       chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
+       chunkBegin += chunk_size) {
+    VLOG(2) << "Starting a chunk at " << chunkBegin;
+#ifndef __ANDROID__
+    if (tensor.size() > chunk_size) {
+      chunkQueue.Push(chunkBegin);
+    } else {
+      // Sync mode for small tensors
+      processChunk(chunkBegin);
+    }
+#else
+    // Since Android does not have std::future, we will always do sync mode
+    processChunk(chunkBegin);
+#endif
+  }
+
+#ifndef __ANDROID__
+  chunkQueue.NoMoreJobs();
+  for (auto& fut : futures) {
+    fut.get();
+  }
+#endif
+}
+
+void TensorSerializer::Serialize(
+    const Tensor& input,
+    const string& /*name*/,
+    TensorProto* proto_ptr,
+    size_t chunkBegin,
+    int32_t chunkSize) {
+  CAFFE_ENFORCE(
+      chunkBegin <= input.size(),
+      "Chunk begin is out of tensor: ",
+      chunkBegin,
+      ' ',
+      input.size());
+  if (chunkBegin + chunkSize > input.size()) {
+    chunkSize = input.size() - chunkBegin;
+  }
+
+  CAFFE_ENFORCE(
+      input.raw_data() || chunkSize == 0,
+      "The input does not have data input yet. This is probably because you "
+      "created a tensor of non-zero shape but never filled its data via "
+      "mutable_data() calls. This means that it makes no sense to serialize "
+      "the tensor content.");
+
+  TensorProto& proto = *proto_ptr;
+  proto.mutable_segment()->set_begin(chunkBegin);
+  proto.mutable_segment()->set_end(chunkBegin + chunkSize);
+
+  for (int i = 0; i < input.ndim(); ++i) {
+    proto.add_dims(input.dim(i));
+  }
+  const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
+  proto.set_data_type(data_type);
+  StoreDeviceDetail(input, &proto);
+  auto uniq_ptr = input.GetStaticContext()->CreateContext();
+  // A lot of copypaste is error prone. Should we create a macro for this?
+  switch (data_type) {
+    case TensorProto_DataType_FLOAT:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<float>() + chunkBegin,
+          proto.mutable_float_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT32:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<int>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_BYTE:
+      LOG(FATAL) << "This should not happen. When serializing, "
+                    "BYTE is deprecated and moved to UINT8.";
+      break;
+    case TensorProto_DataType_STRING: {
+      proto.mutable_string_data()->Reserve(chunkSize);
+      const string* content = input.template data<string>();
+      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
+        proto.add_string_data(content[i]);
+      }
+      break;
+    }
+    case TensorProto_DataType_BOOL:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<bool>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_UINT8:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<uint8_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT8:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<int8_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_UINT16:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<uint16_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT16:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<int16_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT64:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<int64_t>() + chunkBegin,
+          proto.mutable_int64_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_FLOAT16: {
+      if (FLAGS_caffe2_serialize_fp16_as_bytes) {
+        const int kValue = 1;
+        CAFFE_ENFORCE_EQ(
+            reinterpret_cast<const char*>(&kValue)[0],
+            1,
+            "Serialization of FLOAT16 on big endian platform "
+            "is not written yet.");
+        unique_ptr<char[]> buffer(new char[2 * chunkSize]);
+        this->context_->template CopyToCPU<char>(
+            2 * chunkSize,
+            reinterpret_cast<const char*>(
+                input.template data<float16>() + chunkBegin),
+            buffer.get());
+        this->context_->FinishDeviceComputation();
+        proto.set_byte_data(buffer.release(), 2 * chunkSize);
+      } else {
+        detail::CopyToProtoWithCast(
+            chunkSize,
+            reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
+                chunkBegin,
+            proto.mutable_int32_data(),
+            uniq_ptr.get());
+      }
+    } break;
+    case TensorProto_DataType_DOUBLE:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<double>() + chunkBegin,
+          proto.mutable_double_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_UNDEFINED: {
+      proto.mutable_string_data()->Reserve(chunkSize);
+      Blob temp_blob;
+      const char* raw_data = static_cast<const char*>(input.raw_data());
+      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
+        temp_blob.ShareExternal(
+            const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
+        proto.add_string_data(temp_blob.Serialize(""));
+      }
+    } break;
+      // Note: we intentially do not provide "default:" so if any new data types
+      // are added, the compiler should warn the user to add the case here.
+  }
+}
 
+int GetGPUIDForPointer(const void* ptr);
+
+void TensorSerializer::StoreDeviceDetail(
+    const Tensor& input,
+    TensorProto* proto) {
+  input.ExtractDeviceOption(proto->mutable_device_detail());
+}
 // The actual serialization registry objects.
 CAFFE_DEFINE_TYPED_REGISTRY(
     BlobSerializerRegistry,
@@ -127,12 +357,176 @@ void Blob::Deserialize(const BlobProto& blob_proto) {
   }
 }
 
+void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) {
+  auto tensor_proto = blob_proto.tensor();
+  Deserialize(
+      tensor_proto,
+      blob->GetMutableTensor(
+          static_cast<DeviceType>(tensor_proto.device_detail().device_type())));
+}
+
+void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) {
+  // We create a local context for deserializing. Since Caffe2 contexts are
+  // usually lightweight, this should not involve too much overhead.
+  auto uniq_ptr =
+      tensor->GetStaticContext()->CreateContext(proto.device_detail());
+  auto context = uniq_ptr.get();
+  context->SwitchToDevice(0);
+  vector<TIndex> dims;
+  for (const TIndex d : proto.dims()) {
+    dims.push_back(d);
+  }
+  tensor->Resize(dims);
+
+  int64_t chunkBegin = 0;
+  auto chunkEnd = tensor->size();
+  if (proto.has_segment()) {
+    chunkBegin = proto.segment().begin();
+    chunkEnd = proto.segment().end();
+  }
+  CAFFE_ENFORCE(
+      0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
+      "Invalid chunk ",
+      chunkBegin,
+      ' ',
+      chunkEnd,
+      " with total tensor size ",
+      tensor->size());
+  auto chunkSize = chunkEnd - chunkBegin;
+
+  switch (proto.data_type()) {
+    case TensorProto_DataType_FLOAT:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.float_data(),
+          tensor->template mutable_data<float>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT32:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_BYTE:
+      // Since BYTE stores the data in a string field instead of a repreated
+      // field we will have it special cased.
+      CAFFE_ENFORCE_EQ(
+          chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
+      context->template CopyToCPU<uint8_t>(
+          chunkSize,
+          reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
+          tensor->template mutable_data<uint8_t>() + chunkBegin);
+      break;
+    case TensorProto_DataType_STRING:
+      // Special handing of string because it is a non-fundamental type.
+      {
+        string* content = tensor->template mutable_data<string>();
+        for (int i = 0; i < chunkSize; ++i) {
+          content[i + chunkBegin] = proto.string_data(i);
+        }
+      }
+      break;
+    case TensorProto_DataType_BOOL:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<bool>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_UINT8:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<uint8_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT8:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int8_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_UINT16:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<uint16_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT16:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int16_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT64:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.int64_data(),
+          tensor->template mutable_data<int64_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_FLOAT16:
+      if (proto.has_byte_data()) {
+        const int kValue = 1;
+        CAFFE_ENFORCE_EQ(
+            reinterpret_cast<const char*>(&kValue)[0],
+            1,
+            "Serialization of FLOAT16 on big endian platform "
+            "is not written yet.");
+        CAFFE_ENFORCE_EQ(
+            2 * chunkSize,
+            proto.byte_data().size(),
+            "Incorrect proto field size.");
+        context->template CopyToCPU<float16>(
+            chunkSize,
+            reinterpret_cast<const float16*>(proto.byte_data().data()),
+            tensor->template mutable_data<float16>() + chunkBegin);
+      } else {
+        // Backward compatibility with models which used int32_data field
+        detail::CopyFromProtoWithCast(
+            chunkSize,
+            proto.int32_data(),
+            reinterpret_cast<uint16_t*>(
+                tensor->template mutable_data<float16>()) +
+                chunkBegin,
+            context);
+      }
+      break;
+    case TensorProto_DataType_DOUBLE:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.double_data(),
+          tensor->template mutable_data<double>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_UNDEFINED: {
+      Blob temp_blob;
+      void* raw_ptr = nullptr;
+      for (int i = 0; i < chunkSize; ++i) {
+        temp_blob.Deserialize(proto.string_data(i));
+        if (i == 0) {
+          raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
+        }
+        temp_blob.meta().copy()(
+            temp_blob.GetRaw(),
+            static_cast<char*>(raw_ptr) +
+                (i + chunkBegin) * temp_blob.meta().itemsize(),
+            1);
+      }
+    }
+  }
+  context->FinishDeviceComputation();
+}
+
 namespace {
-// Serialize TensorCPU.
-REGISTER_BLOB_SERIALIZER(
-    (TypeMeta::Id<TensorCPU>()),
-    TensorSerializer<CPUContext>);
-REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer<CPUContext>);
+// Serialize Tensor
+REGISTER_BLOB_SERIALIZER((TypeMeta::Id<Tensor>()), TensorSerializer);
+REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer);
 // Serialize std::string
 REGISTER_BLOB_SERIALIZER((TypeMeta::Id<std::string>()), StringSerializer);
 REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer);
diff --git a/caffe2/core/blob_serialization.h b/caffe2/core/blob_serialization.h
index 94af8a9fcacac..18cb95d541b4b 100644
--- a/caffe2/core/blob_serialization.h
+++ b/caffe2/core/blob_serialization.h
@@ -42,13 +42,12 @@ inline unique_ptr<BlobSerializerBase> CreateSerializer(CaffeTypeId id) {
  * TensorSerializer takes in a blob that contains a Tensor, and serializes it
  * into a TensorProto protocol buffer.
  */
-template <class Context>
 class TensorSerializer : public BlobSerializerBase {
  public:
-  TensorSerializer() : context_() {}
+  TensorSerializer() {}
   ~TensorSerializer() override {}
   /**
-   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
+   * Serializes a Blob. Note that this blob has to contain Tensor,
    * otherwise this function produces a fatal error.
    */
   void Serialize(
@@ -61,13 +60,17 @@ class TensorSerializer : public BlobSerializerBase {
       SerializationAcceptor acceptor,
       int chunk_size) override;
 
-  void Serialize(const Tensor<Context>& tensor, const string& name,
-                 TensorProto* proto, size_t chunkBegin, int32_t chunkSize);
+  void Serialize(
+      const Tensor& tensor,
+      const string& name,
+      TensorProto* proto,
+      size_t chunkBegin,
+      int32_t chunkSize);
 
  private:
   // A utility function to store the device context detauls.
-  void StoreDeviceDetail(const Tensor<Context>& input, TensorProto* proto);
-  Context context_;
+  void StoreDeviceDetail(const Tensor& input, TensorProto* proto);
+  unique_ptr<BaseContext> context_;
 };
 
 /**
@@ -98,11 +101,10 @@ inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
  * tensor, change the TensorProto's corresponding fields before calling
  * Deserialize.
  */
-template <class Context>
 class TensorDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override;
-  void Deserialize(const TensorProto& proto, Tensor<Context>* tensor);
+  void Deserialize(const TensorProto& proto, Tensor* tensor);
 };
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -110,12 +112,12 @@ class TensorDeserializer : public BlobDeserializerBase {
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace detail {
-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyToProtoAsIs(
     const size_t size,
     const SrcType* src,
     google::protobuf::RepeatedField<DstType>* field,
-    Context* context) {
+    BaseContext* context) {
   static_assert(
       sizeof(SrcType) == sizeof(DstType),
       "The source type and dest type cannot be copied as-is. Did "
@@ -124,23 +126,22 @@ inline void CopyToProtoAsIs(
   for (int i = 0; i < size; ++i) {
     field->Add(0);
   }
-  context->template Copy<SrcType, Context, CPUContext>(
+  context->template CopyToCPU<SrcType>(
       size, src, reinterpret_cast<SrcType*>(field->mutable_data()));
   // Make sure that we finish the copy into the protobuf.
   context->FinishDeviceComputation();
 }
 
-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyToProtoWithCast(
     const size_t size,
     const SrcType* src,
     google::protobuf::RepeatedField<DstType>* field,
-    Context* context) {
+    BaseContext* context) {
   // TODO: we are having one unnecessary copy here if the context is already
   // CPUContext. Remove it if it is performance critical.
   unique_ptr<SrcType[]> buffer(new SrcType[size]);
-  context->template Copy<SrcType, Context, CPUContext>(
-      size, src, buffer.get());
+  context->template CopyToCPU<SrcType>(size, src, buffer.get());
   context->FinishDeviceComputation();
   field->Reserve(size);
   for (int i = 0; i < size; ++i) {
@@ -148,27 +149,27 @@ inline void CopyToProtoWithCast(
   }
 }
 
-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyFromProtoAsIs(
     const size_t size,
     const google::protobuf::RepeatedField<SrcType>& field,
     DstType* dst,
-    Context* context) {
+    BaseContext* context) {
   static_assert(
       sizeof(SrcType) == sizeof(DstType),
       "The source type and dest type cannot be copied as-is. Did "
       "you mean CopyFromProtoWithCast?");
   CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
-  context->template Copy<DstType, CPUContext, Context>(
+  context->template CopyFromCPU<DstType>(
       size, reinterpret_cast<const DstType*>(field.data()), dst);
 }
 
-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyFromProtoWithCast(
     const size_t size,
     const google::protobuf::RepeatedField<SrcType>& field,
     DstType* dst,
-    Context* context) {
+    BaseContext* context) {
   CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
   // TODO: we are having one unnecessary copy here if the context is already
   // CPUContext. Remove it if it is performance critical.
@@ -177,410 +178,10 @@ inline void CopyFromProtoWithCast(
   for (int i = 0; i < size; ++i) {
     buffer[i] = static_cast<DstType>(src[i]);
   }
-  context->template Copy<DstType, CPUContext, Context>(size, buffer.get(), dst);
+  context->template CopyFromCPU<DstType>(size, buffer.get(), dst);
 }
 
 }  // namespace detail
-
-template <class Context>
-void TensorSerializer<Context>::Serialize(
-    const Blob& blob,
-    const string& name,
-    BlobSerializerBase::SerializationAcceptor acceptor) {
-  this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
-}
-
-template <class Context>
-void TensorSerializer<Context>::SerializeWithChunkSize(
-    const Blob& blob,
-    const string& name,
-    BlobSerializerBase::SerializationAcceptor acceptor,
-    int chunk_size) {
-  CAFFE_ENFORCE(blob.IsType<Tensor<Context>>());
-  const auto& tensor = blob.template Get<Tensor<Context>>();
-  if (chunk_size == kNoChunking) {
-    chunk_size = tensor.size() + 1; // to account for empty tensors
-  } else if (chunk_size == kDefaultChunkSize) {
-    chunk_size = FLAGS_caffe2_tensor_chunk_size;
-  }
-
-  auto processChunk = [&](int64_t chunkStart) {
-    BlobProto blob_proto;
-    blob_proto.set_name(name);
-    blob_proto.set_type(kTensorBlobType);
-    TensorProto& proto = *blob_proto.mutable_tensor();
-    proto.set_name(name);
-    this->Serialize(
-        tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
-    acceptor(
-        MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
-        blob_proto.SerializeAsString());
-  };
-
-#ifndef __ANDROID__
-  std::vector<std::future<void>> futures;
-  // Poorman's IOBound ThreadPool
-  SimpleQueue<size_t> chunkQueue;
-  auto task = [&]() {
-    size_t chunkStart;
-    while (chunkQueue.Pop(&chunkStart)) {
-      processChunk(chunkStart);
-    }
-  };
-  if (tensor.size() > chunk_size) {
-    for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
-      futures.emplace_back(std::async(std::launch::async, task));
-    }
-  }
-#endif
-
-  VLOG(1) << "Serializing blob " << name;
-  // Serialize whole vector. If vector is empty, it's shape still needs to be
-  // serialized in empty proto
-  for (size_t chunkBegin = 0;
-       chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
-       chunkBegin += chunk_size) {
-    VLOG(2) << "Starting a chunk at " << chunkBegin;
-#ifndef __ANDROID__
-    if (tensor.size() > chunk_size) {
-      chunkQueue.Push(chunkBegin);
-    } else {
-      // Sync mode for small tensors
-      processChunk(chunkBegin);
-    }
-#else
-    // Since Android does not have std::future, we will always do sync mode
-    processChunk(chunkBegin);
-#endif
-  }
-
-#ifndef __ANDROID__
-  chunkQueue.NoMoreJobs();
-  for (auto& fut : futures) {
-    fut.get();
-  }
-#endif
-}
-
-template <class Context>
-void TensorSerializer<Context>::Serialize(
-    const Tensor<Context>& input,
-    const string& /*name*/,
-    TensorProto* proto_ptr,
-    size_t chunkBegin,
-    int32_t chunkSize) {
-  CAFFE_ENFORCE(
-      chunkBegin <= input.size(),
-      "Chunk begin is out of tensor: ",
-      chunkBegin,
-      ' ',
-      input.size());
-  if (chunkBegin + chunkSize > input.size()) {
-    chunkSize = input.size() - chunkBegin;
-  }
-
-  CAFFE_ENFORCE(
-      input.raw_data() || chunkSize == 0,
-      "The input does not have data input yet. This is probably because you "
-      "created a tensor of non-zero shape but never filled its data via "
-      "mutable_data() calls. This means that it makes no sense to serialize "
-      "the tensor content.");
-
-  TensorProto& proto = *proto_ptr;
-  proto.mutable_segment()->set_begin(chunkBegin);
-  proto.mutable_segment()->set_end(chunkBegin + chunkSize);
-
-  for (int i = 0; i < input.ndim(); ++i) {
-    proto.add_dims(input.dim(i));
-  }
-  const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
-  proto.set_data_type(data_type);
-  StoreDeviceDetail(input, &proto);
-
-  // A lot of copypaste is error prone. Should we create a macro for this?
-  switch (data_type) {
-  case TensorProto_DataType_FLOAT:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<float>() + chunkBegin,
-        proto.mutable_float_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT32:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<int>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_BYTE:
-    LOG(FATAL) << "This should not happen. When serializing, "
-                  "BYTE is deprecated and moved to UINT8.";
-    break;
-  case TensorProto_DataType_STRING:
-    {
-      proto.mutable_string_data()->Reserve(chunkSize);
-      const string* content = input.template data<string>();
-      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
-        proto.add_string_data(content[i]);
-      }
-      break;
-    }
-  case TensorProto_DataType_BOOL:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<bool>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_UINT8:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<uint8_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT8:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<int8_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_UINT16:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<uint16_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT16:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<int16_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT64:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<int64_t>() + chunkBegin,
-        proto.mutable_int64_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_FLOAT16: {
-    if (FLAGS_caffe2_serialize_fp16_as_bytes) {
-      const int kValue = 1;
-      CAFFE_ENFORCE_EQ(
-          reinterpret_cast<const char*>(&kValue)[0],
-          1,
-          "Serialization of FLOAT16 on big endian platform "
-          "is not written yet.");
-      unique_ptr<char[]> buffer(new char[2 * chunkSize]);
-      this->context_.template Copy<char, Context, CPUContext>(
-          2 * chunkSize,
-          reinterpret_cast<const char*>(
-              input.template data<float16>() + chunkBegin),
-          buffer.get());
-      this->context_.FinishDeviceComputation();
-      proto.set_byte_data(buffer.release(), 2 * chunkSize);
-    } else {
-      detail::CopyToProtoWithCast(
-          chunkSize,
-          reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
-              chunkBegin,
-          proto.mutable_int32_data(),
-          &this->context_);
-    }
-  } break;
-  case TensorProto_DataType_DOUBLE:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<double>() + chunkBegin,
-        proto.mutable_double_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_UNDEFINED: {
-    proto.mutable_string_data()->Reserve(chunkSize);
-    Blob temp_blob;
-    const char* raw_data = static_cast<const char*>(input.raw_data());
-    for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
-      temp_blob.ShareExternal(
-          const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
-      proto.add_string_data(temp_blob.Serialize(""));
-    }
-  } break;
-    // Note: we intentially do not provide "default:" so if any new data types
-    // are added, the compiler should warn the user to add the case here.
-  }
-}
-
-template <class Context>
-void TensorDeserializer<Context>::Deserialize(
-    const BlobProto& blob_proto,
-    Blob* blob) {
-  Deserialize(blob_proto.tensor(), blob->GetMutable<Tensor<Context>>());
-}
-
-template <class Context>
-void TensorDeserializer<Context>::Deserialize(
-    const TensorProto& proto,
-    Tensor<Context>* tensor) {
-  // We create a local context for deserializing. Since Caffe2 contexts are
-  // usually lightweighted, this should not involve too much overhead.
-  Context context(proto.device_detail());
-  context.SwitchToDevice(0);
-  vector<TIndex> dims;
-  for (const TIndex d : proto.dims()) {
-    dims.push_back(d);
-  }
-  tensor->Resize(dims);
-
-  int64_t chunkBegin = 0;
-  auto chunkEnd = tensor->size();
-  if (proto.has_segment()) {
-    chunkBegin = proto.segment().begin();
-    chunkEnd = proto.segment().end();
-  }
-  CAFFE_ENFORCE(
-      0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
-      "Invalid chunk ",
-      chunkBegin,
-      ' ',
-      chunkEnd,
-      " with total tensor size ",
-      tensor->size());
-  auto chunkSize = chunkEnd - chunkBegin;
-
-  switch (proto.data_type()) {
-    case TensorProto_DataType_FLOAT:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.float_data(),
-          tensor->template mutable_data<float>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT32:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_BYTE:
-      // Since BYTE stores the data in a string field instead of a repreated
-      // field we will have it special cased.
-      CAFFE_ENFORCE_EQ(
-          chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
-      context.template Copy<uint8_t, Context, CPUContext>(
-          chunkSize,
-          reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
-          tensor->template mutable_data<uint8_t>() + chunkBegin);
-      break;
-    case TensorProto_DataType_STRING:
-      // Special handing of string because it is a non-fundamental type.
-      {
-        string* content = tensor->template mutable_data<string>();
-        for (int i = 0; i < chunkSize; ++i) {
-          content[i + chunkBegin] = proto.string_data(i);
-        }
-      }
-      break;
-    case TensorProto_DataType_BOOL:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<bool>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_UINT8:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<uint8_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT8:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int8_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_UINT16:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<uint16_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT16:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int16_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT64:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.int64_data(),
-          tensor->template mutable_data<int64_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_FLOAT16:
-      if (proto.has_byte_data()) {
-        const int kValue = 1;
-        CAFFE_ENFORCE_EQ(
-            reinterpret_cast<const char*>(&kValue)[0],
-            1,
-            "Serialization of FLOAT16 on big endian platform "
-            "is not written yet.");
-        CAFFE_ENFORCE_EQ(
-            2 * chunkSize,
-            proto.byte_data().size(),
-            "Incorrect proto field size.");
-        context.template Copy<float16, Context, CPUContext>(
-            chunkSize,
-            reinterpret_cast<const float16*>(proto.byte_data().data()),
-            tensor->template mutable_data<float16>() + chunkBegin);
-      } else {
-        // Backward compatibility with models which used int32_data field
-        detail::CopyFromProtoWithCast(
-            chunkSize,
-            proto.int32_data(),
-            reinterpret_cast<uint16_t*>(
-                tensor->template mutable_data<float16>()) +
-                chunkBegin,
-            &context);
-      }
-      break;
-    case TensorProto_DataType_DOUBLE:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.double_data(),
-          tensor->template mutable_data<double>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_UNDEFINED: {
-      Blob temp_blob;
-      void* raw_ptr = nullptr;
-      for (int i = 0; i < chunkSize; ++i) {
-        temp_blob.Deserialize(proto.string_data(i));
-        if (i == 0) {
-          raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
-        }
-        temp_blob.meta().copy()(
-            temp_blob.GetRaw(),
-            static_cast<char*>(raw_ptr) +
-                (i + chunkBegin) * temp_blob.meta().itemsize(),
-            1);
-      }
-    }
-  }
-  context.FinishDeviceComputation();
-}
-
 }  // namespace caffe2
 
 #endif  // CAFFE2_CORE_BLOB_SERIALIZATION_H_
diff --git a/caffe2/core/blob_serialization_gpu.cc b/caffe2/core/blob_serialization_gpu.cc
index 76349f3173dbe..4d675354531c8 100644
--- a/caffe2/core/blob_serialization_gpu.cc
+++ b/caffe2/core/blob_serialization_gpu.cc
@@ -4,20 +4,7 @@
 
 namespace caffe2 {
 
-template <>
-void TensorSerializer<CUDAContext>::StoreDeviceDetail(
-    const Tensor<CUDAContext>& input, TensorProto* proto) {
-  auto* device_detail = proto->mutable_device_detail();
-  device_detail->set_device_type(CUDA);
-  device_detail->set_cuda_gpu_id(
-      GetGPUIDForPointer(input.raw_data()));
-}
-
 namespace {
-REGISTER_BLOB_SERIALIZER(
-    (TypeMeta::Id<TensorCUDA>()),
-    TensorSerializer<CUDAContext>);
-REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer<CUDAContext>);
+REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer);
 }
 }  // namespace caffe2
-
diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc
index 3fafbf2fc5028..40e53a2840ae8 100644
--- a/caffe2/core/blob_test.cc
+++ b/caffe2/core/blob_test.cc
@@ -47,7 +47,7 @@ class BlobTestFooSerializer : public BlobSerializerBase {
   BlobTestFooSerializer() {}
   ~BlobTestFooSerializer() {}
   /**
-   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
+   * Serializes a Blob. Note that this blob has to contain Tensor,
    * otherwise this function produces a fatal error.
    */
   void Serialize(
@@ -181,7 +181,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
 
   auto* ptr = tensor.mutable_data<int>();
   EXPECT_TRUE(ptr != nullptr);
@@ -200,7 +200,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {
 
   // share the data with other tensor so that the pointer won't be reused
   // when we reallocate
-  TensorCPU other_tensor(dims);
+  Tensor other_tensor(dims, CPU);
   other_tensor.ShareData(tensor);
   // but double is bigger, so it should allocate a new one
   auto* doubleptr = tensor.mutable_data<double>();
@@ -215,7 +215,7 @@ TEST(TensorNonTypedTest, NonDefaultConstructible) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
 
   // this doesn't compile - good!
   // auto* ptr = tensor.mutable_data<BlobTestNonDefaultConstructible>();
@@ -232,7 +232,7 @@ TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
 TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);
 
 TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
-  TensorCPU tensor;
+  Tensor tensor(CPU);
   EXPECT_EQ(tensor.ndim(), 0);
   vector<int> dims(3);
   dims[0] = 2;
@@ -253,7 +253,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 3);
@@ -279,7 +279,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedZeroDim) {
   dims[0] = 2;
   dims[1] = 0;
   dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 0);
@@ -293,7 +293,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 3);
@@ -317,7 +317,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {
 
 TYPED_TEST(TensorCPUTest, TensorInitializedScalar) {
   vector<int> dims;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   EXPECT_EQ(tensor.ndim(), 0);
   EXPECT_EQ(tensor.size(), 1);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
@@ -329,8 +329,8 @@ TYPED_TEST(TensorCPUTest, TensorShareData) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(dims, CPU);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@@ -349,7 +349,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointer) {
   dims[1] = 3;
   dims[2] = 5;
   std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2*3*5]);
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   tensor.ShareExternalPointer(raw_buffer.get());
   EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
   EXPECT_EQ(tensor.data<TypeParam>(), raw_buffer.get());
@@ -366,7 +366,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {
   dims[1] = 3;
   dims[2] = 5;
   std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2 * 3 * 5]);
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   TypeMeta meta = TypeMeta::Make<TypeParam>();
   tensor.ShareExternalPointer(raw_buffer.get(), meta);
   EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
@@ -380,7 +380,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {
 
 TYPED_TEST(TensorCPUTest, CannotShareDataWhenShapeNotSet) {
   std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[10]);
-  TensorCPU tensor;
+  Tensor tensor(CPU);
   ASSERT_THROW(tensor.ShareExternalPointer(raw_buffer.get()), EnforceNotMet);
 }
 
@@ -391,8 +391,8 @@ TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
   dims[2] = 5;
   vector<int> alternate_dims(1);
   alternate_dims[0] = 2 * 3 * 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(alternate_dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(alternate_dims, CPU);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(other_tensor.ndim(), 1);
@@ -413,8 +413,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterResize) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(dims, CPU);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@@ -431,8 +431,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterFreeMemory) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(dims, CPU);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@@ -449,7 +449,7 @@ TYPED_TEST(TensorCPUTest, KeepOnShrink) {
   FLAGS_caffe2_max_keep_on_shrink_memory = LLONG_MAX;
 
   vector<int> dims{2, 3, 5};
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   TypeParam* ptr = tensor.mutable_data<TypeParam>();
   EXPECT_TRUE(ptr != nullptr);
   // Expanding - will reallocate
@@ -480,7 +480,7 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
   FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 4 * sizeof(TypeParam);
 
   vector<int> dims{1, 8, 8};
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
   TypeParam* ptr = tensor.mutable_data<TypeParam>();
   EXPECT_TRUE(ptr != nullptr);
   // Shrinking - will not reallocate
@@ -501,19 +501,19 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
 }
 
 TYPED_TEST(TensorCPUDeathTest, CannotAccessRawDataWhenEmpty) {
-  TensorCPU tensor;
+  Tensor tensor(CPU);
   EXPECT_EQ(tensor.ndim(), 0);
   ASSERT_ANY_THROW(tensor.raw_data());
 }
 
 TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
-  TensorCPU tensor;
+  Tensor tensor(CPU);
   EXPECT_EQ(tensor.ndim(), 0);
   ASSERT_ANY_THROW(tensor.data<TypeParam>());
 }
 
 TEST(TensorTest, TensorNonFundamentalType) {
-  TensorCPU tensor(vector<int>{2, 3, 4});
+  Tensor tensor(vector<int>{2, 3, 4}, CPU);
   EXPECT_TRUE(tensor.mutable_data<std::string>() != nullptr);
   const std::string* ptr = tensor.data<std::string>();
   for (int i = 0; i < tensor.size(); ++i) {
@@ -522,14 +522,14 @@ TEST(TensorTest, TensorNonFundamentalType) {
 }
 
 TEST(TensorTest, TensorNonFundamentalTypeClone) {
-  TensorCPU tensor(vector<int>{2, 3, 4});
+  Tensor tensor(vector<int>{2, 3, 4}, CPU);
   std::string* ptr = tensor.mutable_data<std::string>();
   EXPECT_TRUE(ptr != nullptr);
   for (int i = 0; i < tensor.size(); ++i) {
     EXPECT_TRUE(ptr[i] == "");
     ptr[i] = "filled";
   }
-  TensorCPU dst_tensor = tensor.Clone();
+  Tensor dst_tensor = tensor.Clone();
   const std::string* dst_ptr = dst_tensor.data<std::string>();
   for (int i = 0; i < dst_tensor.size(); ++i) {
     EXPECT_TRUE(dst_ptr[i] == "filled");
@@ -549,7 +549,7 @@ TEST(TensorTest, Tensor64BitDimension) {
   // Initialize a large tensor.
   TIndex large_number =
       static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-  TensorCPU tensor(vector<TIndex>{large_number});
+  Tensor tensor(vector<TIndex>{large_number}, CPU);
   EXPECT_EQ(tensor.ndim(), 1);
   EXPECT_EQ(tensor.dim(0), large_number);
   EXPECT_EQ(tensor.size(), large_number);
@@ -581,7 +581,7 @@ TEST(TensorTest, Tensor64BitDimension) {
 TEST(TensorDeathTest, CannotCastDownLargeDims) {
   TIndex large_number =
       static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-  TensorCPU tensor(vector<TIndex>{large_number});
+  Tensor tensor(vector<TIndex>{large_number}, CPU);
   EXPECT_EQ(tensor.ndim(), 1);
   EXPECT_EQ(tensor.dim(0), large_number);
   ASSERT_THROW(tensor.dim32(0), EnforceNotMet);
@@ -590,7 +590,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
 #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name)               \
   TEST(TensorTest, TensorSerialization_##TypeParam) {                     \
     Blob blob;                                                            \
-    TensorCPU* tensor = blob.GetMutable<TensorCPU>();                     \
+    Tensor* tensor = blob.GetMutableTensor(CPU);                          \
     tensor->Resize(2, 3);                                                 \
     for (int i = 0; i < 6; ++i) {                                         \
       tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i);   \
@@ -611,7 +611,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
     }                                                                     \
     Blob new_blob;                                                        \
     EXPECT_NO_THROW(new_blob.Deserialize(serialized));                    \
-    EXPECT_TRUE(new_blob.IsType<TensorCPU>());                            \
+    EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));                            \
     const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
     EXPECT_EQ(new_tensor.ndim(), 2);                                      \
     EXPECT_EQ(new_tensor.dim(0), 2);                                      \
@@ -624,7 +624,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
                                                                           \
   TEST(EmptyTensorTest, TensorSerialization_##TypeParam) {                \
     Blob blob;                                                            \
-    TensorCPU* tensor = blob.GetMutable<TensorCPU>();                     \
+    TensorCPU* tensor = blob.GetMutableTensor(CPU);                       \
     tensor->Resize(0, 3);                                                 \
     tensor->mutable_data<TypeParam>();                                    \
     string serialized = blob.Serialize("test");                           \
@@ -640,7 +640,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
     EXPECT_EQ(tensor_proto.field_name##_size(), 0);                       \
     Blob new_blob;                                                        \
     EXPECT_NO_THROW(new_blob.Deserialize(serialized));                    \
-    EXPECT_TRUE(new_blob.IsType<TensorCPU>());                            \
+    EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));                            \
     const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
     EXPECT_EQ(new_tensor.ndim(), 2);                                      \
     EXPECT_EQ(new_tensor.dim(0), 0);                                      \
@@ -659,7 +659,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data)
 
 TEST(TensorTest, TensorSerialization_CustomType) {
   Blob blob;
-  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+  TensorCPU* tensor = blob.GetMutableTensor(CPU);
   tensor->Resize(2, 3);
   for (int i = 0; i < 6; ++i) {
     tensor->mutable_data<BlobTestFoo>()[i].val = i;
@@ -671,7 +671,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
   EXPECT_EQ(proto.type(), "Tensor");
   Blob new_blob;
   EXPECT_NO_THROW(new_blob.Deserialize(serialized));
-  EXPECT_TRUE(new_blob.IsType<TensorCPU>());
+  EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
   const TensorCPU& new_tensor = blob.Get<TensorCPU>();
   EXPECT_EQ(new_tensor.ndim(), 2);
   EXPECT_EQ(new_tensor.dim(0), 2);
@@ -686,7 +686,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
 TEST(TensorTest, float16) {
   const TIndex kSize = 3000000;
   Blob blob;
-  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+  TensorCPU* tensor = blob.GetMutableTensor(CPU);
   tensor->Resize(kSize);
   for (int i = 0; i < tensor->size(); ++i) {
     tensor->mutable_data<float16>()[i].x = i % 10000;
@@ -714,7 +714,7 @@ TEST(TensorTest, float16) {
   }
   Blob new_blob;
   EXPECT_NO_THROW(new_blob.Deserialize(serialized));
-  EXPECT_TRUE(new_blob.IsType<TensorCPU>());
+  EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
   const TensorCPU& new_tensor = blob.Get<TensorCPU>();
   EXPECT_EQ(new_tensor.ndim(), 1);
   EXPECT_EQ(new_tensor.dim(0), kSize);
@@ -850,7 +850,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
   {
     VLOG(1) << "Test begin";
     Blob blob;
-    TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+    Tensor* tensor = blob.GetMutableTensor(CPU);
     VLOG(1) << "Allocating blob";
     tensor->Resize(d1, d2);
     auto mutableData = tensor->mutable_data<TypeParam>();
@@ -893,7 +893,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
     load_op->Run();
     VLOG(1) << "Reading blob from workspace";
     auto new_blob = ws.GetBlob("test");
-    EXPECT_TRUE(new_blob->IsType<TensorCPU>());
+    EXPECT_TRUE(new_blob->IsType<Tensor>(CPU));
     const auto& new_tensor = new_blob->Get<TensorCPU>();
 
     EXPECT_EQ(new_tensor.ndim(), d1);
@@ -1020,7 +1020,7 @@ TEST(CustomChunkSize, BigTensorSerialization) {
   int64_t size = d1 * d2;
 
   Blob blob;
-  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+  TensorCPU* tensor = blob.GetMutableTensor(CPU);
   tensor->Resize(d1, d2);
   tensor->mutable_data<float>();
   std::mutex mutex;
@@ -1070,10 +1070,9 @@ TEST(BlobTest, CastingMessage) {
 }
 
 TEST(TensorConstruction, UnitializedCopyTest) {
-  CPUContext context;
-  TensorCPU x;
-  TensorCPU y(x, &context);
-  TensorCPU z = x.Clone();
+  Tensor x(CPU);
+  Tensor y(x, CPU);
+  Tensor z = x.Clone();
   // should be uninitialized
   EXPECT_EQ(x.size(), -1);
   EXPECT_EQ(y.size(), -1);
@@ -1082,14 +1081,11 @@ TEST(TensorConstruction, UnitializedCopyTest) {
 }
 
 TEST(TensorConstruction, CopyConstructorTest) {
-  CPUContext context;
-
-  TensorCPU x;
+  Tensor x(CPU);
   x.Resize(5);
   x.mutable_data<float>()[0] = 1;
-  TensorCPU y = x.Clone();
-  TensorCPU z(x, &context);
-  TensorCPU w;
+  Tensor y = x.Clone();
+  Tensor z(x, CPU);
 
   EXPECT_EQ(*x.data<float>(), 1);
   EXPECT_EQ(*y.data<float>(), 1);
@@ -1100,13 +1096,12 @@ TEST(TensorConstruction, CopyConstructorTest) {
   EXPECT_EQ(*z.data<float>(), 1);
 }
 
-TEST(TensorConstruction, MoveConstructorTest) {
-  CPUContext context;
-
-  TensorCPU x;
+TEST(TensorConstruction, MoveAssignmentOpTest) {
+  Tensor x(CPU);
   x.Resize(5);
   x.mutable_data<float>()[0] = 1;
-  TensorCPU y = std::move(x);
+  Tensor y(CPU);
+  y = std::move(x);
 
   EXPECT_EQ(*y.data<float>(), 1);
 }
diff --git a/caffe2/core/context.cc b/caffe2/core/context.cc
index 427e8bb60aa31..05af9a8cdb12c 100644
--- a/caffe2/core/context.cc
+++ b/caffe2/core/context.cc
@@ -7,6 +7,12 @@
 
 namespace caffe2 {
 
+// We put this here because context.h rather than context_base.h is included in
+// user code
+// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
+CAFFE2_API BaseStaticContext*
+    BaseContext::static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
+
 uint32_t RandomNumberSeed() {
   // Originally copied from folly::randomNumberSeed (at 418ad4)
   // modified to use chrono instead of sys/time.h
@@ -24,4 +30,11 @@ uint32_t RandomNumberSeed() {
       kPrime2 * tv_sec + kPrime3 * tv_usec;
 }
 
+BaseStaticContext* GetCPUStaticContext() {
+  static CPUStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(CPU, GetCPUStaticContext());
+
 } // namespace caffe2
diff --git a/caffe2/core/context.h b/caffe2/core/context.h
index df3b0f20772d6..017bc51744800 100644
--- a/caffe2/core/context.h
+++ b/caffe2/core/context.h
@@ -7,6 +7,7 @@
 #include <unordered_map>
 
 #include "caffe2/core/allocator.h"
+#include "caffe2/core/context_base.h"
 #include "caffe2/core/event.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/typeid.h"
@@ -16,6 +17,8 @@ CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage);
 
 namespace caffe2 {
 
+BaseStaticContext* GetCPUStaticContext();
+
 /**
  * A function to generate a random number seed that is unique in a best-effort
  * basis, using an ever-incrementing seed and the current time.
@@ -26,44 +29,15 @@ uint32_t RandomNumberSeed();
  * The CPU Context, representing the bare minimum of what a Context class in
  * Caffe2 should implement.
  *
+ * // TODO modify docs
  * See operator.h, especially Operator<Context>, for how Context are used in
  * actual operator implementations that are associated with specific devices.
  * In general, the Context class is passed in as a template argument, and
  * the operator can use the functions defined in the context to execute whatever
  * computation it has.
  *
- * A Context defines all the necessities to run an operator on a specific
- * device. Specific Context classes have the freedom to choose what functions it
- * implements, but there are a few functions that you should consider
- * implementing if you want to write your own context class:
- * - void SwitchToDevice(): any necessary code to switch to the device before
- *     running anything.
- * - void WaitEvent(const Event& ev): make the current context to wait on
- *     an event. For example, for cuda, this is the equivalent of
- *     cudaStreamWaitEvent. For CPU context, it essentially synchronizes the
- *     event.
- * - void Record(Event* ev): record the async activities on the current context
- *     to the event. For example, for cuda, this is the equivalent of
- *     cudaEventRecord on the current stream. For CPU context, it is always
- *     synchronous.
- * - void FinishDeviceComputation(): any wrapping-up work after all the
- *     computation of the operator is done. If there are errors during the
- *     execution, throw exception. For example, in a CUDAContext, this function
- *     carries out a stream synchronization and spots potential errors for
- *     the cuda kernel calls.
- * - static std::pair<void*, MemoryDeleter> New(size_t nbytes): allocates
-       memory and returns a deleter.
- * - template <class SrcContext, class DstContext> void CopyBytes(...): does
- *     cross context memory copy.
- * - template <typename T, class SrcContext, class DstContext> void Copy(...):
- *     usually a simple wrapper around the above CopyBytes function.
- *
- * We intentionally did not create a base class for the various possible Context
- * classes there might be, since they are intended to be specified during
- * compile time using templates rather than via polymorphism. You should also
- * not have classes derived from existing context classes.
  */
-class CPUContext final {
+class CPUContext final : public BaseContext {
  public:
   typedef std::mt19937 rand_gen_type;
   CPUContext() : random_seed_(RandomNumberSeed()) {}
@@ -74,23 +48,30 @@ class CPUContext final {
     CAFFE_ENFORCE_EQ(option.device_type(), CPU);
   }
 
-  ~CPUContext() noexcept {}
+  ~CPUContext() noexcept override {}
+
+  BaseStaticContext* GetStaticContext() const override {
+    return GetCPUStaticContext();
+  }
 
-  inline void SwitchToDevice(int /*stream_id*/) {}
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
+  static BaseStaticContext* StaticContext() {
+    return GetCPUStaticContext();
   }
 
-  inline void WaitEvent(const Event& ev) {
+  inline void SwitchToDevice(int /*stream_id*/) override {}
+
+  using BaseContext::SwitchToDevice;
+
+  inline void WaitEvent(const Event& ev) override {
     ev.Wait(CPU, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(CPU, this, err_msg);
   }
 
-  inline void FinishDeviceComputation() {}
+  inline void FinishDeviceComputation() override {}
 
   inline rand_gen_type& RandGenerator() {
     if (!random_generator_.get()) {
@@ -99,16 +80,35 @@ class CPUContext final {
     return *random_generator_.get();
   }
 
-  static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
-    if (FLAGS_caffe2_report_cpu_memory_usage) {
-      reporter_.New(data_and_deleter.first, nbytes);
-      data_and_deleter.second = ReportAndDelete;
+  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+    return StaticContext()->New(nbytes);
+  }
+
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
+      override {
+    if (nbytes == 0) {
+      return;
     }
-    return data_and_deleter;
+    CAFFE_ENFORCE(src);
+    CAFFE_ENFORCE(dst);
+    memcpy(dst, src, nbytes);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  bool SupportsNonFundamentalTypes() const override {
+    // CPU non fumdamental type copy OK
+    return true;
   }
 
-  // Two copy functions that deals with cross-device copies.
   template <class SrcContext, class DstContext>
   inline void CopyBytes(size_t nbytes, const void* src, void* dst);
 
@@ -147,26 +147,31 @@ class CPUContext final {
 
   // CPU streams are not implemented and are silently ignored by CPU ops,
   // return true to signal executor to schedule a CPU op
-  static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
+  static bool IsStreamFree(
+      const DeviceOption& /* option */,
+      int /* stream_id */) {
     return true;
   }
 
+  DeviceType GetDevicetype() const override {
+    return CPU;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return CPU;
+  }
+
  protected:
   // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
   int random_seed_{1701};
   std::unique_ptr<rand_gen_type> random_generator_;
-  CAFFE2_API static MemoryAllocationReporter reporter_;
-
- private:
-  static void ReportAndDelete(void* ptr) {
-    reporter_.Delete(ptr);
-    GetCPUAllocator()->GetDeleter()(ptr);
-  }
 };
 
-template<>
+template <>
 inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
-    size_t nbytes, const void* src, void* dst) {
+    size_t nbytes,
+    const void* src,
+    void* dst) {
   if (nbytes == 0) {
     return;
   }
@@ -175,6 +180,41 @@ inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
   memcpy(dst, src, nbytes);
 }
 
+// TODO(jerryzh): merge CPUStaticContext with Allocator
+class CPUStaticContext : public BaseStaticContext {
+ public:
+  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
+    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
+    if (FLAGS_caffe2_report_cpu_memory_usage) {
+      reporter_.New(data_and_deleter.first, nbytes);
+      data_and_deleter.second = ReportAndDelete;
+    }
+    return data_and_deleter;
+  }
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<CPUContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<CPUContext>(option);
+  }
+
+  DeviceType GetDeviceType() override {
+    return CPU;
+  }
+
+ protected:
+  CAFFE2_API static MemoryAllocationReporter reporter_;
+
+ private:
+  static void ReportAndDelete(void* ptr) {
+    reporter_.Delete(ptr);
+    GetCPUAllocator()->GetDeleter()(ptr);
+  }
+};
+
 }  // namespace caffe2
 
 #endif  // CAFFE2_CORE_CONTEXT_H_
diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc
new file mode 100644
index 0000000000000..08ff7755121cd
--- /dev/null
+++ b/caffe2/core/context_base.cc
@@ -0,0 +1,5 @@
+#include "context_base.h"
+
+namespace caffe2 {
+
+}
diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h
new file mode 100644
index 0000000000000..c3b3b4958910a
--- /dev/null
+++ b/caffe2/core/context_base.h
@@ -0,0 +1,187 @@
+#pragma once
+
+#include <cstdlib>
+#include <ctime>
+#include <memory>
+#include <unordered_map>
+
+#include "caffe2/core/allocator.h"
+#include "caffe2/core/event.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/core/typeid.h"
+#include "caffe2/proto/caffe2.pb.h"
+
+namespace caffe2 {
+
+class BaseContext;
+
+/* BaseStaticContext defines the interface for static context, which contains
+   functions that are invoked statically before in Tensor class, e.g. New,
+   We will merge this with Allocator later.
+ */
+class BaseStaticContext {
+ public:
+  virtual ~BaseStaticContext() noexcept {}
+
+  virtual std::pair<void*, MemoryDeleter> New(size_t nbytes) const = 0;
+
+  virtual std::unique_ptr<BaseContext> CreateContext() = 0;
+
+  virtual std::unique_ptr<BaseContext> CreateContext(const DeviceOption&) = 0;
+
+  virtual DeviceType GetDeviceType() = 0;
+
+  /*
+   * @brief: Sets the DeviceOption for argument `device` based on the
+   * current context and the a data pointer
+   */
+  virtual void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) {
+    device->set_device_type(GetDeviceType());
+  }
+};
+
+/**
+ * Virtual interface for the Context class in Caffe2.
+ *
+ * A Context defines all the necessities to run an operator on a specific
+ * device. Specific Context classes needs to implement all the pure virtual
+ * functions in the BaseContext class.
+ * TODO: add docs after this is finalized.
+ */
+class BaseContext {
+ public:
+  virtual ~BaseContext() noexcept {}
+
+  virtual BaseStaticContext* GetStaticContext() const = 0;
+
+  /* Sorry for the naming, will get rid of this in future diff */
+  virtual DeviceType GetDevicetype() const = 0;
+
+  virtual void SwitchToDevice(int /*stream_id*/) = 0;
+
+  inline void SwitchToDevice() {
+    SwitchToDevice(0);
+  }
+
+  virtual void WaitEvent(const Event& ev) = 0;
+
+  virtual void Record(Event* ev, const char* err_msg = nullptr) const = 0;
+
+  virtual void FinishDeviceComputation() = 0;
+
+  // This used to be arbitrary cross-device copy, but it turns out everyone
+  // did direct CPU-X copy, so we just make three functions for it (to avoid
+  // double dispatch).  This will get obsoleted by C10. where copies
+  // will be proper operators (and get to rely on multiple dispatch there.)
+  virtual void
+  CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) = 0;
+
+  virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0;
+
+  virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0;
+
+  virtual void CopyBytesToDevice(
+      size_t nbytes,
+      const void* src,
+      void* dst,
+      DeviceType type) {
+    if (type == CPU) {
+      CopyBytesToCPU(nbytes, src, dst);
+    } else if (type == GetDevicetype()) {
+      CopyBytesSameDevice(nbytes, src, dst);
+    } else {
+      CAFFE_THROW("CopyBytesToDevice can only copy to CPU or between same "
+                  "device. Can't copy from: ", GetDevicetype(), " to", type);
+    }
+  }
+
+  template <typename T>
+  inline void CopySameDevice(size_t n, const T* src, T* dst) {
+    static_assert(
+        std::is_fundamental<T>::value,
+        "CopySameDevice requires fundamental types");
+    CopyBytesSameDevice(
+        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
+  }
+
+  template <typename T>
+  inline void CopyFromCPU(size_t n, const T* src, T* dst) {
+    static_assert(
+        std::is_fundamental<T>::value,
+        "CopyFromCPU requires fundamental types");
+    CopyBytesFromCPU(
+        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
+  }
+
+  template <typename T>
+  inline void CopyToCPU(size_t n, const T* src, T* dst) {
+    static_assert(
+        std::is_fundamental<T>::value, "CopyToCPU requires fundamental types");
+    CopyBytesToCPU(
+        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
+  }
+
+  virtual bool SupportsNonFundamentalTypes() const {
+    return false;
+  }
+
+  inline void EnforceMetaCopyOK() {
+    CAFFE_ENFORCE(
+        SupportsNonFundamentalTypes(), "Context requires fundamental types");
+  }
+
+  inline void CopyItemsSameDevice(
+      const TypeMeta& meta,
+      size_t n,
+      const void* src,
+      void* dst) {
+    if (meta.copy()) {
+      EnforceMetaCopyOK();
+      meta.copy()(src, dst, n);
+    } else {
+      CopyBytesSameDevice(n * meta.itemsize(), src, dst);
+    }
+  }
+
+  inline void
+  CopyItemsFromCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
+    if (meta.copy()) {
+      EnforceMetaCopyOK();
+      meta.copy()(src, dst, n);
+    } else {
+      CopyBytesFromCPU(n * meta.itemsize(), src, dst);
+    }
+  }
+
+  inline void
+  CopyItemsToCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
+    if (meta.copy()) {
+      EnforceMetaCopyOK();
+      meta.copy()(src, dst, n);
+    } else {
+      CopyBytesToCPU(n * meta.itemsize(), src, dst);
+    }
+  }
+
+  CAFFE2_API static BaseStaticContext*
+      static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
+
+  template <int d>
+  friend struct StaticContextFunctionRegisterer;
+};
+
+template <int d>
+struct StaticContextFunctionRegisterer {
+  explicit StaticContextFunctionRegisterer(BaseStaticContext* ptr) {
+    static_assert(d < COMPILE_TIME_MAX_DEVICE_TYPES, "");
+    BaseContext::static_context_[d] = ptr;
+  }
+};
+
+#define REGISTER_STATIC_CONTEXT(d, f)                                \
+  namespace {                                                        \
+  static StaticContextFunctionRegisterer<d> g_static_context_##d(f); \
+  }
+
+#define GET_STATIC_CONTEXT(d) BaseContext::static_context_[d]
+} // namespace caffe2
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index b8f14d80b54e8..ad0d80774450a 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -59,7 +59,6 @@ CAFFE2_DEFINE_int(
 
 namespace caffe2 {
 
-
 thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
 
 // TODO(jiayq): these variables shouldn't be currently accessed during static
@@ -100,19 +99,6 @@ CudaMemoryPoolType GetCudaMemoryPoolType() {
   return g_cuda_memory_pool_type;
 }
 
-vector<TIndex> GetCUDATensorInfo(
-    const void* c,
-    bool* shares_data,
-    size_t* capacity,
-    DeviceOption* device) {
-  vector<TIndex> dims =
-      GetTensorInfo<CUDAContext>(c, shares_data, capacity, device);
-  const Tensor<CUDAContext>* tc = static_cast<const Tensor<CUDAContext>*>(c);
-  device->set_device_type(CUDA);
-  device->set_cuda_gpu_id(GetGPUIDForPointer(tc->raw_data()));
-  return dims;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // A wrapper to allow us to lazily initialize all cuda environments that Caffe
 // uses. This gets done the first time a caffe2::CUDAContext::New() gets called
@@ -163,14 +149,6 @@ static void Caffe2InitializeCuda() {
     }
   }
 
-  RegisterTypeCallFunction(
-    TypeMeta::Id<Tensor<CUDAContext>>(),
-    GetTensorType<CUDAContext>
-  );
-
-  RegisterTensorInfoFunction(
-      TypeMeta::Id<Tensor<CUDAContext>>(), GetCUDATensorInfo);
-
 #ifdef CAFFE2_USE_CUDNN
   // Check the versions of cuDNN that were compiled and linked with are compatible
   CheckCuDNNVersions();
@@ -252,21 +230,6 @@ struct Caffe2CudaInitializerHelper {
     }
   }
 };
-
-struct TensorCUDAStatGetter : BlobStatGetter {
-  size_t sizeBytes(const Blob& blob) const override {
-    const auto& tensor = blob.Get<TensorCUDA>();
-    auto nbytes = tensor.nbytes();
-    if (nbytes > 0 && tensor.IsType<std::string>()) {
-      const auto* data = tensor.data<std::string>();
-      for (int i = 0; i < tensor.size(); ++i) {
-        nbytes += data[i].size();
-      }
-    }
-    return nbytes;
-  }
-};
-REGISTER_BLOB_STAT_GETTER(TensorCUDA, TensorCUDAStatGetter);
 } // namespace
 
 /**
@@ -343,7 +306,7 @@ void TrackMemoryAlloc(size_t nbytes) {
 }
 }
 
-std::pair<void*, MemoryDeleter> CUDAContext::New(size_t nbytes) {
+std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
   // Lock the mutex
   std::lock_guard<std::mutex> lock(CUDAContext::mutex());
   // A one-time caffe2 cuda initializer.
@@ -381,7 +344,7 @@ std::pair<void*, MemoryDeleter> CUDAContext::New(size_t nbytes) {
   return {nullptr, Delete};
 }
 
-void CUDAContext::Delete(void* ptr) {
+void CUDAStaticContext::Delete(void* ptr) {
   // lock the mutex
   std::lock_guard<std::mutex> lock(CUDAContext::mutex());
 
@@ -433,4 +396,11 @@ void CUDAContext::Delete(void* ptr) {
   }
 }
 
+BaseStaticContext* GetCUDAStaticContext() {
+  static CUDAStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(CUDA, GetCUDAStaticContext());
+
 }  // namespace caffe2
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
index 1668d4f2ab7e4..a76fcd6a16505 100644
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -7,6 +7,7 @@
 #include "caffe2/core/common.h"
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context.h"
+#include "caffe2/core/context_base.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/numa.h"
 #include "caffe2/core/tensor.h"
@@ -134,37 +135,46 @@ class ThreadLocalCUDAObjects {
 #endif // CAFFE2_USE_CUDNN
 };
 
-class CUDAContext final {
+BaseStaticContext* GetCUDAStaticContext();
+
+class CUDAContext final : public BaseContext {
  public:
   // The default cuda context constructor.
   explicit CUDAContext(const int gpu_id = -1);
   explicit CUDAContext(const DeviceOption& option);
 
-  ~CUDAContext() {
+  ~CUDAContext() override {
     if (curand_generator_) {
       CURAND_CHECK(curandDestroyGenerator(curand_generator_));
     }
     FinishDeviceComputation();
   }
 
-  inline void SwitchToDevice(int stream_id) {
+  BaseStaticContext* GetStaticContext() const override {
+    return GetCUDAStaticContext();
+  }
+
+  static BaseStaticContext* StaticContext() {
+    return GetCUDAStaticContext();
+  }
+
+  inline void SwitchToDevice(int stream_id) override {
     set_stream_id(stream_id);
     CaffeCudaSetDevice(gpu_id_);
   }
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }
 
-  inline void WaitEvent(const Event& ev) {
+  using BaseContext::SwitchToDevice;
+
+  inline void WaitEvent(const Event& ev) override {
     ev.Wait(CUDA, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(CUDA, this, err_msg);
   }
 
-  void FinishDeviceComputation() {
+  void FinishDeviceComputation() override {
     cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
     cudaError_t error = cudaGetLastError();
     if (error != cudaSuccess) {
@@ -211,7 +221,9 @@ class CUDAContext final {
     return curand_generator_;
   }
 
-  static std::pair<void*, MemoryDeleter> New(size_t nbytes);
+  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+    return StaticContext()->New(nbytes);
+  }
 
   // Get a mutex to lock out cudaMalloc / cudaFree calls when
   // NCCL kernels are being launched. Should remove threat of
@@ -233,6 +245,21 @@ class CUDAContext final {
         cuda_objects_.GetStream(gpu_id_, stream_id_)));
   }
 
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
+  }
+
   template <typename T, class SrcContext, class DstContext>
   inline void Copy(int n, const T* src, T* dst) {
     CopyBytes<SrcContext, DstContext>(n * sizeof(T),
@@ -261,8 +288,15 @@ class CUDAContext final {
     return cudaStreamQuery(stream) == cudaSuccess;
   }
 
+  DeviceType GetDevicetype() const override {
+    return CUDA;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return CUDA;
+  }
+
  protected:
-  static void Delete(void* data);
   void set_stream_id(int stream_id) {
     stream_id_ = stream_id;
   }
@@ -350,8 +384,37 @@ struct PinnedCPUAllocator final : CPUAllocator {
   DefaultCPUAllocator baseAllocator_;
 };
 
-// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
-typedef Tensor<CUDAContext> TensorCUDA;
+class CUDAStaticContext final : public BaseStaticContext {
+ public:
+  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<CUDAContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<CUDAContext>(option);
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
+    return caffe2::make_unique<CUDAContext>(gpu_id);
+  }
+
+  DeviceType GetDeviceType() override {
+    return CUDA;
+  }
+
+  void ExtractDeviceOption(DeviceOption* device, const void* data) override {
+    device->set_device_type(GetDeviceType());
+    device->set_cuda_gpu_id(GetGPUIDForPointer(data));
+  }
+
+ protected:
+  static void Delete(void* data);
+};
+
+using TensorCUDA = Tensor;
 
 }  // namespace caffe2
 
diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc
index e2338d7f70481..a6e44846e9e0b 100644
--- a/caffe2/core/context_test.cc
+++ b/caffe2/core/context_test.cc
@@ -26,7 +26,7 @@ TEST(CPUContextTest, TestAllocDealloc) {
   }
   DeviceOption option;
   CPUContext context(option);
-  context.Copy<float, CPUContext, CPUContext>(10, data, dst_data);
+  context.CopyToCPU<float>(10, data, dst_data);
   for (int i = 0; i < 10; ++i) {
     EXPECT_FLOAT_EQ(dst_data[i], i);
   }
diff --git a/caffe2/core/dispatch/CMakeLists.txt b/caffe2/core/dispatch/CMakeLists.txt
index 841bfca164684..c028bfa2b9307 100644
--- a/caffe2/core/dispatch/CMakeLists.txt
+++ b/caffe2/core/dispatch/CMakeLists.txt
@@ -18,6 +18,7 @@ set(TEST_SOURCES
 
 add_library(dispatch OBJECT ${LIB_SOURCES})
 target_enable_style_warnings(dispatch)
+add_dependencies(dispatch Caffe2_PROTO)
 
 if(BUILD_TEST)
     add_executable(dispatch_test ${TEST_SOURCES} $<TARGET_OBJECTS:dispatch>)
diff --git a/caffe2/core/dispatch/OpSchema.h b/caffe2/core/dispatch/OpSchema.h
index bdfd14ed42396..6a7da5a8ea310 100644
--- a/caffe2/core/dispatch/OpSchema.h
+++ b/caffe2/core/dispatch/OpSchema.h
@@ -1,13 +1,12 @@
 #pragma once
 
 #include "caffe2/core/dispatch/DispatchKey.h"
+#include "caffe2/proto/caffe2.pb.h"
 #include "caffe2/utils/Metaprogramming.h"
 #include "caffe2/utils/Array.h"
 
 namespace caffe2 {
-template<class Context> class Tensor;
-class CPUContext;
-class CUDAContext;
+class Tensor;
 }  // namespace caffe2
 
 namespace c10 {
@@ -18,26 +17,29 @@ namespace details {
  * If Arg is a Tensor or reference to a Tensor, provide the member constant value equal to true.  Otherwise
  * return false.
  */
-template<class Arg> using is_tensor_arg = guts::is_instantiation_of<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
+template<class Arg>
+using is_tensor_arg = std::
+  is_same<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
+
+inline DeviceTypeId to_device_type_id(caffe2::DeviceType device_type) {
+  switch (device_type) {
+    case caffe2::CPU:
+      return DeviceTypeId::CPU;
+    case caffe2::CUDA:
+      return DeviceTypeId::CUDA;
+    default:
+      return DeviceTypeId::UNDEFINED;
+  }
+}
 
 // TODO get rid of tensor_to_dispatch_key once c2::Tensor is de-templatized. This then fits into a template lambda instead of a functor.
-template<class TensorType, class Enable = void> struct tensor_to_dispatch_key_ final {};
-template<class TensorType>
-struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CPUContext>>::value>> final {
-    static TensorParameterDispatchKey call(const TensorType& tensor) {
-      return TensorParameterDispatchKey{DeviceTypeId::CPU, LayoutId(0), tensor.meta().id()};
-    }
-};
-template<class TensorType>
-struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CUDAContext>>::value>> final {
-    static TensorParameterDispatchKey call(const TensorType& tensor) {
-      return TensorParameterDispatchKey{DeviceTypeId::CUDA, LayoutId(0), tensor.meta().id()};
-    }
-};
 struct tensor_to_dispatch_key final {
     template<class TensorType>
     TensorParameterDispatchKey operator()(const TensorType& tensor) const {
-      return tensor_to_dispatch_key_<TensorType, void>::call(tensor);
+      return TensorParameterDispatchKey{
+          to_device_type_id(tensor.GetDeviceType()),
+          LayoutId(0),
+          tensor.meta().id()};
     }
 };
 
diff --git a/caffe2/core/dispatch/OpSchema_test.cpp b/caffe2/core/dispatch/OpSchema_test.cpp
index 77936a0347a04..3c079886c989e 100644
--- a/caffe2/core/dispatch/OpSchema_test.cpp
+++ b/caffe2/core/dispatch/OpSchema_test.cpp
@@ -4,16 +4,13 @@
 using namespace c10;
 using namespace caffe2;
 
-static_assert(details::is_tensor_arg<Tensor<CPUContext>>::value, "");
-static_assert(details::is_tensor_arg<const Tensor<CPUContext> &>::value, "");
-static_assert(details::is_tensor_arg<Tensor<CPUContext> &&>::value, "");
-static_assert(details::is_tensor_arg<Tensor<CUDAContext>>::value, "");
-static_assert(details::is_tensor_arg<const Tensor<CUDAContext> &>::value, "");
-static_assert(details::is_tensor_arg<Tensor<CUDAContext> &&>::value, "");
+static_assert(details::is_tensor_arg<Tensor>::value, "");
+static_assert(details::is_tensor_arg<const Tensor &>::value, "");
+static_assert(details::is_tensor_arg<Tensor &&>::value, "");
 static_assert(!details::is_tensor_arg<int>::value, "");
 
 struct SchemaDef final {
-  using Signature = bool (int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int);
+  using Signature = bool (int, Tensor, float, Tensor, Tensor, unsigned int);
   static constexpr guts::array<const char*, 6> parameter_names = {{
       "1", "2", "3", "4", "5", "6"
   }};
@@ -21,4 +18,4 @@ struct SchemaDef final {
 static_assert(6 == OpSchema<SchemaDef>::signature::num_args, "test num_dispatch_args");
 static_assert(3 == OpSchema<SchemaDef>::signature::num_tensor_args, "test num_dispatch_args");
 static_assert(std::is_same<bool, typename OpSchema<SchemaDef>::signature::return_type>::value, "test num_dispatch_args");
-static_assert(std::is_same<guts::typelist::typelist<int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int>, typename OpSchema<SchemaDef>::signature::parameter_types>::value, "test num_dispatch_args");
+static_assert(std::is_same<guts::typelist::typelist<int, Tensor, float, Tensor, Tensor, unsigned int>, typename OpSchema<SchemaDef>::signature::parameter_types>::value, "test num_dispatch_args");
diff --git a/caffe2/core/hip/blob_serialization_hip.cc b/caffe2/core/hip/blob_serialization_hip.cc
index d472456b98ccf..144bc3ce5257f 100644
--- a/caffe2/core/hip/blob_serialization_hip.cc
+++ b/caffe2/core/hip/blob_serialization_hip.cc
@@ -4,17 +4,7 @@
 
 namespace caffe2 {
 
-template <>
-void TensorSerializer<HIPContext>::StoreDeviceDetail(const Tensor<HIPContext>& input,
-                                                     TensorProto* proto)
-{
-    auto* device_detail = proto->mutable_device_detail();
-    device_detail->set_device_type(HIP);
-    device_detail->set_hip_gpu_id(GetGPUIDForPointer(input.raw_data()));
-}
-
 namespace {
-REGISTER_BLOB_SERIALIZER((TypeMeta::Id<TensorHIP>()), TensorSerializer<HIPContext>);
-REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer<HIPContext>);
+REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer);
 }
 } // namespace caffe2
diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc
index 86a5fe6a376c4..889553650a149 100644
--- a/caffe2/core/hip/context_hip.cc
+++ b/caffe2/core/hip/context_hip.cc
@@ -50,8 +50,6 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb,
 
 namespace caffe2 {
 
-CAFFE_KNOWN_TYPE(Tensor<HIPContext>);
-
 thread_local ThreadLocalHIPObjects HIPContext::hip_objects_;
 
 // TODO(jiayq): these variables shouldn't be currently accessed during static
@@ -88,16 +86,6 @@ static long g_last_rep  = 0;
 
 HipMemoryPoolType GetHipMemoryPoolType() { return g_hip_memory_pool_type; }
 
-vector<TIndex>
-GetHipTensorInfo(const void* c, bool* shares_data, size_t* capacity, DeviceOption* device)
-{
-    vector<TIndex> dims          = GetTensorInfo<HIPContext>(c, shares_data, capacity, device);
-    const Tensor<HIPContext>* tc = static_cast<const Tensor<HIPContext>*>(c);
-    device->set_device_type(HIP);
-    device->set_hip_gpu_id(GetGPUIDForPointer(tc->raw_data()));
-    return dims;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // A wrapper to allow us to lazily initialize all HIP environments that Caffe
 // uses. This gets done the first time a caffe2::HIPContext::New() gets called
@@ -151,10 +139,6 @@ static void Caffe2InitializeHip()
         }
     }
 
-    RegisterTypeCallFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetTensorType<HIPContext>);
-
-    RegisterTensorInfoFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetHipTensorInfo);
-
     // CheckMiOpenVersions();
 }
 
@@ -327,20 +311,17 @@ void TrackMemoryAlloc(size_t nbytes)
 }
 }
 
-std::pair<void*, MemoryDeleter> HIPContext::New(size_t nbytes)
-{
-    // Lock the mutex
-    std::lock_guard<std::mutex> lock(HIPContext::mutex());
-    // A one-time caffe2 cuda initializer.
-    static Caffe2HipInitializerHelper g_hip_initializer_;
-    void* ptr = nullptr;
+std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
+  // Lock the mutex
+  std::lock_guard<std::mutex> lock(HIPContext::mutex());
+  // A one-time caffe2 cuda initializer.
+  static Caffe2HipInitializerHelper g_hip_initializer_;
+  void* ptr = nullptr;
 
-    if(FLAGS_caffe2_gpu_memory_tracking)
-    {
-        TrackMemoryAlloc(nbytes);
-    }
-    switch(g_hip_memory_pool_type)
-    {
+  if (FLAGS_caffe2_gpu_memory_tracking) {
+    TrackMemoryAlloc(nbytes);
+  }
+  switch (g_hip_memory_pool_type) {
     case HipMemoryPoolType::NONE:
         HIP_ENFORCE(hipMalloc(&ptr, nbytes));
         if(FLAGS_caffe2_gpu_memory_tracking)
@@ -362,24 +343,21 @@ std::pair<void*, MemoryDeleter> HIPContext::New(size_t nbytes)
     return {nullptr, Delete};
 }
 
-void HIPContext::Delete(void* ptr)
-{
-    // lock the mutex
-    std::lock_guard<std::mutex> lock(HIPContext::mutex());
-
-    if(FLAGS_caffe2_gpu_memory_tracking)
-    {
-        auto sz_it = g_size_map.find(ptr);
-        DCHECK(sz_it != g_size_map.end());
-        auto aff_it = g_hip_device_affiliation.find(ptr);
-        DCHECK(aff_it != g_hip_device_affiliation.end());
-        g_total_mem -= sz_it->second;
-        g_total_by_gpu_map[aff_it->second] -= sz_it->second;
-        g_size_map.erase(sz_it);
-    }
-
-    switch(g_hip_memory_pool_type)
-    {
+void HIPStaticContext::Delete(void* ptr) {
+  // lock the mutex
+  std::lock_guard<std::mutex> lock(HIPContext::mutex());
+
+  if (FLAGS_caffe2_gpu_memory_tracking) {
+    auto sz_it = g_size_map.find(ptr);
+    DCHECK(sz_it != g_size_map.end());
+    auto aff_it = g_hip_device_affiliation.find(ptr);
+    DCHECK(aff_it != g_hip_device_affiliation.end());
+    g_total_mem -= sz_it->second;
+    g_total_by_gpu_map[aff_it->second] -= sz_it->second;
+    g_size_map.erase(sz_it);
+  }
+
+  switch (g_hip_memory_pool_type) {
     case HipMemoryPoolType::NONE:
     {
         // If memory pool is not set up, use simple hipFree.
@@ -415,4 +393,11 @@ void HIPContext::Delete(void* ptr)
     }
 }
 
+BaseStaticContext* GetHIPStaticContext() {
+  static HIPStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(HIP, GetHIPStaticContext());
+
 } // namespace caffe2
diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h
index 577ccd6792824..36644f3715188 100644
--- a/caffe2/core/hip/context_hip.h
+++ b/caffe2/core/hip/context_hip.h
@@ -119,37 +119,46 @@ class ThreadLocalHIPObjects {
   vector<miopenHandle_t> miopen_handles_[CAFFE2_COMPILE_TIME_MAX_HIP_GPUS];
 };
 
-class HIPContext final {
+BaseStaticContext* GetHIPStaticContext();
+
+class HIPContext final : public BaseContext {
  public:
   // The default HIP context constructor.
   explicit HIPContext(const int gpu_id = -1);
   explicit HIPContext(const DeviceOption& option);
 
-  ~HIPContext() {
+  ~HIPContext() override {
     if (hiprand_generator_) {
       HIPRAND_CHECK(hiprandDestroyGenerator(hiprand_generator_));
     }
     FinishDeviceComputation();
   }
 
-  inline void SwitchToDevice(int stream_id) {
+  BaseStaticContext* GetStaticContext() const override {
+    return GetHIPStaticContext();
+  }
+
+  static BaseStaticContext* StaticContext() {
+    return GetHIPStaticContext();
+  }
+
+  inline void SwitchToDevice(int stream_id) override {
     set_stream_id(stream_id);
     CaffeHipSetDevice(gpu_id_);
   }
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }
 
-  inline void WaitEvent(const Event& ev) {
+  using BaseContext::SwitchToDevice;
+
+  inline void WaitEvent(const Event& ev) override {
     ev.Wait(HIP, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(HIP, this, err_msg);
   }
 
-  void FinishDeviceComputation() {
+  void FinishDeviceComputation() override {
     hipStreamSynchronize(hip_objects_.GetStream(gpu_id_, stream_id_));
     hipError_t error = hipGetLastError();
     if (error != hipSuccess) {
@@ -194,7 +203,9 @@ class HIPContext final {
     return hiprand_generator_;
   }
 
-  static std::pair<void*, MemoryDeleter> New(size_t nbytes);
+  static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+    return StaticContext()->New(nbytes);
+  }
 
   // Get a mutex to lock out hipMalloc / hipFree calls when
   // NCCL kernels are being launched. Should remove threat of
@@ -218,6 +229,21 @@ class HIPContext final {
         hip_objects_.GetStream(gpu_id_, stream_id_)));
   }
 
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytes<HIPContext, HIPContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytes<HIPContext, CPUContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytes<CPUContext, HIPContext>(nbytes, src, dst);
+  }
+
   template <typename T, class SrcContext, class DstContext>
   inline void Copy(int n, const T* src, T* dst) {
     CopyBytes<SrcContext, DstContext>(
@@ -245,6 +271,14 @@ class HIPContext final {
     return hipStreamQuery(stream) == hipSuccess;
   }
 
+  DeviceType GetDevicetype() const override {
+    return HIP;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return HIP;
+  }
+
  protected:
   static void Delete(void* data);
   void set_stream_id(int stream_id) {
@@ -338,8 +372,38 @@ struct PinnedCPUAllocator final : CPUAllocator {
   DefaultCPUAllocator baseAllocator_;
 };
 
-// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
-typedef Tensor<HIPContext> TensorHIP;
+class HIPStaticContext final : public BaseStaticContext {
+ public:
+  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<HIPContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<HIPContext>(option);
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
+    return caffe2::make_unique<HIPContext>(gpu_id);
+  }
+
+  DeviceType GetDeviceType() override {
+    return HIP;
+  }
+
+
+  void ExtractDeviceOption(DeviceOption* device, const void* data) override {
+    device->set_device_type(GetDeviceType());
+    device->set_hip_gpu_id(GetGPUIDForPointer(data));
+  }
+
+ protected:
+  static void Delete(void* data);
+};
+
+typedef Tensor TensorHIP;
 
 } // namespace caffe2
 
diff --git a/caffe2/core/int8_serialization.cc b/caffe2/core/int8_serialization.cc
index 4003c1f1384e8..190cf5797f01f 100644
--- a/caffe2/core/int8_serialization.cc
+++ b/caffe2/core/int8_serialization.cc
@@ -56,7 +56,7 @@ class Int8TensorCPUSerializer : public BlobSerializerBase {
   CPUContext context_;
 };
 
-class Int8TensorCPUDeserializer : public TensorDeserializer<CPUContext> {
+class Int8TensorCPUDeserializer : public TensorDeserializer {
  public:
   void Deserialize(const BlobProto& blob_proto, Blob* blob) override {
     const QTensorProto& proto = blob_proto.qtensor();
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index 325ccd3761afb..45f44049e49c6 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -79,11 +79,45 @@ class OperatorBase : public Observable<OperatorBase> {
     }
   }
 
+  // TODO(jerryzh): Remove template
+  // and the type argument?
+  // This is to keep the API changes minimal and make refactoring
+  // a bit easier
+  template <typename T>
+  inline const T& Input(int idx, DeviceType type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "Input(int, DeviceType) is only available for Tensor");
+    DCHECK_LT(idx, inputs_.size());
+    try {
+      // TODO(jerryzh): We'll need to check device type in Get<T>() later
+      // Get<T>() -> Get<T>(type)
+      const auto& tensor = inputs_.at(idx)->template Get<T>();
+      return tensor;
+    } catch (::caffe2::EnforceNotMet& enf) {
+      if (has_debug_def()) {
+        enf.AppendMessage(".\nOffending Blob name: ");
+        enf.AppendMessage(debug_def().input(idx));
+        enf.AppendMessage(".\n");
+      }
+      throw enf;
+    }
+  }
+
   template <typename T>
   inline T* Output(int idx) {
     return outputs_.at(idx)->template GetMutable<T>();
   }
 
+  // TODO(jerryzh): Remove this template
+  template <typename T>
+  inline T* Output(int idx, DeviceType type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "Output(int, DeviceType) is only available for Tensor");
+    return outputs_.at(idx)->GetMutableTensor(type);
+  }
+
   template <typename T>
   inline T* Output(int idx, T* allocated) {
     outputs_.at(idx)->Reset(allocated);
@@ -103,11 +137,29 @@ class OperatorBase : public Observable<OperatorBase> {
     return inputs_.at(idx)->template IsType<T>();
   }
 
+  template <typename T>
+  inline bool InputIsType(int idx, DeviceType device_type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "InputIsType(idx, DeviceType) only available on "
+        "Tensor types.");
+    return inputs_.at(idx)->template IsType<T>(device_type);
+  }
+
   template <typename T>
   inline bool OutputIsType(int idx) {
     return outputs_.at(idx)->template IsType<T>();
   }
 
+  template <typename T>
+  inline bool OutputIsType(int idx, DeviceType type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "OutputIsType(idx, DeviceType) only available on "
+        "Tensor types.");
+    return outputs_.at(idx)->template IsType<T>(type);
+  }
+
   inline int InputSize() const {
     return inputs_.size();
   }
@@ -380,11 +432,14 @@ class Operator : public OperatorBase {
   }
   ~Operator() noexcept override {}
 
-  inline const Tensor<Context>& Input(int idx) {
-    return OperatorBase::template Input<Tensor<Context>>(idx);
+  inline const Tensor& Input(
+      int idx,
+      DeviceType type = Context::GetDeviceType()) {
+    return OperatorBase::template Input<Tensor>(idx, type);
   }
-  inline Tensor<Context>* Output(int idx) {
-    return OperatorBase::template Output<Tensor<Context>>(idx);
+
+  inline Tensor* Output(int idx, DeviceType type = Context::GetDeviceType()) {
+    return OperatorBase::template Output<Tensor>(idx, type);
   }
 
   void WaitEvent(const Event& ev, int stream_id = -1) final {
@@ -712,8 +767,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
       return DispatchHelper<TensorTypes<Types...>, ExtraArgs...>::             \
           template call<Op>(op, meta);                                         \
     }                                                                          \
-    template <typename Op, typename Context>                                   \
-    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
+    template <typename Op>                                                     \
+    static bool call(Op* op, const Tensor& tensor) {                           \
       return call<Op>(op, tensor.meta());                                      \
     }                                                                          \
     template <typename Op>                                                     \
@@ -728,8 +783,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
     static bool call(Op* /* unused */, const TypeMeta& meta) {                 \
       CAFFE_THROW("Unsupported type of tensor: ", meta.name());                \
     }                                                                          \
-    template <typename Op, typename Context>                                   \
-    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
+    template <typename Op>                                                     \
+    static bool call(Op* op, const Tensor& tensor) {                           \
       return call<Op>(op, tensor.meta());                                      \
     }                                                                          \
     template <typename Op>                                                     \
@@ -746,8 +801,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
     static bool call(Op* op, const TypeMeta&) {                                \
       return op->template DoRunWithOtherType<ExtraArgs...>();                  \
     }                                                                          \
-    template <typename Op, typename Context>                                   \
-    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
+    template <typename Op>                                                     \
+    static bool call(Op* op, const Tensor& tensor) {                           \
       return call<Op>(op, tensor.meta());                                      \
     }                                                                          \
     template <typename Op>                                                     \
diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc
index fba9c9d56a1c8..1944874437d73 100644
--- a/caffe2/core/plan_executor.cc
+++ b/caffe2/core/plan_executor.cc
@@ -131,8 +131,7 @@ struct WorkspaceIdInjector {
           "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob");
       int32_t global_ws_id = (seq_++) + (static_cast<int32_t>(node_id) << 16);
       Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID);
-      TensorCPU* global_ws_id_tensor =
-          global_ws_id_blob->template GetMutable<TensorCPU>();
+      TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU);
       global_ws_id_tensor->Resize();
       global_ws_id_tensor->template mutable_data<int32_t>()[0] = global_ws_id;
       VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id;
diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc
index 2aaa7a2dac3a3..cb80f90aa02c5 100644
--- a/caffe2/core/predictor.cc
+++ b/caffe2/core/predictor.cc
@@ -14,7 +14,7 @@ void enforceIsTensor(Workspace* ws, const std::string& name) {
   auto blob = ws->GetBlob(name);
   CAFFE_ENFORCE(blob, "Blob does not exist: ", name);
   CAFFE_ENFORCE(
-      blob->template IsType<TensorCPU>(), "Blob is not a CPU Tensor: ", name);
+      blob->template IsType<Tensor>(CPU), "Blob is not a CPU Tensor: ", name);
 }
 
 void shareInputTensor(
@@ -24,7 +24,7 @@ void shareInputTensor(
   enforceIsTensor(ws, name);
   auto* blob = ws->GetBlob(name);
   CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
-  auto* tensor = blob->template GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->ResizeLike(*input);
   tensor->ShareData(*input);
 }
@@ -33,7 +33,7 @@ TensorCPU* extractOutputTensor(Workspace* ws, const std::string& name) {
   enforceIsTensor(ws, name);
   auto* blob = ws->GetBlob(name);
   CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
-  return blob->template GetMutable<TensorCPU>();
+  return blob->GetMutableTensor(CPU);
 }
 
 // We don't use the getNet() from predictor_utils.cc here because that file
@@ -115,7 +115,7 @@ Predictor::Predictor(
   for (const auto& name : predict_net->external_input()) {
     if (!initialized.count(name)) {
       auto* blob = ws_.CreateBlob(name);
-      blob->template GetMutable<TensorCPU>();
+      blob->GetMutableTensor(CPU);
     }
   }
 
diff --git a/caffe2/core/predictor_test.cc b/caffe2/core/predictor_test.cc
index a37dbbb9e8d39..c8c00538eaa6c 100644
--- a/caffe2/core/predictor_test.cc
+++ b/caffe2/core/predictor_test.cc
@@ -135,7 +135,7 @@ std::unique_ptr<Blob> randomTensor(
     const std::vector<TIndex>& dims,
     CPUContext* ctx) {
   auto blob = make_unique<Blob>();
-  auto* t = blob->GetMutable<TensorCPU>();
+  auto* t = blob->GetMutableTensor(CPU);
   t->Resize(dims);
   math::RandUniform<float, CPUContext>(
       t->size(), -1.0, 1.0, t->template mutable_data<float>(), ctx);
@@ -178,7 +178,7 @@ class PredictorTest : public testing::Test {
 
 TEST_F(PredictorTest, SimpleBatchSized) {
   auto inputData = randomTensor({1, 4}, ctx_.get());
-  Predictor::TensorVector input{inputData->template GetMutable<TensorCPU>()};
+  Predictor::TensorVector input{inputData->GetMutableTensor(CPU)};
   Predictor::TensorVector output;
   p_->run(input, &output);
   EXPECT_EQ(output.size(), 1);
@@ -190,8 +190,7 @@ TEST_F(PredictorTest, SimpleBatchSized) {
 
 TEST_F(PredictorTest, SimpleBatchSizedMapInput) {
   auto inputData = randomTensor({1, 4}, ctx_.get());
-  Predictor::TensorMap input{
-      {"data", inputData->template GetMutable<TensorCPU>()}};
+  Predictor::TensorMap input{{"data", inputData->GetMutableTensor(CPU)}};
   Predictor::TensorVector output;
   p_->run_map(input, &output);
   EXPECT_EQ(output.size(), 1);
@@ -216,8 +215,7 @@ class PredictorMetaNetDefTest : public testing::Test {
 
 TEST_F(PredictorMetaNetDefTest, SimpleMetaNetDefInitializer) {
   auto inputData = randomTensor({1, 4}, ctx_.get());
-  Predictor::TensorMap input{
-      {"data", inputData->template GetMutable<TensorCPU>()}};
+  Predictor::TensorMap input{{"data", inputData->GetMutableTensor(CPU)}};
   Predictor::TensorVector output;
   p_->run_map(input, &output);
   EXPECT_EQ(output.size(), 1);
diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc
index 1f115e14f6715..83e907be0e263 100644
--- a/caffe2/core/tensor.cc
+++ b/caffe2/core/tensor.cc
@@ -43,9 +43,33 @@ TensorPrinter::~TensorPrinter() {
   }
 }
 
-static CaffeMap<CaffeTypeId, TypeCall> type_call_registry_ {
-  {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorType<CPUContext>}
-};
+void TensorPrinter::PrintMeta(const Tensor& tensor) {
+  if (to_file_) {
+    (*log_file_) << MetaStr(tensor) << std::endl;
+  } else {
+    LOG(INFO) << MetaStr(tensor);
+  }
+}
+
+std::string TensorPrinter::MetaStr(const Tensor& tensor) {
+  std::stringstream meta_stream;
+  meta_stream << "Tensor " << tensor_name_ << " of type "
+              << tensor.meta().name() << ". Dims: (";
+  for (const auto dim : tensor.dims()) {
+    meta_stream << dim << ",";
+  }
+  meta_stream << "): ";
+  return meta_stream.str();
+}
+
+TypeMeta GetTensorType(const void* c) {
+  const Tensor* tc = static_cast<const Tensor*>(c);
+  return tc->meta();
+}
+
+// TODO(jerryzh): Remove
+static CaffeMap<CaffeTypeId, TypeCall> type_call_registry_{
+    {TypeMeta::Id<Tensor>(), GetTensorType}};
 
 TypeCall GetTypeCallFunction(CaffeTypeId id) {
   auto f = type_call_registry_.find(id);
@@ -59,9 +83,26 @@ void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c) {
   type_call_registry_[id] = c;
 }
 
+int GetGPUIDForPointer(const void* ptr);
+
+vector<TIndex> GetTensorInfo(
+    const void* c,
+    bool* shares_data,
+    size_t* capacity,
+    DeviceOption* device) {
+  const Tensor* tc = static_cast<const Tensor*>(c);
+  *shares_data = tc->shares_data();
+  *capacity = tc->capacity_nbytes();
+  tc->ExtractDeviceOption(device);
+  return tc->dims();
+}
+
+// since we only have one tensor, probably need to remove this at some point?
 static CaffeMap<CaffeTypeId, TensorInfoCall> tensor_info_call_registry_{
-    {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorInfo<CPUContext>}};
+    {TypeMeta::Id<Tensor>(), GetTensorInfo}};
 
+// TODO: Remove this code in a separate diff, since we only have one
+// GetTensorInfo function now
 TensorInfoCall GetTensorInfoFunction(CaffeTypeId id) {
   auto f = tensor_info_call_registry_.find(id);
   if (f == tensor_info_call_registry_.end()) {
@@ -74,11 +115,20 @@ void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c) {
   tensor_info_call_registry_[id] = c;
 }
 
+void TensorVectorResize(std::vector<Tensor>& tensors,
+                        int size,
+                        DeviceType type) {
+  tensors.reserve(size);
+  for (auto i = 0; i < size; ++i) {
+    tensors.emplace_back(type);
+  }
+}
+
 namespace {
 
-struct TensorCPUStatGetter : BlobStatGetter {
+struct TensorStatGetter : BlobStatGetter {
   size_t sizeBytes(const Blob& blob) const override {
-    const auto& tensor = blob.Get<TensorCPU>();
+    const auto& tensor = blob.Get<Tensor>();
     auto nbytes = tensor.nbytes();
     if (nbytes > 0 && tensor.IsType<std::string>()) {
       const auto* data = tensor.data<std::string>();
@@ -89,7 +139,7 @@ struct TensorCPUStatGetter : BlobStatGetter {
     return nbytes;
   }
 };
-REGISTER_BLOB_STAT_GETTER(TensorCPU, TensorCPUStatGetter);
+REGISTER_BLOB_STAT_GETTER(Tensor, TensorStatGetter);
 }
 
 } // namespace caffe2
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index a0a170505acec..2c150d6d0d55a 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -89,13 +89,10 @@ inline int canonical_axis_index_(int axis_index, int ndims) {
  * the allocation and de-allocation of such memory. We make a simplified
  * assumption that the memory is always contiguous.
  */
-template <class Context>
 class Tensor {
  public:
-  /**
-   * Initializes an empty tensor.
-   */
-  Tensor() {}
+  Tensor() = delete;
+  explicit Tensor(DeviceType type) : device_type_(type) {}
 
   /**
    * @brief Creates a tensor of the given dimension.
@@ -103,67 +100,86 @@ class Tensor {
    * Note that the actual data allocation is not going to be carried out until
    * the first time mutable_data() is called.
    */
-  explicit Tensor(const vector<TIndex>& dims) { Resize(dims); }
-  explicit Tensor(const vector<int>& dims) { Resize(dims); }
+  explicit Tensor(const vector<TIndex>& dims, DeviceType type)
+      : device_type_(type) {
+    Resize(dims);
+  }
+  explicit Tensor(const vector<int>& dims, DeviceType type)
+      : device_type_(type) {
+    Resize(dims);
+  }
 
-  /**
-   * @brief Creates a tensor from a source tensor, copying over the content.
-   *
-   * Note that the source tensor can be from a different device context. The
-   * second argument provides a device context object (either Context or
-   * SrcContext) that will be responsible for copying the underlying data.
-   * If you do not wish to pass in a Context object, an equivalent constructor
-   * function exists that will create an implicit context object for copy, but
-   * be noted that this will cause a potential performance hit.
+  /* Now we require that context_for_copy has the same device type as src since template
+   * is removed
    */
-  template <class SrcContext, class ContextForCopy>
-  Tensor(const Tensor<SrcContext>& src, ContextForCopy* context) {
-    CopyFrom(src, context);
+  Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) : device_type_(type) {
+    CopyFrom(src, context_for_copy);
   }
 
   /**
-   * @brief Creates a tensor from a source tensor, copying over the content.
-   *
-   * Note that this may have a potential performance hit, since a temporary
-   * context object will be created for the memory copy. Prefer explicitly
-   * providing a context for copy if you can.
-   *
-   * Since it's a potentially expensive operation - making copy constructor
-   * explicit here. If SrcContext != Context it's actually a typecast
-   * constructor and it should be definitely explicit.
+   * @brief: Create a Tensor of DeviceType `type` and initialize it with
+   * src Tensor
    */
-  template <class SrcContext>
-  explicit Tensor(const Tensor<SrcContext>& src) {
+  Tensor(const Tensor& src, DeviceType type) : device_type_(type) {
     CopyFrom(src);
   }
 
   /**
    * @brief Creates a tensor, and fills its contents with the given values.
+   * The type of tensor will be decided by the context parameter
    */
   template <typename T>
-  Tensor(const vector<TIndex>& dims, const vector<T>& values, Context* context)
+  Tensor(
+      const vector<TIndex>& dims,
+      const vector<T>& values,
+      BaseContext* context)
       : meta_(TypeMeta::Make<T>()) {
     Resize(dims);
     CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size_);
-    context->template Copy<T, CPUContext, Context>(size_, values.data(), mutable_data<T>());
+    device_type_ = context->GetDevicetype();
+    context->CopyItemsFromCPU(meta_, size_, values.data(), mutable_data<T>());
   }
 
   /**
    * @brief Creates a scalar tensor, and fills its content with the given value.
+   * The type of tensor will be decided by the context parameter
    */
-  template <typename T,
-            typename = typename std::enable_if<std::is_scalar<T>::value>::type>
-  Tensor(const T& value, Context* context) {
+  template <
+      typename T,
+      typename = typename std::enable_if<std::is_scalar<T>::value>::type>
+  Tensor(const T& value, BaseContext* context) : meta_(TypeMeta::Make<T>()) {
     Resize(vector<TIndex>{});
-    context->template Copy<T, CPUContext, Context>(size_, &value, mutable_data<T>());
+    device_type_ = context->GetDevicetype();
+    context->CopyItemsFromCPU(meta_, size_, &value, mutable_data<T>());
+  }
+
+  /*
+   * Since we removed template from tensor, we now store a static
+   * context pointer in tensor, which indicates the type of the tensor.
+   */
+  BaseStaticContext* GetStaticContext() const {
+    return GET_STATIC_CONTEXT(device_type_);
+  }
+
+  /* @brief
+   * Create a context that has the same device_type
+   * as the tensor.
+   * Note that this doesn't support passing in argument
+   * TODO(jerryzh): move this to a global registry
+   * that can create context for us
+   */
+  std::unique_ptr<BaseContext> CreateContext() const {
+    return GetStaticContext()->CreateContext();
   }
 
+  DeviceType GetDeviceType() const {
+    return device_type_;
+  }
   /**
    * @brief Copies the data from a source tensor, with a contex provided to
    * carry out the underlying memcpy operation.
    */
-  template <class SrcContext, class ContextForCopy>
-  void CopyFrom(const Tensor<SrcContext>& src, ContextForCopy* context) {
+  void CopyFrom(const Tensor& src, BaseContext* context = nullptr) {
     if ((void*)&src == (void*)this) {
       return;
     }
@@ -180,27 +196,39 @@ class Tensor {
     Resize(src.dims());
     if (size() > 0) {
       if (meta_.copy()) {
+        CAFFE_ENFORCE(
+            GetDeviceType() == CPU,
+            "In CopyFrom source and dest tensors must both be CPU for meta copy");
+        CAFFE_ENFORCE(
+            src.GetDeviceType() == CPU,
+            "In CopyFrom source and dest tensors must both be CPU for meta copy");
         meta_.copy()(src.raw_data(), raw_mutable_data(), size());
       } else {
-        context->template CopyBytes<SrcContext, Context>(
-            nbytes(), src.raw_data(), raw_mutable_data());
+        // We'll need to use a non-CPU context to perform the copy if
+        // one of the context is not CPU since only non-CPU context
+        // knows how to copy between CPU and that context
+        if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) {
+          if (!context) {
+            src.CreateContext().get()->CopyBytesToDevice(
+                nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
+          } else {
+            CAFFE_ENFORCE(
+                context->GetDevicetype() == src.GetDeviceType(),
+                "Type for provided context does not match the type of source");
+            context->CopyBytesToDevice(
+                nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
+          }
+        } else {
+          // In case source context is CPU, and target context is non-CPU
+          // We'll have to create a Context from target and perform the
+          // copy using that context
+          CreateContext().get()->CopyBytesFromCPU(
+              nbytes(), src.raw_data(), raw_mutable_data());
+        }
       }
     }
   }
 
-  /**
-   * @brief Copies the data from a source tensor.
-   *
-   * Note that this may have a potential performance hit, since a temporary
-   * context object will be created for the memory copy. Prefer explicitly
-   * providing a context for copy if you can.
-   */
-  template <class SrcContext>
-  inline void CopyFrom(const Tensor<SrcContext>& src) {
-    SrcContext tmp_context;
-    CopyFrom(src, &tmp_context);
-  }
-
   virtual ~Tensor() noexcept {}
 
   /**
@@ -212,8 +240,7 @@ class Tensor {
    * growthPct. This ensures that Extend runs on an amortized O(1) time
    * complexity.
    */
-  template <class ContextForCopy>
-  void Extend(TIndex num, float growthPct, ContextForCopy* context) {
+  void Extend(TIndex num, float growthPct, BaseContext* context) {
     CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
     auto newDims = dims_;
     newDims[0] += num;
@@ -239,8 +266,8 @@ class Tensor {
     size_ = newSize;
   }
 
-  template <class T, class ContextForCopy>
-  void Reserve(const std::vector<T>& newCapacity, ContextForCopy* context) {
+  template <class T>
+  void Reserve(const std::vector<T>& newCapacity, BaseContext* context) {
     auto newSize = std::accumulate(
         newCapacity.begin(),
         newCapacity.end(),
@@ -254,8 +281,7 @@ class Tensor {
     auto oldDims = dims_;
     Resize(newCapacity);
     auto* newData = raw_mutable_data(meta_);
-    context->template CopyItems<ContextForCopy, ContextForCopy>(
-        meta_, oldSize, oldData.get(), newData);
+    context->CopyItemsSameDevice(meta_, oldSize, oldData.get(), newData);
     dims_ = oldDims;
     size_ = oldSize;
     reserved_ = true;
@@ -320,8 +346,7 @@ class Tensor {
    * Resize the tensor like the source tensor. Note that this is just a
    * sugar wrapper that essentially calls Resize(src_tensor.dims()).
    */
-  template <class OtherContext>
-  inline void ResizeLike(const Tensor<OtherContext>& src_tensor) {
+  inline void ResizeLike(const Tensor& src_tensor) {
     // Note: need casting for different context types.
     if (static_cast<void*>(this) != static_cast<const void*>(&src_tensor)) {
       Resize(src_tensor.dims());
@@ -384,7 +409,7 @@ class Tensor {
     return ss.str();
   }
 
-  void swap(Tensor<Context>& other) {
+  void swap(Tensor& other) noexcept {
     std::swap(dims_, other.dims_);
     std::swap(size_, other.size_);
     std::swap(meta_, other.meta_);
@@ -392,6 +417,7 @@ class Tensor {
     std::swap(shares_data_, other.shares_data_);
     std::swap(capacity_, other.capacity_);
     std::swap(reserved_, other.reserved_);
+    std::swap(device_type_, other.device_type_);
   }
 
   /**
@@ -542,7 +568,8 @@ class Tensor {
         // destruction procedure.
         auto size = size_;
         auto dtor = meta_.dtor();
-        auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
+        auto ptr_and_deleter =
+            GetStaticContext()->New(size_ * meta_.itemsize());
         auto deleter = ptr_and_deleter.second;
         data_.reset(
             ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void {
@@ -552,7 +579,8 @@ class Tensor {
         meta_.ctor()(data_.get(), size_);
       } else {
         // For fundamental type, new and delete is easier.
-        auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
+        auto ptr_and_deleter =
+            GetStaticContext()->New(size_ * meta_.itemsize());
         data_.reset(ptr_and_deleter.first, ptr_and_deleter.second);
       }
       capacity_ = size_ * meta_.itemsize();
@@ -690,20 +718,28 @@ class Tensor {
     return dims_[i];
   }
 
+  // We don't allow change to the type of
+  // tensor after initialization
   Tensor Clone() const {
-    Tensor x;
+    Tensor x(GetDeviceType());
     x.CopyFrom(*this);
     return x;
   }
 
-  Tensor(Tensor<Context>&& src) noexcept {
+  Tensor(Tensor&& src) noexcept {
     swap(src);
   }
 
+  Tensor& operator=(Tensor&&) = default;
+
   /**
    * @brief Delete the copy constructor and use Clone explicitly
    */
-  Tensor(const Tensor<Context>& src) = delete;
+  Tensor(const Tensor& src) = delete;
+
+  void ExtractDeviceOption(DeviceOption* device) const {
+    GetStaticContext()->ExtractDeviceOption(device, raw_data());
+  }
 
  protected:
   vector<TIndex> dims_;
@@ -713,6 +749,7 @@ class Tensor {
   bool shares_data_ = false;
   size_t capacity_ = 0;
   bool reserved_ = false;
+  DeviceType device_type_ = CPU;
   // In case of chunk load we store how much data was already loaded
 
  private:
@@ -785,8 +822,7 @@ class Tensor {
   Tensor& operator=(const Tensor& src) = delete;
 };
 
-// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
-typedef Tensor<CPUContext> TensorCPU;
+using TensorCPU = Tensor;
 
 constexpr int k_limit_default_ = 1000;
 
@@ -795,12 +831,6 @@ typedef TypeMeta (*TypeCall)(const void*);
 TypeCall GetTypeCallFunction(CaffeTypeId id);
 void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c);
 
-template <class Context>
-TypeMeta GetTensorType(const void* c) {
-  const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
-  return tc->meta();
-}
-
 // Shape call registry
 typedef vector<TIndex> (*TensorInfoCall)(
     const void*,
@@ -810,19 +840,8 @@ typedef vector<TIndex> (*TensorInfoCall)(
 TensorInfoCall GetTensorInfoFunction(CaffeTypeId id);
 void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c);
 
-template <class Context>
-vector<TIndex> GetTensorInfo(
-    const void* c,
-    bool* shares_data,
-    size_t* capacity,
-    DeviceOption* device) {
-  const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
-  *shares_data = tc->shares_data();
-  *capacity = tc->capacity_nbytes();
-  device->set_device_type(CPU);
-  device->set_cuda_gpu_id(0);
-  return tc->dims();
-}
+// resize helper function
+void TensorVectorResize(std::vector<Tensor>& tensors, int size, DeviceType type);
 
 class TensorPrinter {
  public:
@@ -833,13 +852,11 @@ class TensorPrinter {
   ~TensorPrinter();
 
   template <class T>
-  void Print(const Tensor<CPUContext>& tensor);
+  void Print(const Tensor& tensor);
 
-  template <class Context>
-  void PrintMeta(const Tensor<Context>& tensor);
+  void PrintMeta(const Tensor& tensor);
 
-  template <class Context>
-  string MetaStr(const Tensor<Context>& tensor);
+  string MetaStr(const Tensor& tensor);
 
  private:
   bool to_file_;
@@ -849,7 +866,7 @@ class TensorPrinter {
 };
 
 template <class T>
-void TensorPrinter::Print(const Tensor<CPUContext>& tensor) {
+void TensorPrinter::Print(const Tensor& tensor) {
   std::stringstream values_stream;
   // One most likely doesn't want to print int64-number of items for visual
   // inspection, so we cast down to int here.
@@ -869,26 +886,5 @@ void TensorPrinter::Print(const Tensor<CPUContext>& tensor) {
   }
 }
 
-template <class Context>
-void TensorPrinter::PrintMeta(const Tensor<Context>& tensor) {
-  if (to_file_) {
-    (*log_file_) << MetaStr(tensor) << std::endl;
-  } else {
-    LOG(INFO) << MetaStr(tensor);
-  }
-}
-
-template <class Context>
-std::string TensorPrinter::MetaStr(const Tensor<Context>& tensor) {
-  std::stringstream meta_stream;
-  meta_stream << "Tensor " << tensor_name_ << " of type "
-              << tensor.meta().name() << ". Dims: (";
-  for (const auto dim : tensor.dims()) {
-    meta_stream << dim << ",";
-  }
-  meta_stream << "): ";
-  return meta_stream.str();
-}
-
 }  // namespace caffe2
 #endif  // CAFFE2_CORE_TENSOR_H_
diff --git a/caffe2/core/tensor_int8.h b/caffe2/core/tensor_int8.h
index 93efe66a79de3..ec7d2aaa618a3 100644
--- a/caffe2/core/tensor_int8.h
+++ b/caffe2/core/tensor_int8.h
@@ -3,6 +3,7 @@
 
 #include "caffe2/core/context.h"
 #include "caffe2/core/tensor.h"
+#include "caffe2/proto/caffe2.pb.h"
 
 namespace caffe2 {
 namespace int8 {
@@ -12,7 +13,7 @@ struct Int8TensorCPU {
   int32_t zero_point{0};
   // Generally stores uint8_t data, but sometimes int32_t (e.g. bias
   // parameters).
-  TensorCPU t;
+  Tensor t{CPU};
 };
 } // namespace int8
 } // namespace caffe2
diff --git a/caffe2/core/typeid.cc b/caffe2/core/typeid.cc
index 2e1214656479b..ba81e3babc6d0 100644
--- a/caffe2/core/typeid.cc
+++ b/caffe2/core/typeid.cc
@@ -69,8 +69,7 @@ CaffeTypeId CaffeTypeId::createTypeId() {
   return CaffeTypeId(new_value);
 }
 
-CAFFE_DEFINE_KNOWN_TYPE(Tensor<CPUContext>);
-CAFFE_DEFINE_KNOWN_TYPE(Tensor<CUDAContext>);
+CAFFE_DEFINE_KNOWN_TYPE(Tensor);
 CAFFE_DEFINE_KNOWN_TYPE(float);
 CAFFE_DEFINE_KNOWN_TYPE(int);
 CAFFE_DEFINE_KNOWN_TYPE(std::string);
diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h
index 6a497861af8e3..b4a01b57cc11e 100644
--- a/caffe2/core/typeid.h
+++ b/caffe2/core/typeid.h
@@ -437,41 +437,37 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
       #T);                                                     \
   }
 
-template <class Context>
 class Tensor;
-class CPUContext;
-class CUDAContext;
 
 // note: first preallocated id is 1, because 0 is used for uninitialized type
 // ids.
 struct _CaffeHighestPreallocatedTypeId final {};
 
-CAFFE_DECLARE_KNOWN_TYPE(1, Tensor<CPUContext>);
-CAFFE_DECLARE_KNOWN_TYPE(2, Tensor<CUDAContext>);
-CAFFE_DECLARE_KNOWN_TYPE(3, float);
-CAFFE_DECLARE_KNOWN_TYPE(4, int);
-CAFFE_DECLARE_KNOWN_TYPE(5, std::string);
-CAFFE_DECLARE_KNOWN_TYPE(6, bool);
-CAFFE_DECLARE_KNOWN_TYPE(7, uint8_t);
-CAFFE_DECLARE_KNOWN_TYPE(8, int8_t);
-CAFFE_DECLARE_KNOWN_TYPE(9, uint16_t);
-CAFFE_DECLARE_KNOWN_TYPE(10, int16_t);
-CAFFE_DECLARE_KNOWN_TYPE(11, int64_t);
-CAFFE_DECLARE_KNOWN_TYPE(12, double);
-CAFFE_DECLARE_KNOWN_TYPE(13, char);
-CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::mutex>);
-CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr<std::atomic<bool>>);
-CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int32_t>);
-CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<int64_t>);
-CAFFE_DECLARE_KNOWN_TYPE(18, std::vector<unsigned long>);
-CAFFE_DECLARE_KNOWN_TYPE(19, bool*);
-CAFFE_DECLARE_KNOWN_TYPE(20, char*);
-CAFFE_DECLARE_KNOWN_TYPE(21, int*);
+CAFFE_DECLARE_KNOWN_TYPE(1, Tensor);
+CAFFE_DECLARE_KNOWN_TYPE(2, float);
+CAFFE_DECLARE_KNOWN_TYPE(3, int);
+CAFFE_DECLARE_KNOWN_TYPE(4, std::string);
+CAFFE_DECLARE_KNOWN_TYPE(5, bool);
+CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t);
+CAFFE_DECLARE_KNOWN_TYPE(7, int8_t);
+CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t);
+CAFFE_DECLARE_KNOWN_TYPE(9, int16_t);
+CAFFE_DECLARE_KNOWN_TYPE(10, int64_t);
+CAFFE_DECLARE_KNOWN_TYPE(11, double);
+CAFFE_DECLARE_KNOWN_TYPE(12, char);
+CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr<std::mutex>);
+CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::atomic<bool>>);
+CAFFE_DECLARE_KNOWN_TYPE(15, std::vector<int32_t>);
+CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int64_t>);
+CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<unsigned long>);
+CAFFE_DECLARE_KNOWN_TYPE(18, bool*);
+CAFFE_DECLARE_KNOWN_TYPE(19, char*);
+CAFFE_DECLARE_KNOWN_TYPE(20, int*);
 
 #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
-CAFFE_DECLARE_KNOWN_TYPE(22, long);
-CAFFE_DECLARE_KNOWN_TYPE(23, std::vector<long>);
+CAFFE_DECLARE_KNOWN_TYPE(21, long);
+CAFFE_DECLARE_KNOWN_TYPE(22, std::vector<long>);
 #endif // CAFFE2_UNIQUE_LONG_TYPEMETA
 
-CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId);
+CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId);
 }
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h
index a593604114d3e..4a759b8703dc4 100644
--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@@ -136,14 +136,14 @@ class Workspace {
       auto* from_blob = parent_ws->GetBlob(ws_blob.second);
       CAFFE_ENFORCE(from_blob);
       CAFFE_ENFORCE(
-          from_blob->template IsType<Tensor<Context>>(),
+          from_blob->template IsType<Tensor>(),
           "Expected blob with tensor value",
           ws_blob.second);
       forwarded_blobs_.erase(blob);
       auto* to_blob = CreateBlob(blob);
       CAFFE_ENFORCE(to_blob);
-      const auto& from_tensor = from_blob->template Get<Tensor<Context>>();
-      auto* to_tensor = to_blob->template GetMutable<Tensor<Context>>();
+      const auto& from_tensor = from_blob->template Get<Tensor>();
+      auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType());
       to_tensor->CopyFrom(from_tensor);
     }
   }
diff --git a/caffe2/experiments/operators/fully_connected_op_decomposition.h b/caffe2/experiments/operators/fully_connected_op_decomposition.h
index f06877f188b8e..ae6a228684670 100644
--- a/caffe2/experiments/operators/fully_connected_op_decomposition.h
+++ b/caffe2/experiments/operators/fully_connected_op_decomposition.h
@@ -100,8 +100,8 @@ class FullyConnectedOpDecomp final : public Operator<Context> {
   }
 
  protected:
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> multi_buffer_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor multi_buffer_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context, class Engine=DefaultEngine>
@@ -207,10 +207,10 @@ class FullyConnectedDecompGradientOp : public Operator<Context> {
   }
 
  protected:
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> du_buffer_;
-  Tensor<Context> dv_buffer_;
-  Tensor<Context> dx_buffer_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor du_buffer_{Context::GetDeviceType()};
+  Tensor dv_buffer_{Context::GetDeviceType()};
+  Tensor dx_buffer_{Context::GetDeviceType()};
 };
 
 }  // namespace caffe2
diff --git a/caffe2/experiments/operators/fully_connected_op_prune.h b/caffe2/experiments/operators/fully_connected_op_prune.h
index 5b9508f26f2b5..3f6c24ef4a3f0 100644
--- a/caffe2/experiments/operators/fully_connected_op_prune.h
+++ b/caffe2/experiments/operators/fully_connected_op_prune.h
@@ -189,7 +189,7 @@ namespace caffe2 {
         }
 
       protected:
-        Tensor<Context> bias_multiplier_;
+       Tensor bias_multiplier_{Context::GetDeviceType()};
     };
 
   template <typename T, class Context, class Engine=DefaultEngine>
@@ -343,9 +343,9 @@ namespace caffe2 {
         }
 
       protected:
-        Tensor<Context> bias_multiplier_;
-        Tensor<Context> sum_buffer_;
-        Tensor<Context> comp_r_buf_;
+       Tensor bias_multiplier_{Context::GetDeviceType()};
+       Tensor sum_buffer_{Context::GetDeviceType()};
+       Tensor comp_r_buf_{Context::GetDeviceType()};
     };
 
 }  // namespace caffe2
diff --git a/caffe2/experiments/operators/fully_connected_op_sparse.h b/caffe2/experiments/operators/fully_connected_op_sparse.h
index a5abe18a07484..6f19c1bacdc5b 100644
--- a/caffe2/experiments/operators/fully_connected_op_sparse.h
+++ b/caffe2/experiments/operators/fully_connected_op_sparse.h
@@ -106,7 +106,7 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
     const auto& jw = Input(3);
     // Notice that we do not need to transpose b
     const auto& b = Input(4);
-    auto* Yt = Output(0); //transposed Y
+    auto* Yt = Output(0); // transposed Y
     // here we assume X is k-by-m
     CAFFE_ENFORCE_EQ(Xt.ndim(), 2);
     CAFFE_ENFORCE_EQ(b.ndim(), 1);
@@ -140,7 +140,7 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
   }
 
  protected:
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
 };
 
 
diff --git a/caffe2/experiments/operators/sparse_matrix_reshape_op.h b/caffe2/experiments/operators/sparse_matrix_reshape_op.h
index 8c8d51c4ed01d..b6686ade1aabd 100644
--- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h
+++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h
@@ -104,7 +104,6 @@ class SparseMatrixReshapeOp : public Operator<Context> {
     CAFFE_ENFORCE(
         old_row.size() == nnz,
         "Column and row tensors must have the same size.");
-
     auto* new_col = Output(0);
     auto* new_row = Output(1);
     new_col->Resize(nnz);
diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc
index f589185caa0f3..eb2d5b6acf1a6 100644
--- a/caffe2/ideep/operators/concat_split_op.cc
+++ b/caffe2/ideep/operators/concat_split_op.cc
@@ -27,7 +27,7 @@ class IDEEPConcatOp final : public IDEEPOperator {
   bool RunOnDevice() override {
     const auto& input_zero = Input(INPUT0);
     auto* output = Output(OUTPUT);
-    TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO);
+    TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO, CPU);
 
     vector<itensor> inputs;
     for (int i = 0; i < InputSize(); ++i) {
@@ -88,7 +88,7 @@ class IDEEPSplitOp final : public IDEEPOperator {
           0,
           "If you set split with an input blob, do not pass in "
           "split in the argument.");
-      auto& axis_info = OperatorBase::Input<TensorCPU>(AXIS_INFO);
+      auto& axis_info = OperatorBase::Input<Tensor>(AXIS_INFO, CPU);
       CAFFE_ENFORCE_EQ(axis_info.size(), OutputSize());
       auto* axis_data = axis_info.template data<int>();
       axis_vdata.assign(axis_data, axis_data + OutputSize());
diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h
index 44eb9c7a430a8..ad39e641ed933 100644
--- a/caffe2/ideep/operators/operator_fallback_ideep.h
+++ b/caffe2/ideep/operators/operator_fallback_ideep.h
@@ -74,7 +74,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
     for (int i = 0; i < InputSize(); ++i) {
       if (InputIsType<itensor>(i) && Input(i).get_data_type() == itensor::data_type::f32) {
         auto& input = Input(i);
-        auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
+        auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
         dtensor->Resize(input.get_dims());
         if (input.is_public_format()) {
           dtensor->ShareExternalPointer(static_cast<float*>(input.get_data_handle()));
@@ -85,7 +85,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
           InputIsType<itensor>(i) &&
           Input(i).get_data_type() == itensor::data_type::s32) {
         auto& input = Input(i);
-        auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
+        auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
         dtensor->Resize(input.get_dims());
         if (input.is_public_format()) {
           dtensor->ShareExternalPointer(
@@ -138,8 +138,8 @@ class IDEEPFallbackOp final : public IDEEPOperator {
         VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
         auto src_dims = src.dims();
         Blob* dst = OperatorBase::OutputBlob(i);
-        dst->Reset(new Tensor<CPUContext>());
-        auto dtensor = dst->template GetMutable<TensorCPU>();
+        dst->Reset(new Tensor(CPU));
+        auto dtensor = dst->GetMutableTensor(CPU);
         dtensor->Resize(src_dims);
         dtensor->ShareData(src);
       }
@@ -156,4 +156,3 @@ class IDEEPFallbackOp final : public IDEEPOperator {
 };
 
 } // namespace caffe2
-
diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc
index 67d7d2ca2d732..194b949222bea 100644
--- a/caffe2/ideep/operators/utility_ops.cc
+++ b/caffe2/ideep/operators/utility_ops.cc
@@ -10,7 +10,7 @@ class CopyCPUToIDEEPOp final : public IDEEPOperator {
   USE_IDEEP_DEF_ALIASES();
 
   bool RunOnDevice() override {
-    const auto& X = OperatorBase::Input<TensorCPU>(0);
+    const auto& X = OperatorBase::Input<Tensor>(0, CPU);
     auto* Y = OperatorBase::OutputBlob(0);
     itensor::dims src_dims(X.dims().begin(), X.dims().end());
     if (!(Y->template IsType<itensor>() &&
@@ -31,14 +31,14 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator {
   USE_IDEEP_DEF_ALIASES();
   bool RunOnDevice() override {
     const auto& input_blob = OperatorBase::InputBlob(0);
-    if (input_blob.template IsType<TensorCPU>()) {
+    if (input_blob.template IsType<Tensor>(CPU)) {
       VLOG(2) << "Directing sharing of TensorCPU";
       const auto& X = OperatorBase::Input<TensorCPU>(0);
-      auto* Y = OperatorBase::Output<TensorCPU>(0);
+      auto* Y = OperatorBase::Output<Tensor>(0, CPU);
       Y->CopyFrom(X);
     } else {
       const auto& X = OperatorBase::Input<itensor>(0);
-      auto* Y = OperatorBase::Output<TensorCPU>(0);
+      auto* Y = OperatorBase::Output<Tensor>(0, CPU);
       Y->Resize(X.get_dims());
       if (X.get_data_type() == itensor::data_type::f32) {
         X.reorder_to(Y->template mutable_data<float>());
diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h
index 200d98fe08609..38885af44c8a8 100644
--- a/caffe2/ideep/utils/ideep_context.h
+++ b/caffe2/ideep/utils/ideep_context.h
@@ -8,7 +8,9 @@
 
 namespace caffe2 {
 
-class IDEEPContext final {
+BaseStaticContext* GetIDEEPStaticContext();
+
+class IDEEPContext final : public BaseContext {
  public:
   typedef std::mt19937 rand_gen_type;
   IDEEPContext() : random_seed_(RandomNumberSeed()) {}
@@ -21,11 +23,17 @@ class IDEEPContext final {
 
   ~IDEEPContext() noexcept {}
 
-  inline void SwitchToDevice(int /*stream_id*/) {}
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
+  BaseStaticContext* GetStaticContext() const override {
+    return GetIDEEPStaticContext();
   }
 
+  static BaseStaticContext* StaticContext() {
+    return GetIDEEPStaticContext();
+  }
+
+  inline void SwitchToDevice(int /*stream_id*/) {}
+  using BaseContext::SwitchToDevice;
+
   inline void WaitEvent(const Event& ev) {
     ev.Wait(IDEEP, this);
   }
@@ -46,7 +54,32 @@ class IDEEPContext final {
   }
 
   inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return GetCPUAllocator()->New(nbytes);
+    return StaticContext()->New(nbytes);
+  }
+
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
+      override {
+    if (nbytes == 0) {
+      return;
+    }
+    CAFFE_ENFORCE(src);
+    CAFFE_ENFORCE(dst);
+    memcpy(dst, src, nbytes);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  bool SupportsNonFundamentalTypes() const override {
+    // IDEEP meta copy is OK
+    return true;
   }
 
   // Two copy functions that deals with cross-device copies.
@@ -89,6 +122,14 @@ class IDEEPContext final {
     return true;
   }
 
+  DeviceType GetDevicetype() const override {
+    return IDEEP;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return IDEEP;
+  }
+
  protected:
   // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
   int random_seed_{1701};
@@ -133,4 +174,25 @@ inline void IDEEPContext::CopyBytes<IDEEPContext, CPUContext>(
   CAFFE_ENFORCE(dst);
   memcpy(dst, src, nbytes);
 }
+
+class IDEEPStaticContext : public BaseStaticContext {
+ public:
+  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
+    return GetCPUAllocator()->New(nbytes);
+  }
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<IDEEPContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<IDEEPContext>(option);
+  }
+
+  DeviceType GetDeviceType() override {
+    return IDEEP;
+  }
+};
+
 } // namespace caffe2
diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc
index 45335e9659d48..c9c22387de4aa 100644
--- a/caffe2/ideep/utils/ideep_register.cc
+++ b/caffe2/ideep/utils/ideep_register.cc
@@ -1,7 +1,8 @@
-#include <ideep_pin_singletons.hpp>
+#include <caffe2/core/event_cpu.h>
 #include <caffe2/core/operator.h>
 #include <caffe2/proto/caffe2.pb.h>
-#include <caffe2/core/event_cpu.h>
+#include <ideep_pin_singletons.hpp>
+#include "ideep_context.h"
 
 namespace caffe2 {
 
@@ -26,4 +27,11 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU);
 REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU);
 REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU);
 
+BaseStaticContext* GetIDEEPStaticContext() {
+  static IDEEPStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(IDEEP, GetIDEEPStaticContext());
+
 } // namespace caffe2
diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h
index a8c45ca87d46a..9dae032134bc0 100644
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@@ -87,12 +87,12 @@ class ImageInputOp final
   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
   CPUContext cpu_context_;
-  TensorCPU prefetched_image_;
-  TensorCPU prefetched_label_;
+  Tensor prefetched_image_{CPU};
+  Tensor prefetched_label_{CPU};
   vector<TensorCPU> prefetched_additional_outputs_;
-  Tensor<Context> prefetched_image_on_device_;
-  Tensor<Context> prefetched_label_on_device_;
-  vector<Tensor<Context>> prefetched_additional_outputs_on_device_;
+  Tensor prefetched_image_on_device_{Context::GetDeviceType()};
+  Tensor prefetched_label_on_device_{Context::GetDeviceType()};
+  vector<Tensor> prefetched_additional_outputs_on_device_;
   // Default parameters for images
   PerImageArg default_arg_;
   int batch_size_;
@@ -118,8 +118,8 @@ class ImageInputOp final
   int crop_;
   std::vector<float> mean_;
   std::vector<float> std_;
-  Tensor<Context> mean_gpu_;
-  Tensor<Context> std_gpu_;
+  Tensor mean_gpu_{Context::GetDeviceType()};
+  Tensor std_gpu_{Context::GetDeviceType()};
   bool mirror_;
   bool is_test_;
   bool use_caffe_datum_;
@@ -154,8 +154,6 @@ ImageInputOp<Context>::ImageInputOp(
     Workspace* ws)
     : PrefetchOperator<Context>(operator_def, ws),
       reader_(nullptr),
-      prefetched_additional_outputs_(OutputSize() - 2),
-      prefetched_additional_outputs_on_device_(OutputSize() - 2),
       batch_size_(
           OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
       label_type_(static_cast<LABEL_TYPE>(
@@ -385,6 +383,9 @@ ImageInputOp<Context>::ImageInputOp(
   }
 
   for (int i = 0; i < additional_output_sizes.size(); ++i) {
+    prefetched_additional_outputs_on_device_.emplace_back(
+        Context::GetDeviceType());
+    prefetched_additional_outputs_.emplace_back(CPU);
     prefetched_additional_outputs_[i].Resize(
         TIndex(batch_size_), TIndex(additional_output_sizes[i]));
   }
@@ -1196,12 +1197,12 @@ bool ImageInputOp<Context>::Prefetch() {
   // If the context is not CPUContext, we will need to do a copy in the
   // prefetch function as well.
   if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &context_);
+    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
 
     for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
       prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &context_);
+          prefetched_additional_outputs_[i], &cpu_context_);
     }
   }
 
@@ -1212,13 +1213,13 @@ bool ImageInputOp<Context>::Prefetch() {
 
 template <class Context>
 bool ImageInputOp<Context>::CopyPrefetched() {
-  auto* image_output = OperatorBase::Output<Tensor<Context> >(0);
-  auto* label_output = OperatorBase::Output<Tensor<Context> >(1);
-  vector<Tensor<Context>*> additional_outputs_output;
+  auto type = Context::GetDeviceType();
+  auto* image_output = OperatorBase::Output<Tensor>(0, type);
+  auto* label_output = OperatorBase::Output<Tensor>(1, type);
+  vector<Tensor*> additional_outputs_output;
 
   for (int i = 2; i < OutputSize(); ++i) {
-    additional_outputs_output.push_back(
-        OperatorBase::Output<Tensor<Context>>(i));
+    additional_outputs_output.push_back(OperatorBase::Output<Tensor>(i, type));
   }
 
   // Note(jiayq): The if statement below should be optimized away by the
@@ -1238,10 +1239,12 @@ bool ImageInputOp<Context>::CopyPrefetched() {
         mean_gpu_.Resize(mean_.size());
         std_gpu_.Resize(std_.size());
 
-        context_.template Copy<float, CPUContext, Context>(
-          mean_.size(), mean_.data(), mean_gpu_.template mutable_data<float>());
-        context_.template Copy<float, CPUContext, Context>(
-          std_.size(), std_.data(), std_gpu_.template mutable_data<float>());
+        context_.template CopyFromCPU<float>(
+            mean_.size(),
+            mean_.data(),
+            mean_gpu_.template mutable_data<float>());
+        context_.template CopyFromCPU<float>(
+            std_.size(), std_.data(), std_gpu_.template mutable_data<float>());
         mean_std_copied_ = true;
       }
       // GPU transform kernel allows explicitly setting output type
diff --git a/caffe2/image/transform_gpu.cu b/caffe2/image/transform_gpu.cu
index c6d8d775332d9..bb557429f5ad6 100644
--- a/caffe2/image/transform_gpu.cu
+++ b/caffe2/image/transform_gpu.cu
@@ -50,9 +50,12 @@ __global__ void transform_kernel(
 
 template <typename T_IN, typename T_OUT, class Context>
 
-bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y,
-                    Tensor<Context>& mean, Tensor<Context>& std,
-                    Context *context) {
+bool TransformOnGPU(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
+    Context* context) {
   // data comes in as NHWC
   const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
   // data goes out as NCHW
@@ -68,16 +71,18 @@ bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y,
   return true;
 };
 
-template bool TransformOnGPU<uint8_t, float, CUDAContext>(Tensor<CUDAContext>& X,
-                                                          Tensor<CUDAContext> *Y,
-                                                          Tensor<CUDAContext>& mean,
-                                                          Tensor<CUDAContext>& std,
-                                                          CUDAContext *context);
-
-template bool TransformOnGPU<uint8_t, float16, CUDAContext>(Tensor<CUDAContext>& X,
-                                                            Tensor<CUDAContext> *Y,
-                                                            Tensor<CUDAContext>& mean,
-                                                            Tensor<CUDAContext>& std,
-                                                            CUDAContext *context);
+template bool TransformOnGPU<uint8_t, float, CUDAContext>(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
+    CUDAContext* context);
+
+template bool TransformOnGPU<uint8_t, float16, CUDAContext>(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
+    CUDAContext* context);
 
 }  // namespace caffe2
diff --git a/caffe2/image/transform_gpu.h b/caffe2/image/transform_gpu.h
index a19b5251f5d72..3ca11ce159feb 100644
--- a/caffe2/image/transform_gpu.h
+++ b/caffe2/image/transform_gpu.h
@@ -31,9 +31,12 @@
 namespace caffe2 {
 
 template <typename T_IN, typename T_OUT, class Context>
-bool TransformOnGPU(Tensor<Context>& X, Tensor<Context>* Y,
-                    Tensor<Context>& mean, Tensor<Context>& std,
-                    Context* context);
+bool TransformOnGPU(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
+    Context* context);
 
 }  // namespace caffe2
 
diff --git a/caffe2/mkl/mkl_utils_test.cc b/caffe2/mkl/mkl_utils_test.cc
index 678d643c5253f..72dcda2c8f6bb 100644
--- a/caffe2/mkl/mkl_utils_test.cc
+++ b/caffe2/mkl/mkl_utils_test.cc
@@ -23,10 +23,10 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
   int pads[2] = {0, 0};
 
   // Creating Input and output tensors
-  TensorCPU X(vector<TIndex>{16, 8, 32, 32});
-  TensorCPU W(vector<TIndex>{64, 8, 3, 3});
-  TensorCPU b(vector<TIndex>{64});
-  TensorCPU Y(vector<TIndex>{16, 64, 30, 30});
+  Tensor X(vector<TIndex>{16, 8, 32, 32}, CPU);
+  Tensor W(vector<TIndex>{64, 8, 3, 3}, CPU);
+  Tensor b(vector<TIndex>{64}, CPU);
+  Tensor Y(vector<TIndex>{16, 64, 30, 30}, CPU);
 
   float* data = X.mutable_data<float>();
   for (int i = 0; i < X.size(); ++i) {
@@ -56,7 +56,7 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
   // Test if the resource wrapper works.
   MKLMemory<float> X_wrapper(X.dims(), primitive, dnnResourceSrc);
   X_wrapper.CopyFrom(X);
-  TensorCPU X_recover(X.dims());
+  Tensor X_recover(X.dims(), CPU);
   X_wrapper.CopyTo(&X_recover);
   const float* recover_data = X_recover.data<float>();
   for (int i = 0; i < X_recover.size(); ++i) {
@@ -93,7 +93,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
   // layout?). Test both cases.
   vector<vector<TIndex>> dims_list{{10, 3, 20, 20}, {0}, {0, 10}};
   for (const auto& dims : dims_list) {
-    auto X_cpu_in = caffe2::make_unique<TensorCPU>(dims);
+    auto X_cpu_in = caffe2::make_unique<Tensor>(dims, CPU);
     CPUContext ctx;
     math::RandUniform<float, CPUContext>(
         X_cpu_in->size(),
@@ -117,7 +117,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
     EXPECT_EQ(X_mkl1->size(), X_cpu_in->size());
 
     // CPU <- MKL1
-    auto X_cpu_out = caffe2::make_unique<TensorCPU>();
+    auto X_cpu_out = caffe2::make_unique<Tensor>(CPU);
     X_mkl1->CopyTo(X_cpu_out.get());
     EXPECT_EQ(X_cpu_out->dims(), dims);
     EXPECT_EQ(X_cpu_out->size(), X_cpu_in->size());
diff --git a/caffe2/mkl/operators/conv_op.cc b/caffe2/mkl/operators/conv_op.cc
index 71618316cbec6..2678f4c37e17a 100644
--- a/caffe2/mkl/operators/conv_op.cc
+++ b/caffe2/mkl/operators/conv_op.cc
@@ -31,7 +31,7 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
 
     const int M = filter.dim32(0);
     if (InputSize() == 2 && !zero_bias_) {
-      TensorCPU cpu_zero_bias;
+      Tensor cpu_zero_bias{CPU};
       cpu_zero_bias.Resize(M);
       CPUContext ctx;
       math::Set<T, CPUContext>(
@@ -72,8 +72,8 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
       size_t bdata_sizes[4] = {W, H, C, N};
       // We will utilize the SetOutputSize() function int he base class
       // with dummy TensorCPU input and output to calculate the sizes.
-      TensorCPU dummy_input(X.dims());
-      TensorCPU dummy_output;
+      Tensor dummy_input(X.dims(), CPU);
+      Tensor dummy_output(CPU);
       ConvPoolOpBase<MKLContext>::SetOutputSize(
           dummy_input, &dummy_output, M);
       size_t tdata_sizes[4] = {
diff --git a/caffe2/mkl/operators/conv_op_mkldnn.cc b/caffe2/mkl/operators/conv_op_mkldnn.cc
index 0e363863bc435..80edf1332d063 100644
--- a/caffe2/mkl/operators/conv_op_mkldnn.cc
+++ b/caffe2/mkl/operators/conv_op_mkldnn.cc
@@ -28,7 +28,7 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
     auto& X = Input(INPUT);
     auto& filter = Input(FILTER);
     auto& bias = Input(BIAS);
-    TensorCPU* Y = Output(0);
+    Tensor* Y = Output(0);
     CAFFE_ENFORCE(4 == X.ndim());
     const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
     CAFFE_ENFORCE(4 == filter.ndim());
diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h
index cc90bc16c0836..456a96d71fdf8 100644
--- a/caffe2/mkl/operators/operator_fallback_mkl.h
+++ b/caffe2/mkl/operators/operator_fallback_mkl.h
@@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator<MKLContext> {
     for (int i = 0; i < InputSize(); ++i) {
       if (OperatorBase::InputIsType<MKLMemory<float>>(i)) {
         OperatorBase::Input<MKLMemory<float>>(i).CopyTo(
-            local_input_blobs_[i]->template GetMutable<TensorCPU>());
+            local_input_blobs_[i]->GetMutableTensor(CPU));
       } else if (OperatorBase::InputIsType<MKLMemory<double>>(i)) {
         OperatorBase::Input<MKLMemory<double>>(i).CopyTo(
-            local_input_blobs_[i]->template GetMutable<TensorCPU>());
+            local_input_blobs_[i]->GetMutableTensor(CPU));
       } else {
         VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy.";
         // Note(jiayq): This removes a const but conceptually
diff --git a/caffe2/mkl/operators/packed_fc_op.cc b/caffe2/mkl/operators/packed_fc_op.cc
index d24bed7b3dbc7..1f3231dc521f5 100644
--- a/caffe2/mkl/operators/packed_fc_op.cc
+++ b/caffe2/mkl/operators/packed_fc_op.cc
@@ -49,7 +49,7 @@ class PackedFCOp final : public Operator<CPUContext> {
 
     // Check out what is the passed in format.
     const MKLPackedMatrix* packed_matrix = nullptr;
-    if (OperatorBase::InputIsType<TensorCPU>(1)) {
+    if (OperatorBase::InputIsType<Tensor>(1, CPU)) {
       const auto& W = Input(1);
       CAFFE_ENFORCE_EQ(W.ndim(), 2);
       CAFFE_ENFORCE_EQ(W.dim32(0), N);
@@ -142,7 +142,7 @@ class PackedFCOp final : public Operator<CPUContext> {
   size_t axis_{1};
   uint32_t hash_{0};
   vector<TIndex> Y_shape_cache_;
-  Tensor<CPUContext> bias_multiplier_;
+  Tensor bias_multiplier_{CPU};
   std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
 };
 
diff --git a/caffe2/mkl/operators/pool_op.cc b/caffe2/mkl/operators/pool_op.cc
index 434fad2f46b37..284e7f80b8c37 100644
--- a/caffe2/mkl/operators/pool_op.cc
+++ b/caffe2/mkl/operators/pool_op.cc
@@ -61,8 +61,8 @@ bool MKLPoolOp<float>::RunOnDeviceWithOrderNCHW() {
   if (dims_changed || FLAGS_caffe2_mkl_memonger_in_use) {
     // We will utilize the SetOutputSize() function in the base class
     // with dummy TensorCPU input and output to calculate the sizes.
-    TensorCPU dummy_input(X.dims());
-    TensorCPU dummy_output;
+    Tensor dummy_input(X.dims(), CPU);
+    Tensor dummy_output(CPU);
 
     ConvPoolOpBase<MKLContext>::SetOutputSize(
         dummy_input, &dummy_output, X.dim32(1));
diff --git a/caffe2/mkl/operators/utility_ops.cc b/caffe2/mkl/operators/utility_ops.cc
index 969450c7c117e..386bbbc5ee18f 100644
--- a/caffe2/mkl/operators/utility_ops.cc
+++ b/caffe2/mkl/operators/utility_ops.cc
@@ -10,7 +10,7 @@ class CopyCPUToMKLOp final : public MKLOperator<float> {
  public:
   using MKLOperator<float>::MKLOperator;
   bool RunOnDevice() override {
-    const auto& X = OperatorBase::Input<TensorCPU>(0);
+    const auto& X = OperatorBase::Input<Tensor>(0, CPU);
     auto* Y = OperatorBase::OutputBlob(0);
     if (!Y->template IsType<MKLMemory<float>>() ||
         Y->Get<MKLMemory<float>>().dims() != X.dims()) {
@@ -27,7 +27,7 @@ class CopyMKLToCPUOp final : public MKLOperator<float> {
 
   bool RunOnDevice() override {
     const auto& X = OperatorBase::Input<MKLMemory<float>>(0);
-    auto* Y = OperatorBase::Output<TensorCPU>(0);
+    auto* Y = OperatorBase::Output<Tensor>(0, CPU);
     X.CopyTo(Y);
     return true;
   }
diff --git a/caffe2/mkl/utils/mkl_context.cc b/caffe2/mkl/utils/mkl_context.cc
index e13b3ada86fa4..6e9075df43475 100644
--- a/caffe2/mkl/utils/mkl_context.cc
+++ b/caffe2/mkl/utils/mkl_context.cc
@@ -1,5 +1,6 @@
 // #include "caffe2/mkl/utils/mkl_context.h"
 
+#include "mkl_context.h"
 #include "caffe2/core/event_cpu.h"
 
 namespace caffe2 {
@@ -18,4 +19,11 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(MKLDNN, EventErrorMessageCPU);
 REGISTER_EVENT_SET_FINISHED_FUNCTION(MKLDNN, EventSetFinishedCPU);
 REGISTER_EVENT_RESET_FUNCTION(MKLDNN, EventResetCPU);
 
+BaseStaticContext* GetMKLStaticContext() {
+  static MKLStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(MKLDNN, GetMKLStaticContext());
+
 } // namespace caffe2
diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h
index b876894746af0..6181a91dda35d 100644
--- a/caffe2/mkl/utils/mkl_context.h
+++ b/caffe2/mkl/utils/mkl_context.h
@@ -6,9 +6,12 @@
 #include <random>
 
 #include "caffe2/core/context.h"
+#include "caffe2/core/context_base.h"
 
 namespace caffe2 {
 
+BaseStaticContext* GetMKLStaticContext();
+
 /**
  * The MKL Context, which is largely the same as the CPUContext. We instantiate
  * this mainly in order to have a first-class MKL device.
@@ -17,7 +20,7 @@ namespace caffe2 {
  * operators to mainly perform input and output via MKLMemory. As a result,
  * most likely MKLContext::New and ::Delete won't be used as often.
  */
-class MKLContext final {
+class MKLContext : public BaseContext {
  public:
   MKLContext() : random_seed_(RandomNumberSeed()) {}
   explicit MKLContext(const DeviceOption& option)
@@ -27,20 +30,28 @@ class MKLContext final {
     CAFFE_ENFORCE_EQ(option.device_type(), MKLDNN);
   }
 
-  ~MKLContext() {}
+  ~MKLContext() override {}
+
+  BaseStaticContext* GetStaticContext() const override {
+    return GetMKLStaticContext();
+  }
+
+  static BaseStaticContext* StaticContext() {
+    return GetMKLStaticContext();
+  }
 
-  inline void SwitchToDevice(int /*stream_id*/ = 0) {}
+  inline void SwitchToDevice(int /*stream_id*/ = 0) override {}
 
-  inline void WaitEvent(const Event& ev) {
+  inline void WaitEvent(const Event& ev) override {
     ev.Wait(MKLDNN, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(MKLDNN, this, err_msg);
   }
 
-  inline void FinishDeviceComputation() {}
+  inline void FinishDeviceComputation() override {}
 
   inline std::mt19937& RandGenerator() {
     if (!random_generator_.get()) {
@@ -50,7 +61,32 @@ class MKLContext final {
   }
 
   inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return GetCPUAllocator()->New(nbytes);
+    return StaticContext()->New(nbytes);
+  }
+
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
+      override {
+    if (nbytes == 0) {
+      return;
+    }
+    CAFFE_ENFORCE(src);
+    CAFFE_ENFORCE(dst);
+    memcpy(dst, src, nbytes);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
+      override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  bool SupportsNonFundamentalTypes() const override {
+    // MKL meta copy is OK
+    return true;
   }
 
   // Two copy functions that deals with cross-device copies.
@@ -90,10 +126,18 @@ class MKLContext final {
     return false;
   }
 
-  static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
+  static bool IsStreamFree(const DeviceOption& option, int stream_id) {
     return true;
   }
 
+  DeviceType GetDevicetype() const override {
+    return MKLDNN;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return MKLDNN;
+  }
+
  protected:
   // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
   int random_seed_{1701};
@@ -108,21 +152,26 @@ inline void MKLContext::CopyBytes<MKLContext, MKLContext>(
   memcpy(dst, src, nbytes);
 }
 
-template <>
-inline void MKLContext::CopyBytes<CPUContext, MKLContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
-  memcpy(dst, src, nbytes);
-}
+class MKLStaticContext : public BaseStaticContext {
+ public:
+  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
+    return GetCPUAllocator()->New(nbytes);
+  }
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<MKLContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<MKLContext>(option);
+  }
+
+  DeviceType GetDeviceType() override {
+    return MKLDNN;
+  }
+};
 
-template <>
-inline void MKLContext::CopyBytes<MKLContext, CPUContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
-  memcpy(dst, src, nbytes);
-}
 } // namespace caffe2
 
 #endif // CAFFE2_UTILS_MKL_CONTEXT_H_
diff --git a/caffe2/mobile/contrib/CMakeLists.txt b/caffe2/mobile/contrib/CMakeLists.txt
index 29a35812bc4ec..e49c2ef129c32 100644
--- a/caffe2/mobile/contrib/CMakeLists.txt
+++ b/caffe2/mobile/contrib/CMakeLists.txt
@@ -1,7 +1,10 @@
 add_subdirectory(ios)
-add_subdirectory(opengl)
+# [FIX later or remove] opengl code will be broken because of tensor refactoring, remove this from CI to unblock
+if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
+  # add_subdirectory(opengl)
+endif()
 if (USE_ACL)
-  add_subdirectory(arm-compute)
+  # add_subdirectory(arm-compute)
 endif()
 # Finally pass the src lists back to the parent
 
@@ -17,4 +20,4 @@ set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
 # GPU source, test sources, binary sources
 set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
 set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
-set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
\ No newline at end of file
+set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
index 56c95237e923d..111af03f8602b 100644
--- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
+++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
@@ -43,7 +43,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
   if (first_run_) {
     first_run_ = false;
     for (int i = 0; i < Inputs().size(); ++i) {
-      auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
+      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
       Y->Resize(inputs_[i]->dims());
       Y->template mutable_data<float>();
     }
@@ -54,7 +54,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
       // GLTensor
       auto* X = inputs_[i].get();
       X->lazy_allocate(Xblob, second_run_, true);
-      auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
+      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
       Timer timer;
       timer.Start();
       getTensorCPU(*X, *Y);
diff --git a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
index fc53479088443..50b457c7ba86d 100644
--- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
+++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
@@ -27,7 +27,7 @@ template<typename T = float>
 void PopulateCPUBlob(Workspace *ws, bool random, std::string name,
                      std::vector<int> dims, int val = 1, int dist_shift = 0, float variance = 1) {
   Blob *blob = ws->CreateBlob(name);
-  auto *tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(dims);
   T *t_data = tensor->mutable_data<T>();
   std::random_device rd;
diff --git a/caffe2/mobile/contrib/ios/ios_caffe.cc b/caffe2/mobile/contrib/ios/ios_caffe.cc
index 12e0e5598c6aa..0ac1131350b61 100644
--- a/caffe2/mobile/contrib/ios/ios_caffe.cc
+++ b/caffe2/mobile/contrib/ios/ios_caffe.cc
@@ -41,7 +41,7 @@ void GenerateStylizedImage(std::vector<float>& originalImage,
   caffe2::Predictor p(init_net, predict_net);
 
   std::vector<int> dims({1, 3, height, width});
-  caffe2::TensorCPU input;
+  caffe2::Tensor input(caffe2::CPU);
   input.Resize(dims);
   input.ShareExternalPointer(originalImage.data());
   caffe2::Predictor::TensorVector input_vec{&input};
diff --git a/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc b/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
index d497c9b7b7047..d4207691290d5 100644
--- a/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
+++ b/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
@@ -50,7 +50,7 @@ Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
 
 void Caffe2IOSPredictor::run(const Tensor& inData, Tensor& outData, std::string& errorMessage) {
   caffe2::FLAGS_caffe2_force_shared_col_buffer = true;
-  caffe2::TensorCPU input;
+  caffe2::Tensor input(caffe2::CPU);
   input.Resize(inData.dims);
   input.ShareExternalPointer(inData.data);
   caffe2::Predictor::TensorVector input_vec{&input};
diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
index d7842eaaa6bdb..45f55ab2407a2 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
@@ -256,9 +256,9 @@ void computeOutputHW(
     int W,
     int* OH,
     int* OW) {
-  Tensor<CPUContext> input, output;
+  Tensor input(CPU), output(CPU);
   input.Resize(1, 1, H, W);
-  op->SetOutputSize<CPUContext>(input, &output, 1);
+  op->SetOutputSize(input, &output, 1);
   CAFFE_ENFORCE_EQ(output.ndim(), 4);
   *OH = output.dim(2);
   *OW = output.dim(3);
@@ -495,7 +495,7 @@ bool RunOnDevice() override {
       caffe2::Timer rt;
       // Initialize random noise on first use.
       // Cache it to maintain temporal consistency.
-      auto* t = noiseBlob->template GetMutable<TensorCPU>();
+      auto* t = noiseBlob->GetMutableTensor(CPU);
       t->Resize(noiseSize);
       math::RandGaussian<float, CPUContext>(
           t->size(),
diff --git a/caffe2/mobile/contrib/ios/pool_test.cc b/caffe2/mobile/contrib/ios/pool_test.cc
index e6d9daa0e70dc..c4f6ff4d6a3a4 100644
--- a/caffe2/mobile/contrib/ios/pool_test.cc
+++ b/caffe2/mobile/contrib/ios/pool_test.cc
@@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/mobile/contrib/ios/resize_test.cc b/caffe2/mobile/contrib/ios/resize_test.cc
index 5a14f4606635d..90e672397b821 100644
--- a/caffe2/mobile/contrib/ios/resize_test.cc
+++ b/caffe2/mobile/contrib/ios/resize_test.cc
@@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/mobile/contrib/nnapi/nnapi.cc b/caffe2/mobile/contrib/nnapi/nnapi.cc
index 3f05149c70454..45ea26c44cc96 100644
--- a/caffe2/mobile/contrib/nnapi/nnapi.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi.cc
@@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) {
         output_dims.push_back(dim);
       }
 
-      auto* tensor = ws_.CreateBlob(blob)->GetMutable<TensorCPU>();
+      auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU);
       tensor->Resize(output_dims);
       outputs->push_back(tensor);
 
diff --git a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
index db0e867aa07ce..359e7767746b6 100644
--- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
@@ -43,14 +43,14 @@ static double benchmark_conv_caffe2(
     ws = &localWs;
   }
   {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
     if (group == 1) {
       t->Resize(K, C, kernel, kernel);
     } else {
@@ -61,7 +61,7 @@ static double benchmark_conv_caffe2(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
     t->Resize(K);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -129,14 +129,14 @@ static double benchmark_conv_nnapi(
     ws = &localWs;
   }
   {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
     if (group > 1) {
       CAFFE_ENFORCE_EQ(C, group);
       t->Resize(1, kernel, kernel, C);
@@ -148,7 +148,7 @@ static double benchmark_conv_nnapi(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
     t->Resize(K);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -190,7 +190,7 @@ static double benchmark_conv_nnapi(
   NetDef initNet;
   NNApi model(initNet, netdef, ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
   CAFFE_ENFORCE(model.run(inputs, &outputs));
 
   for (int i = 0; i < warmup; i++) {
@@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8(
     ws = &localWs;
   }
   {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, H, W, C);
     for (int i = 0; i < t->size(); i++) {
       t->mutable_data<uint8_t>()[i] = rand() % 10;
     }
   }
   {
-    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
     if (group > 1) {
       CAFFE_ENFORCE_EQ(C, group);
       t->Resize(1, kernel, kernel, C);
@@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8(
   // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
   // bias_scale == input_scale * filter_scale.
   {
-    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
     t->Resize(K);
     for (int i = 0; i < t->size(); i++) {
       t->mutable_data<int32_t>()[i] = rand() % 10;
@@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8(
   NetDef initNet;
   NNApi model(initNet, netdef, ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
   CAFFE_ENFORCE(model.run(inputs, &outputs));
 
   for (int i = 0; i < warmup; i++) {
diff --git a/caffe2/mobile/contrib/nnapi/nnapi_test.cc b/caffe2/mobile/contrib/nnapi/nnapi_test.cc
index 76278c8ef8fb8..deab1ca7b43f7 100644
--- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc
@@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) {
   // CPU reference
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) {
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -103,21 +103,21 @@ static void test_conv_NHWC(
     int stride_w) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
     t->Resize(K, kernel, kernel, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
     t->Resize(K);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -189,7 +189,7 @@ static void test_conv_NHWC(
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC(
     int stride_w) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
     t->Resize(1, kernel, kernel, D);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
     t->Resize(D);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC(
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -428,7 +428,7 @@ static void test_pooling(
     int stride_w) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -496,7 +496,7 @@ static void test_pooling(
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -506,7 +506,7 @@ static void test_pooling(
 static void test_softmax(int N, int C, int H = 1, int W = 1) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     if (H == 1 && W == 1) {
       t->Resize(N, C);
     } else {
@@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) {
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
diff --git a/caffe2/mobile/contrib/opengl/CMakeLists.txt b/caffe2/mobile/contrib/opengl/CMakeLists.txt
index f23de75d153a1..6d116253f71d5 100644
--- a/caffe2/mobile/contrib/opengl/CMakeLists.txt
+++ b/caffe2/mobile/contrib/opengl/CMakeLists.txt
@@ -1,14 +1,11 @@
-if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
-  add_subdirectory(core)
-  add_subdirectory(operators)
+add_subdirectory(core)
+add_subdirectory(operators)
 
-  if (ANDROID)
-    add_subdirectory(android)
-  endif()
-
-  if (IOS)
-    add_subdirectory(ios)
-  endif()
+if (ANDROID)
+  add_subdirectory(android)
 endif()
 
+if (IOS)
+  add_subdirectory(ios)
+endif()
 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
index f9ede815f5a99..49a875184c10d 100644
--- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc
+++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
@@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1,
   LOG(INFO) << "OPENGLCopyFrom/To Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -275,7 +275,7 @@ void testOpenGLConv(int N,
             << " Op: " << glPoolOperationName[poolOp];
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     if (random_input) {
@@ -301,7 +301,7 @@ void testOpenGLConv(int N,
   }
 
   if (poolOp != AveragePool && poolOp != MaxPool) {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
     if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) {
       t->Resize(K, C, kernel_h, kernel_w);
     } else {
@@ -343,7 +343,7 @@ void testOpenGLConv(int N,
 
     // bias
     {
-      auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+      auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
       t->Resize(K);
       CPUContext ctx;
       if (random_input) {
@@ -367,7 +367,7 @@ void testOpenGLConv(int N,
   }
 
   if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
-    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
     t->Resize(K);
     CPUContext ctx;
     if (random_input) {
@@ -532,7 +532,7 @@ void testOpenGLPRelu(
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -541,7 +541,7 @@ void testOpenGLPRelu(
 
   // prelu scale
   {
-    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
     t->Resize(prelu_size);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
+    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
     t0->Resize(N, C, H, W);
     CPUContext ctx0;
     // Too noisy.
     math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
 
-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
+    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
     t1->Resize(N, C, H, W);
     CPUContext ctx1;
     // Too noisy.
@@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
 
   Workspace ws;
   {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
+    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
     t0->Resize(N, C, H, W);
     CPUContext ctx0;
     // Too noisy.
     math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
 
-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
+    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
     t1->Resize(N, C, H, W);
     CPUContext ctx1;
     // Too noisy.
@@ -814,7 +814,8 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
             << "H: " << H << ", W: " << W;
   Workspace ws;
   for (int i = 0; i < Cs.size(); i++) {
-    auto* t = ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutable<TensorCPU>();
+    auto* t =
+        ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU);
     t->Resize(N, Cs[i], H, W);
     CPUContext ctx0;
     // Too noisy.
@@ -890,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -941,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
@@ -991,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
   }
 
   {
-    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
     t->Resize(1);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
@@ -1059,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) {
   LOG(INFO) << "OpenGL Softmax Test "
             << "N: " << N << " D: " << D << " Tiled:" << tiled;
   Workspace ws;
-  auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+  auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
   {
     t->Resize(N, D);
     CPUContext ctx;
@@ -1150,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -1162,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
 
   // scale
   {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1171,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
   }
   // bias
   {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1253,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -1265,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
 
   // scale
   {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1274,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
   }
   // bias
   {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1283,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
   }
   // prelu scale
   {
-    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
     t->Resize(C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -1384,7 +1385,7 @@ void OpenGL_speedtest(int N,
             << " C: " << C << " H: " << H << " W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     if (random_input) {
@@ -1398,7 +1399,7 @@ void OpenGL_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
     t->Resize(K, C, kernel_h, kernel_w);
     CPUContext ctx;
     if (random_input) {
@@ -1412,7 +1413,7 @@ void OpenGL_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
     t->Resize(K);
     CPUContext ctx;
     if (random_input) {
@@ -1478,7 +1479,7 @@ void testOpenGLPadImage(
   {
     Workspace ws;
     {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(N, C, H, W);
       CPUContext ctx;
       math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -1592,7 +1593,7 @@ void testOpenGLResize(int N,
   {
     Workspace ws;
     {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
       t->Resize(N, C, H, W);
       CPUContext ctx;
       math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -1674,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
   LOG(INFO) << "OpenGL Preprocess Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, H, W, C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1683,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
     t->Resize(3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 100;
@@ -1747,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
   LOG(INFO) << "OpenGLDeprocess Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1756,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
     t->Resize(3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 30;
@@ -1799,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
   LOG(INFO) << "OpenGLNormPlanarYUV Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, 3, H, W);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1808,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
     t->Resize(1, 3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 30;
@@ -1817,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("stdev")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU);
     t->Resize(1, 3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 6;
@@ -1878,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N,
   LOG(INFO) << "OpenGL CopyOps Speed Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     if (random_input) {
@@ -1892,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
     t->Resize(K, C, kernel_h, kernel_w);
     CPUContext ctx;
     if (random_input) {
@@ -1906,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
     t->Resize(K);
     CPUContext ctx;
     if (random_input) {
@@ -1989,7 +1990,8 @@ void compareModelsForOpenGL(std::string name,
     Workspace cws;
     cws.RunNetOnce(initNet);
 
-    auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))
+                      ->GetMutableTensor(CPU);
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@@ -2030,8 +2032,8 @@ void compareModelsForOpenGL(std::string name,
     Workspace mws;
     mws.RunNetOnce(initNet);
 
-    auto* t_gl =
-        mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))
+                     ->GetMutableTensor(CPU);
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@@ -2113,7 +2115,8 @@ void compareBatchedToTiledModels(std::string name,
     Workspace tws;
     tws.RunNetOnce(initNet);
 
-    auto* t_batch = tws.CreateBlob(bachedNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_batch =
+        tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU);
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@@ -2139,7 +2142,8 @@ void compareBatchedToTiledModels(std::string name,
     Workspace bws;
     bws.RunNetOnce(initNet);
 
-    auto* t_tiling = bws.CreateBlob(tiledNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_tiling =
+        bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU);
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
diff --git a/caffe2/mobile/contrib/snpe/snpe_op.cc b/caffe2/mobile/contrib/snpe/snpe_op.cc
index fa015960183d2..db8a95fe8395a 100644
--- a/caffe2/mobile/contrib/snpe/snpe_op.cc
+++ b/caffe2/mobile/contrib/snpe/snpe_op.cc
@@ -111,7 +111,8 @@ class SNPEOp final : public Operator<CPUContext> {
     X(snpe_copy_output_to);
     snpe_copy_output_to_f(ctx_.get(), Output(0)->mutable_data<float>());
 
-    CAFFE_ENFORCE(Output(0)->data<float>(), "nullptr where output should be!\n");
+    CAFFE_ENFORCE(
+        Output(0)->data<float>(), "nullptr where output should be!\n");
     return true;
   }
 
diff --git a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
index 58e3ccbb8a7b6..1bbe303ef777d 100644
--- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
+++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
@@ -11,20 +11,22 @@
 #if TEST_REAL_DATA
 #include "data_chw.h"
 #include "data_hwc.h"
-#define POPULATE_DATA(_n, _s, _l) do {\
-  Blob* _blob = ws.CreateBlob((_n));\
-  auto* _tensor = _blob->GetMutable<TensorCPU>();\
-  _tensor->Resize((_s));\
-  memcpy(_tensor->mutable_data<float>(), data_##_l, _tensor->nbytes());\
-} while(0)
+#define POPULATE_DATA(_n, _s, _l)                                         \
+  do {                                                                    \
+    Blob* _blob = ws.CreateBlob((_n));                                    \
+    auto* _tensor = _blob->GetMutableTensor(CPU);                         \
+    _tensor->Resize((_s));                                                \
+    memcpy(_tensor->mutable_data<float>(), data_##_l, _tensor->nbytes()); \
+  } while (0)
 #else
 // Rough test on static data
-#define POPULATE_DATA(_n, _s, _l) do {\
-  Blob* _blob = ws.CreateBlob((_n));\
-  auto* _tensor = _blob->GetMutable<TensorCPU>();\
-  _tensor->Resize((_s));\
-  memset(_tensor->mutable_data<float>(), 1, _tensor->nbytes());\
-} while(0)
+#define POPULATE_DATA(_n, _s, _l)                                 \
+  do {                                                            \
+    Blob* _blob = ws.CreateBlob((_n));                            \
+    auto* _tensor = _blob->GetMutableTensor(CPU);                 \
+    _tensor->Resize((_s));                                        \
+    memset(_tensor->mutable_data<float>(), 1, _tensor->nbytes()); \
+  } while (0)
 #endif
 
 #include <cmath>
@@ -41,7 +43,7 @@ void AddConstInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
   math::Set<float, CPUContext>(tensor->size(), value,
                                tensor->mutable_data<float>(),
@@ -54,7 +56,7 @@ void AddNoiseInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
@@ -71,7 +73,7 @@ float snpe_run(int iters, Workspace& ws) {
   const int C = 3;
 
   POPULATE_DATA("X_snpe", (caffe2::vector<caffe2::TIndex>{H, W, C}), hwc);
-  
+
   OperatorDef def;
   def.set_name("snpe_test");
   def.set_type("SNPE");
@@ -176,7 +178,7 @@ int main(int argc, char** argv) {
   float avg_diff = total_diff; // Avg difference as percentage (not a great metric)
   printf("Average difference is %f%%\n", avg_diff * 100);
   printf("JS Divergence is %f\n", JS_divergence); // Jensen-Shannon
-  printf("KL Divergence is %f\n", KL_divergence); // Kullback–Leibler
+  printf("KL Divergence is %f\n", KL_divergence); // Kullback-Leibler
   printf("Predicted %d with %f%% confidence\n", max_index, max * 100);
 
   printf ("Caffe2: %f microseconds.\n", t_caffe2);
diff --git a/caffe2/mobile/contrib/ulp2/ulp.cc b/caffe2/mobile/contrib/ulp2/ulp.cc
index 1d8e0e8fe69a5..3acd17281fefa 100644
--- a/caffe2/mobile/contrib/ulp2/ulp.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp.cc
@@ -261,14 +261,14 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
   state->XQs.resize(k2b1bXBits);
   state->YQs.resize(k2b1bXBits);
   for (auto i = 0; i < k2b1bXBits; ++i) {
-    state->XQs[i] = caffe2::make_unique<TensorCPU>();
-    state->YQs[i] = caffe2::make_unique<TensorCPU>();
+    state->XQs[i] = caffe2::make_unique<Tensor>(CPU);
+    state->YQs[i] = caffe2::make_unique<Tensor>(CPU);
   }
-  state->WQ = caffe2::make_unique<TensorCPU>();
-  state->WQN = caffe2::make_unique<TensorCPU>();
-  state->WQL1Norm = caffe2::make_unique<TensorCPU>();
-  state->scratch = caffe2::make_unique<TensorCPU>();
-  state->scratchColBuffer = caffe2::make_unique<TensorCPU>();
+  state->WQ = caffe2::make_unique<Tensor>(CPU);
+  state->WQN = caffe2::make_unique<Tensor>(CPU);
+  state->WQL1Norm = caffe2::make_unique<Tensor>(CPU);
+  state->scratch = caffe2::make_unique<Tensor>(CPU);
+  state->scratchColBuffer = caffe2::make_unique<Tensor>(CPU);
 
   signQuantize(W, state->WQ.get());
   filterNormalization11(*(state->WQ), state->WQN.get());
@@ -290,7 +290,7 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
   };
   if (b) {
     CPUContext context;
-    state->bias = caffe2::make_unique<TensorCPU>(*b, &context);
+    state->bias = caffe2::make_unique<Tensor>(*b, &context, CPU);
   }
   return state;
 }
diff --git a/caffe2/mobile/contrib/ulp2/ulp_neon.cc b/caffe2/mobile/contrib/ulp2/ulp_neon.cc
index 15ad59a47916e..c7a4450e7ba31 100644
--- a/caffe2/mobile/contrib/ulp2/ulp_neon.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp_neon.cc
@@ -438,7 +438,7 @@ void run2b1bConvIm2ColGEMM(QConvState* state,
   const size_t QK = KH * KW * divRoundUp(X.dim32(3), 8);
   Y->Resize(X.dim32(0), OH, OW, OC);
   if (!state->WQPacked) {
-    state->WQPacked = caffe2::make_unique<TensorCPU>();
+    state->WQPacked = caffe2::make_unique<Tensor>(CPU);
     qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(state, *(state->WQ), 1, state->WQPacked.get());
     CAFFE_ENFORCE_EQ(state->WQPacked->dim32(0), divRoundUp(OC, kGEMMTileSize));
     CAFFE_ENFORCE_EQ(state->WQPacked->dim32(1), divRoundUp(QK, kGEMMTileDepthBytes));
diff --git a/caffe2/mobile/contrib/ulp2/ulp_test.cc b/caffe2/mobile/contrib/ulp2/ulp_test.cc
index 58bc5e7132836..f6705e638ddac 100644
--- a/caffe2/mobile/contrib/ulp2/ulp_test.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc
@@ -63,7 +63,7 @@ int randInt(int a, int b) {
 }
 
 TensorCPU genTensor11(std::vector<TIndex> shape) {
-  TensorCPU r;
+  Tensor r(CPU);
   r.Resize(shape);
 
   std::random_device rd;
@@ -77,7 +77,7 @@ TensorCPU genTensor11(std::vector<TIndex> shape) {
 }
 
 TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
-  TensorCPU r;
+  Tensor r(CPU);
   r.Resize(shape);
 
   std::random_device rd;
@@ -91,7 +91,7 @@ TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
 }
 
 TensorCPU genTensor0123(std::vector<TIndex> shape) {
-  TensorCPU r;
+  Tensor r(CPU);
   r.Resize(shape);
 
   std::random_device rd;
@@ -114,7 +114,7 @@ TEST(ULP, QPadZero) {
   const auto ICQ = 1;
 
   auto X = genTensor11({1, 10, 10, ICQ * 8});
-  TensorCPU XQ, XQPad;
+  Tensor XQ(CPU), XQPad(CPU);
   signQuantize(X, &XQ);
   qpad_zero(args, XQ, &XQPad);
 
@@ -174,7 +174,7 @@ inline void qgemmNT(int M, int N, int K, const uint8_t* A, const uint8_t* B, flo
 void gemmTest(TIndex M, TIndex N, TIndex K) {
   auto X = genTensor11({M, K});
   auto W = genTensor11({N, K});
-  TensorCPU XQ, WQ, YQ, Y;
+  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
   {
     signQuantize(X, &XQ);
     signQuantize(W, &WQ);
@@ -207,7 +207,7 @@ TEST(QConv, ConvTest) {
   int K = 3;
   auto X = genTensor11({1, S, S, IC});
   auto W = genTensor11({OC, K, K, IC});
-  TensorCPU XQ, WQ, YQ, Y;
+  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
   {
     signQuantize(X, &XQ);
     signQuantize(W, &WQ);
@@ -235,16 +235,16 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
   auto X = genTensor0123({N, H, W, IC});
   auto W_ = genTensor11({OC, KH, KW, IC});
   auto bias = genTensorUniform11({OC});
-  TensorCPU Y, YQ, Y2b1b, YOP;
+  Tensor Y(CPU), YQ(CPU), Y2b1b(CPU), YOP(CPU);
 
   {
     std::vector<std::unique_ptr<TensorCPU>> XQs(k2b1bXBits);
     std::vector<std::unique_ptr<TensorCPU>> YQs(k2b1bXBits);
     for (auto i = 0; i < k2b1bXBits; ++i) {
-      XQs[i] = caffe2::make_unique<TensorCPU>();
-      YQs[i] = caffe2::make_unique<TensorCPU>();
+      XQs[i] = caffe2::make_unique<Tensor>(CPU);
+      YQs[i] = caffe2::make_unique<Tensor>(CPU);
     }
-    TensorCPU WQN, WQ;
+    Tensor WQN(CPU), WQ(CPU);
     uniformQuantize2b1b(X, XQs, 0.5, 1.0);
     signQuantize(W_, &WQ);
     filterNormalization11(WQ, &WQN);
@@ -289,17 +289,17 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
     def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r));
     def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t));
     def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b));
-    auto* Xws = ws.CreateBlob("X")->GetMutable<TensorCPU>();
+    auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU);
     Xws->ResizeLike(X);
     Xws->ShareExternalPointer(X.mutable_data<float>(), X.size());
-    auto* Wws = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU);
     Wws->ResizeLike(W_);
     Wws->ShareExternalPointer(W_.mutable_data<float>(), W_.size());
-    auto* bws = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU);
     bws->ResizeLike(bias);
     bws->ShareExternalPointer(bias.mutable_data<float>(), bias.size());
     ws.RunOperatorOnce(def);
-    YOP.CopyFrom<CPUContext>(ws.GetBlob("Y")->Get<TensorCPU>());
+    YOP.CopyFrom(ws.GetBlob("Y")->Get<TensorCPU>());
   }
 
   { conv(args, X, W_, &bias, &Y); }
diff --git a/caffe2/mpi/mpi_gpu_test.cc b/caffe2/mpi/mpi_gpu_test.cc
index 087a87575510b..d24521e0274dc 100644
--- a/caffe2/mpi/mpi_gpu_test.cc
+++ b/caffe2/mpi/mpi_gpu_test.cc
@@ -55,7 +55,6 @@ TEST(MPITest, TestMPIBroadcast) {
   arg->set_f(rank);
   int size;
   MPI_Comm_size(MPI_COMM_WORLD, &size);
-
   for (int root = 0; root < size; ++root) {
     net_def.mutable_op(2)->mutable_arg(0)->set_i(root);
     Workspace ws;
@@ -63,8 +62,8 @@ TEST(MPITest, TestMPIBroadcast) {
     EXPECT_NE(nullptr, net.get());
     EXPECT_TRUE(net->Run());
     // Let's test the value.
-    auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
-    TensorCPU X_cpu(X);
+    auto& X = ws.GetBlob("X")->Get<Tensor>();
+    Tensor X_cpu(X, CPU);
     EXPECT_EQ(X.size(), 10);
     for (int i = 0; i < X.size(); ++i) {
       EXPECT_EQ(X_cpu.data<float>()[i], root);
@@ -133,7 +132,7 @@ TEST(MPITest, TestMPIReduce) {
       auto& X = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
       EXPECT_EQ(X.size(), 10);
       int expected_result = size * (size - 1) / 2;
-      TensorCPU X_cpu(X);
+      Tensor X_cpu(X, CPU);
       for (int i = 0; i < X.size(); ++i) {
         EXPECT_EQ(X_cpu.data<float>()[i], expected_result);
       }
@@ -190,7 +189,7 @@ TEST(MPITest, TestMPIAllgather) {
   EXPECT_TRUE(net->Run());
   // Let's test the value.
   auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
-  TensorCPU X_cpu(X);
+  Tensor X_cpu(X, CPU);
   EXPECT_EQ(X.size(), 20);
   for (int i = 0; i < X.size(); ++i) {
     EXPECT_EQ(X_cpu.data<float>()[i], rank);
@@ -199,7 +198,7 @@ TEST(MPITest, TestMPIAllgather) {
   EXPECT_EQ(X_gathered.size(), 20 * size);
   EXPECT_EQ(X_gathered.dim(0), 2 * size);
   EXPECT_EQ(X_gathered.dim(1), 10);
-  TensorCPU X_gathered_cpu(X_gathered);
+  Tensor X_gathered_cpu(X_gathered, CPU);
   for (int i = 0; i < X_gathered.size(); ++i) {
     EXPECT_EQ(X_gathered_cpu.data<float>()[i], i / 20);
   }
@@ -254,14 +253,14 @@ TEST(MPITest, TestMPIAllreduce) {
   // Let's test the value.
   auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
   EXPECT_EQ(X.size(), 10);
-  TensorCPU X_cpu(X);
+  Tensor X_cpu(X, CPU);
   for (int i = 0; i < X.size(); ++i) {
     EXPECT_EQ(X_cpu.data<float>()[i], rank);
   }
   auto& X_reduced = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
   EXPECT_EQ(X_reduced.size(), 10);
   int expected_result = size * (size - 1) / 2;
-  TensorCPU X_reduced_cpu(X_reduced);
+  Tensor X_reduced_cpu(X_reduced, CPU);
   for (int i = 0; i < X_reduced.size(); ++i) {
     EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
   }
@@ -316,7 +315,7 @@ TEST(MPITest, TestInPlaceMPIAllreduce) {
   auto& X_reduced = ws.GetBlob("X")->Get<TensorCUDA>();
   EXPECT_EQ(X_reduced.size(), 10);
   int expected_result = size * (size - 1) / 2;
-  TensorCPU X_reduced_cpu(X_reduced);
+  Tensor X_reduced_cpu(X_reduced, CPU);
   for (int i = 0; i < X_reduced.size(); ++i) {
     EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
   }
diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h
index 108bf45afdb04..7d251f7445d1e 100644
--- a/caffe2/mpi/mpi_ops.h
+++ b/caffe2/mpi/mpi_ops.h
@@ -36,8 +36,7 @@ class MPIBroadcastOp final : public Operator<Context> {
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
     CAFFE_ENFORCE(
-        OperatorBase::OutputIsType<Tensor<Context>>(0),
-        "Output is of wrong type.");
+        OperatorBase::OutputIsType<Tensor>(0), "Output is of wrong type.");
     auto* output = Output(0);
     // Make sure that output is already allocated.
     CAFFE_ENFORCE(
@@ -168,8 +167,8 @@ class MPISendTensorOp final : public Operator<Context> {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
     auto& input = Input(INPUT);
     if (InputSize() == 4) {
-      dst_ = OperatorBase::Input<TensorCPU>(DST).template data<int>()[0];
-      tag_ = OperatorBase::Input<TensorCPU>(TAG).template data<int>()[0];
+      dst_ = OperatorBase::Input<Tensor>(DST, CPU).template data<int>()[0];
+      tag_ = OperatorBase::Input<Tensor>(TAG, CPU).template data<int>()[0];
     }
     if (raw_buffer_) {
       // We need to do a const cast to cope with the fact that, before OpenMPI
@@ -211,8 +210,8 @@ class MPIReceiveTensorOp final : public Operator<Context> {
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
     if (InputSize() == 4) {
-      src_ = OperatorBase::Input<TensorCPU>(SRC_IN).template data<int>()[0];
-      tag_ = OperatorBase::Input<TensorCPU>(TAG_IN).template data<int>()[0];
+      src_ = OperatorBase::Input<Tensor>(SRC_IN, CPU).template data<int>()[0];
+      tag_ = OperatorBase::Input<Tensor>(TAG_IN, CPU).template data<int>()[0];
     }
     MPI_Status status;
     if (raw_buffer_) {
@@ -228,10 +227,10 @@ class MPIReceiveTensorOp final : public Operator<Context> {
     } else {
       CAFFE_NOT_IMPLEMENTED;
     }
-    auto* src_out = OperatorBase::Output<TensorCPU>(SRC_OUT);
+    auto* src_out = OperatorBase::Output<Tensor>(SRC_OUT, CPU);
     src_out->Resize();
     src_out->template mutable_data<int>()[0] = status.MPI_SOURCE;
-    auto* tag_out = OperatorBase::Output<TensorCPU>(TAG_OUT);
+    auto* tag_out = OperatorBase::Output<Tensor>(TAG_OUT, CPU);
     tag_out->Resize();
     tag_out->template mutable_data<int>()[0] = status.MPI_TAG;
     return true;
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc
index afeac6d127ba7..df97bcffb2cf6 100644
--- a/caffe2/observers/profile_observer_gpu.cc
+++ b/caffe2/observers/profile_observer_gpu.cc
@@ -26,17 +26,10 @@ void ProfileOperatorObserver::Dump() const {
   LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type()
             << " op#" << getId() << " ---------";
   for (int i = 0; i < subject_->InputSize(); ++i) {
-    if (subject_->InputIsType<TensorCPU>(i)) {
-      const auto& tensor = subject_->Input<TensorCPU>(i);
-      const auto& name = subject_->debug_def().input(i);
-      TensorPrinter printer(name);
-      LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
-    } else if (subject_->InputIsType<TensorCUDA>(i)) {
-      const auto& tensor = subject_->Input<TensorCUDA>(i);
-      const auto& name = subject_->debug_def().input(i);
-      TensorPrinter printer(name);
-      LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
-    }
+    const auto& tensor = subject_->Input<Tensor>(i);
+    const auto& name = subject_->debug_def().input(i);
+    TensorPrinter printer(name);
+    LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
   }
 
   int a = 0;
@@ -46,13 +39,13 @@ void ProfileOperatorObserver::Dump() const {
   }
 
   for (int o = 0; o < subject_->OutputSize(); ++o) {
-    if (subject_->OutputIsType<TensorCPU>(o)) {
-      auto* tensor = subject_->Output<TensorCPU>(o);
+    if (subject_->OutputIsType<Tensor>(o, CPU)) {
+      auto* tensor = subject_->Output<Tensor>(o, CPU);
       const auto& name = subject_->debug_def().output(o);
       TensorPrinter printer(name);
       LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
-    } else if (subject_->OutputIsType<TensorCUDA>(o)) {
-      auto* tensor = subject_->Output<TensorCUDA>(o);
+    } else if (subject_->OutputIsType<Tensor>(o, CUDA)) {
+      auto* tensor = subject_->Output<Tensor>(o, CUDA);
       const auto& name = subject_->debug_def().output(o);
       TensorPrinter printer(name);
       LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
diff --git a/caffe2/operators/accuracy_op.cc b/caffe2/operators/accuracy_op.cc
index 03733ed89a3b2..8c1273eca2099 100644
--- a/caffe2/operators/accuracy_op.cc
+++ b/caffe2/operators/accuracy_op.cc
@@ -38,7 +38,7 @@ bool AccuracyOp<float, CPUContext>::RunOnDevice() {
     }
   }
   CAFFE_ENFORCE_LE(correct, N);
-  *(Y->mutable_data<float>()) = static_cast<float>(correct) / N;
+  *(Y->template mutable_data<float>()) = static_cast<float>(correct) / N;
 
   return true;
 }
@@ -46,10 +46,10 @@ bool AccuracyOp<float, CPUContext>::RunOnDevice() {
 REGISTER_CPU_OPERATOR(Accuracy, AccuracyOp<float, CPUContext>);
 
 OPERATOR_SCHEMA(Accuracy)
-  .NumInputs(2)
-  .NumOutputs(1)
-  .ScalarType(TensorProto::FLOAT)
-  .SetDoc(R"DOC(
+    .NumInputs(2)
+    .NumOutputs(1)
+    .ScalarType(TensorProto::FLOAT)
+    .SetDoc(R"DOC(
 Accuracy takes two inputs- predictions and labels, and returns a float
 accuracy value for the batch. Predictions are expected in the form of 2-D tensor
 containing a batch of scores for various classes, and labels are expected in the
@@ -57,16 +57,25 @@ containing a batch of scores for various classes, and labels are expected in the
 the score for the label index in the predictions is the highest among all
 classes, it is considered a correct prediction.
 )DOC")
-  .Arg(
-      "top_k",
-      "Count as correct by comparing the true label to the top k scoring "
-      "classes (default 1: only compare to the top scoring class i.e. argmax)")
-  .Input(0, "predictions", "2-D tensor (Tensor<float>) of size "
-         "(num_batches x num_classes) containing scores")
-  .Input(1, "labels", "1-D tensor (Tensor<int>) of size (num_batches) having "
+    .Arg(
+        "top_k",
+        "Count as correct by comparing the true label to the top k scoring "
+        "classes (default 1: only compare to the top scoring class i.e. argmax)")
+    .Input(
+        0,
+        "predictions",
+        "2-D tensor (Tensor<float>) of size "
+        "(num_batches x num_classes) containing scores")
+    .Input(
+        1,
+        "labels",
+        "1-D tensor (Tensor<float>) of size (num_batches) having "
         "the indices of true labels")
-  .Output(0, "accuracy", "1-D tensor (Tensor<float>) of size 1 containing "
-          "accuracy");
+    .Output(
+        0,
+        "accuracy",
+        "1-D tensor (Tensor<float>) of size 1 containing "
+        "accuracy");
 
 SHOULD_NOT_DO_GRADIENT(Accuracy);
 }  // namespace caffe2
diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu
index 949a077ec9a80..5d27707662c74 100644
--- a/caffe2/operators/accuracy_op.cu
+++ b/caffe2/operators/accuracy_op.cu
@@ -54,7 +54,7 @@ bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(label.ndim(), 1);
   CAFFE_ENFORCE_EQ(label.dim32(0), N);
   Y->Resize(vector<TIndex>());
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   math::Set<float, CUDAContext>(1, 0, Ydata, &context_);
   AccuracyKernel<<<
       std::min(CAFFE_MAXIMUM_NUM_BLOCKS, N),
diff --git a/caffe2/operators/affine_channel_op.cc b/caffe2/operators/affine_channel_op.cc
index 823a3cf8fee37..a19e96f9e1884 100644
--- a/caffe2/operators/affine_channel_op.cc
+++ b/caffe2/operators/affine_channel_op.cc
@@ -70,7 +70,7 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
       scale_dims.data(),
       dY_data,
       scale_data,
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
       &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
@@ -85,8 +85,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
         HxW,
         dY_data,
         X_data,
-        dscale->mutable_data<float>(),
-        dbias->mutable_data<float>());
+        dscale->template mutable_data<float>(),
+        dbias->template mutable_data<float>());
   }
   return true;
 }
@@ -104,7 +104,12 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   const float* dY_data = dY.data<float>();
   const float* scale_data = scale.data<float>();
   math::RowwiseMul<float, CPUContext>(
-      rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
+      rows,
+      cols,
+      dY_data,
+      scale_data,
+      dX->template mutable_data<float>(),
+      &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
     const float* X_data = X.data<float>();
@@ -120,8 +125,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
         HxW,
         dY_data,
         X_data,
-        dscale->mutable_data<float>(),
-        dbias->mutable_data<float>());
+        dscale->template mutable_data<float>(),
+        dbias->template mutable_data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu
index f3a9703cd14c3..6faa01eb97419 100644
--- a/caffe2/operators/affine_channel_op.cu
+++ b/caffe2/operators/affine_channel_op.cu
@@ -71,7 +71,7 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
       scale_dims.data(),
       dY_data,
       scale_data,
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
       &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
@@ -91,8 +91,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
             HxW,
             dY_data,
             X_data,
-            dscale->mutable_data<float>(),
-            dbias->mutable_data<float>());
+            dscale->template mutable_data<float>(),
+            dbias->template mutable_data<float>());
   }
   return true;
 }
@@ -110,7 +110,12 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const float* dY_data = dY.data<float>();
   const float* scale_data = scale.data<float>();
   math::RowwiseMul<float, CUDAContext>(
-      rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
+      rows,
+      cols,
+      dY_data,
+      scale_data,
+      dX->template mutable_data<float>(),
+      &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
     const float* X_data = X.data<float>();
@@ -130,8 +135,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
             HxW,
             dY_data,
             X_data,
-            dscale->mutable_data<float>(),
-            dbias->mutable_data<float>());
+            dscale->template mutable_data<float>(),
+            dbias->template mutable_data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/apmeter_op.cc b/caffe2/operators/apmeter_op.cc
index 7965c81cad2e5..4867d86097de7 100644
--- a/caffe2/operators/apmeter_op.cc
+++ b/caffe2/operators/apmeter_op.cc
@@ -58,7 +58,7 @@ bool APMeterOp<float, CPUContext>::RunOnDevice() {
 
   const auto* Xdata = X.data<float>();
   const auto* labelData = label.data<int>();
-  auto* Ydata = Y->mutable_data<float>();
+  auto* Ydata = Y->template mutable_data<float>();
 
   BufferPredictions(Xdata, labelData, N, D);
 
@@ -116,7 +116,7 @@ per class for the average precision of that class.
     .Input(
         1,
         "labels",
-        "2-D tensor (Tensor<int>) of size (num_samples) "
+        "2-D tensor (Tensor<float>) of size (num_samples) "
         "containing true labels for each sample")
     .Output(
         0,
diff --git a/caffe2/operators/assert_op.h b/caffe2/operators/assert_op.h
index 65bddaca64ca0..27e0579d91e06 100644
--- a/caffe2/operators/assert_op.h
+++ b/caffe2/operators/assert_op.h
@@ -41,7 +41,7 @@ class AssertOp final : public Operator<Context> {
   }
 
  private:
-  TensorCPU cmp_tensor_;
+  Tensor cmp_tensor_{CPU};
   std::string error_msg_;
 };
 
diff --git a/caffe2/operators/atomic_ops.cc b/caffe2/operators/atomic_ops.cc
index 31a4dd659f756..73c4196b6e9b1 100644
--- a/caffe2/operators/atomic_ops.cc
+++ b/caffe2/operators/atomic_ops.cc
@@ -33,8 +33,8 @@ class AtomicFetchAddOp final : public Operator<CPUContext> {
     d->Resize(std::vector<TIndex>());
     auto* aPtr = a.data<int32_t>();
     auto* bPtr = b.data<int32_t>();
-    auto* cPtr = c->mutable_data<int32_t>();
-    auto* dPtr = d->mutable_data<int32_t>();
+    auto* cPtr = c->template mutable_data<int32_t>();
+    auto* dPtr = d->template mutable_data<int32_t>();
     std::lock_guard<std::mutex> lg(*mutex);
     *dPtr = *aPtr;
     *cPtr = *aPtr + *bPtr;
@@ -77,7 +77,7 @@ class CheckAtomicBoolOp final : public Operator<CPUContext> {
   bool RunOnDevice() override {
     auto& ptr = OperatorBase::Input<std::unique_ptr<std::atomic<bool>>>(0);
     Output(0)->Resize(1);
-    *Output(0)->mutable_data<bool>() = ptr->load();
+    *Output(0)->template mutable_data<bool>() = ptr->load();
     return true;
   }
 };
diff --git a/caffe2/operators/batch_gather_ops.cu b/caffe2/operators/batch_gather_ops.cu
index 8aa8cb42a3e01..2d047660491b5 100644
--- a/caffe2/operators/batch_gather_ops.cu
+++ b/caffe2/operators/batch_gather_ops.cu
@@ -31,7 +31,7 @@ __global__ void BatchGatherKernel(
 template <>
 bool BatchGatherOp<CUDAContext>::RunOnDevice() {
   return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<TensorCUDA>(INDICES));
+      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
 }
 
 template <>
@@ -99,7 +99,7 @@ __global__ void BatchGatherGradientKernel(
 template <>
 bool BatchGatherGradientOp<CUDAContext>::RunOnDevice() {
   return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<TensorCUDA>(INDICES));
+      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
 }
 
 template <>
@@ -107,7 +107,7 @@ template <typename TInd>
 bool BatchGatherGradientOp<CUDAContext>::DoRunWithType() {
   return DispatchHelper<
       TensorTypes2<float, GenericTensorImplementation>,
-      TInd>::call(this, OperatorBase::Input<TensorCUDA>(DATA));
+      TInd>::call(this, OperatorBase::Input<Tensor>(DATA, CUDA));
 }
 
 template <>
diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h
index b9d3491132c8f..01177441c021d 100644
--- a/caffe2/operators/batch_gather_ops.h
+++ b/caffe2/operators/batch_gather_ops.h
@@ -15,7 +15,7 @@ class BatchGatherOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(INDICES));
+        this, OperatorBase::Input<Tensor>(INDICES, CPU));
   }
 
   template <typename TInd>
@@ -54,8 +54,7 @@ class BatchGatherOp final : public Operator<Context> {
         auto src =
             src_base + idx * block_bytesize + batch * data_batch_bytesize;
         auto dst = out + i * block_bytesize + batch * gathered_batch_bytesize;
-        context_.template CopyItems<Context, Context>(
-            data.meta(), block_size, src, dst);
+        context_.CopyItemsSameDevice(data.meta(), block_size, src, dst);
       }
     }
     return true;
@@ -72,7 +71,7 @@ class BatchGatherGradientOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(INDICES));
+        this, OperatorBase::Input<Tensor>(INDICES, CPU));
   }
 
   template <typename TInd>
diff --git a/caffe2/operators/batch_matmul_op.h b/caffe2/operators/batch_matmul_op.h
index e594f526a6bf6..6408f1fa4495d 100644
--- a/caffe2/operators/batch_matmul_op.h
+++ b/caffe2/operators/batch_matmul_op.h
@@ -20,7 +20,7 @@ class BatchMatMulOp final : public Operator<Context> {
         broadcast_(OperatorBase::GetSingleArgument<int>("broadcast", 0)),
         use_scratch_(OperatorBase::GetSingleArgument<int>("use_scratch", 0)) {
     if (use_scratch_) {
-      scratch_ = std::make_shared<Tensor<Context>>();
+      scratch_ = std::make_shared<Tensor>(Context::GetDeviceType());
     }
   }
 
@@ -282,7 +282,7 @@ class BatchMatMulOp final : public Operator<Context> {
   bool broadcast_;
 
   bool use_scratch_;
-  std::shared_ptr<Tensor<Context>> scratch_;
+  std::shared_ptr<Tensor> scratch_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/batch_matmul_op_gpu_test.cc b/caffe2/operators/batch_matmul_op_gpu_test.cc
index e8424f0837d6b..33a5363b0afc3 100644
--- a/caffe2/operators/batch_matmul_op_gpu_test.cc
+++ b/caffe2/operators/batch_matmul_op_gpu_test.cc
@@ -30,20 +30,20 @@ class BatchMatMulOpGPUTest : public testing::Test {
       const float value,
       const string& name) {
     Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
+    auto* tensor = blob->GetMutableTensor(CUDA);
     tensor->Resize(dims);
     math::Set<float, CUDAContext>(
         tensor->size(),
         value,
-        tensor->mutable_data<float>(),
+        tensor->template mutable_data<float>(),
         cuda_context_.get());
   }
 
   void VerifyOutput(const std::vector<TIndex>& dims, const float value) const {
     const Blob* Y_blob = ws_.GetBlob("Y");
     ASSERT_NE(nullptr, Y_blob);
-    const auto& Y = Y_blob->Get<Tensor<CUDAContext>>();
-    TensorCPU Y_cpu(Y);
+    const auto& Y = Y_blob->Get<Tensor>();
+    Tensor Y_cpu(Y, CPU);
     const auto& Y_dims = Y_cpu.dims();
     ASSERT_EQ(dims.size(), Y_dims.size());
     for (std::size_t i = 0; i < dims.size(); ++i) {
diff --git a/caffe2/operators/batch_matmul_op_test.cc b/caffe2/operators/batch_matmul_op_test.cc
index 0ec1799179839..28fa8c1a90867 100644
--- a/caffe2/operators/batch_matmul_op_test.cc
+++ b/caffe2/operators/batch_matmul_op_test.cc
@@ -24,12 +24,12 @@ class BatchMatMulOpTest : public testing::Test {
       const float value,
       const string& name) {
     Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutable<TensorCPU>();
+    auto* tensor = blob->GetMutableTensor(CPU);
     tensor->Resize(dims);
     math::Set<float, CPUContext>(
         tensor->size(),
         value,
-        tensor->mutable_data<float>(),
+        tensor->template mutable_data<float>(),
         cpu_context_.get());
   }
 
diff --git a/caffe2/operators/bbox_transform_op.cc b/caffe2/operators/bbox_transform_op.cc
index 0d2b5a3a9aa25..79520face8c09 100644
--- a/caffe2/operators/bbox_transform_op.cc
+++ b/caffe2/operators/bbox_transform_op.cc
@@ -144,7 +144,9 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
 
   box_out->ResizeLike(delta_in);
   Eigen::Map<ERArrXXf> new_boxes(
-      box_out->mutable_data<float>(), box_out->dim32(0), box_out->dim32(1));
+      box_out->template mutable_data<float>(),
+      box_out->dim32(0),
+      box_out->dim32(1));
 
   // We assume roi_in and delta_in over multiple batches are grouped
   // together in increasing order as generated by GenerateProposalsOp
@@ -187,7 +189,7 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
     auto* roi_batch_splits = Output(1);
     roi_batch_splits->Resize(batch_size);
     Eigen::Map<EArrXf> roi_batch_splits_map(
-        roi_batch_splits->mutable_data<float>(), batch_size);
+        roi_batch_splits->template mutable_data<float>(), batch_size);
     roi_batch_splits_map =
         Eigen::Map<const EArrXi>(num_rois_per_batch.data(), batch_size)
             .cast<float>();
diff --git a/caffe2/operators/boolean_mask_ops.cc b/caffe2/operators/boolean_mask_ops.cc
index b38182b8aa98b..2d1deb0badc5d 100644
--- a/caffe2/operators/boolean_mask_ops.cc
+++ b/caffe2/operators/boolean_mask_ops.cc
@@ -91,8 +91,7 @@ bool BooleanMaskOp<CPUContext>::RunOnDevice() {
       const auto* src = inPtr + lastStart * innerSizeBytes;
       auto* dst = outPtr + outStart * innerSizeBytes;
       int numItems = i - lastStart;
-      context_.template CopyItems<CPUContext, CPUContext>(
-          data.meta(), numItems * innerSize, src, dst);
+      context_.CopyItemsSameDevice(data.meta(), numItems * innerSize, src, dst);
       outStart += numItems;
       lastStart = -1;
     }
@@ -356,9 +355,9 @@ bool SequenceMaskOp<CPUContext>::RunOnDevice() {
 template <>
 template <class T>
 bool SequenceMaskOp<CPUContext>::DoRunWithType() {
-  const Tensor<CPUContext>* input = &Input(0);
-  const Tensor<CPUContext>* sequence_lengths = nullptr;
-  const Tensor<CPUContext>* window_centers = nullptr;
+  const Tensor* input = &Input(0);
+  const Tensor* sequence_lengths = nullptr;
+  const Tensor* window_centers = nullptr;
 
   if (mode_ == "sequence") {
     sequence_lengths = &Input(1);
@@ -413,7 +412,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
           SequenceFunctor(
               sequence_lengths->data<int>(), sequence_lengths->size()),
           fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
     } else {
       MaskWithFunctor(
           left,
@@ -423,7 +422,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
           SequenceFunctor(
               sequence_lengths->data<int>(), sequence_lengths->size()),
           fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
     }
   } else if (mode_ == "window") {
     MaskWithFunctor(
@@ -433,7 +432,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         WindowFunctor(window_centers->data<int>(), radius_),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "upper") {
     MaskWithFunctor(
         left,
@@ -442,7 +441,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         UpperFunctor(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "lower") {
     MaskWithFunctor(
         left,
@@ -451,7 +450,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         LowerFunctor(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "upperdiag") {
     MaskWithFunctor(
         left,
@@ -460,7 +459,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         UpperDiagFunctor(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "lowerdiag") {
     MaskWithFunctor(
         left,
@@ -469,7 +468,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         LowerDiagFunctor(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else {
     CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
     return false;
diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu
index 85315768bd85d..f62ec513ca2e4 100644
--- a/caffe2/operators/boolean_mask_ops.cu
+++ b/caffe2/operators/boolean_mask_ops.cu
@@ -73,8 +73,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
 
     // Copy numOfOutput from gpu to cpu
     TIndex numOfOutput;
-    context_.Copy<TIndex, CUDAContext, CPUContext>(
-        1, numOfOutputData, &numOfOutput);
+    context_.CopyToCPU(1, numOfOutputData, &numOfOutput);
 
     indices_.Resize(numOfOutput);
     std::vector<TIndex> dims = src.dims();
@@ -85,7 +84,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
     if (OutputSize() == 2) {
       auto* indicesOut = Output(1);
       indicesOut->Resize(numOfOutput);
-      indicesOut->mutable_data<TIndex>();
+      indicesOut->template mutable_data<TIndex>();
     }
 
     if (numOfOutput > 0) {
@@ -109,8 +108,8 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
   }
 
  private:
-  Tensor<CUDAContext> indices_;
-  Tensor<CUDAContext> scratch_;
+  Tensor indices_{CUDA};
+  Tensor scratch_{CUDA};
 };
 
 REGISTER_CUDA_OPERATOR(BooleanMask, BooleanMaskOp<CUDAContext>);
@@ -297,9 +296,9 @@ bool SequenceMaskOp<CUDAContext>::RunOnDevice() {
 template <>
 template <class T>
 bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
-  const Tensor<CUDAContext>* input = &Input(0);
-  const Tensor<CUDAContext>* sequence_lengths = nullptr;
-  const Tensor<CUDAContext>* window_centers = nullptr;
+  const Tensor* input = &Input(0);
+  const Tensor* sequence_lengths = nullptr;
+  const Tensor* window_centers = nullptr;
 
   if (mode_ == "sequence") {
     sequence_lengths = &Input(1);
@@ -355,7 +354,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
           input->data<T>(),
           sequence_lengths->data<int>(),
           fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
     } else {
       sequenceMaskKernel<<<
           CAFFE_GET_BLOCKS(left * right),
@@ -368,7 +367,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
           input->data<T>(),
           sequence_lengths->data<int>(),
           fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
     }
   } else if (mode_ == "window") {
     windowMaskKernel<<<
@@ -383,7 +382,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         window_centers->data<int>(),
         radius_,
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "upper") {
     upperMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -395,7 +394,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "lower") {
     lowerMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -407,7 +406,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "upperdiag") {
     upperDiagMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -419,7 +418,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else if (mode_ == "lowerdiag") {
     lowerDiagMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -431,7 +430,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
   } else {
     CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
   }
diff --git a/caffe2/operators/boolean_unmask_ops.cu b/caffe2/operators/boolean_unmask_ops.cu
index dcdec9c33df7b..2dfc4a19944be 100644
--- a/caffe2/operators/boolean_unmask_ops.cu
+++ b/caffe2/operators/boolean_unmask_ops.cu
@@ -77,9 +77,9 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
       hostValuesData[i] = (char*)value.raw_data();
       hostValueSizesData[i] = value.size();
     }
-    masks_.CopyFrom(hostMasks_, &context_);
-    values_.CopyFrom(hostValues_, &context_);
-    valueSizes_.CopyFrom(hostValueSizes_, &context_);
+    masks_.CopyFrom(hostMasks_);
+    values_.CopyFrom(hostValues_);
+    valueSizes_.CopyFrom(hostValueSizes_);
 
     indices_.Resize(maskSize);
     auto* indicesData = indices_.mutable_data<int>();
@@ -109,14 +109,14 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
   }
 
  private:
-  Tensor<CUDAContext> indices_;
-  Tensor<CUDAContext> masks_;
-  Tensor<CUDAContext> values_;
-  Tensor<CUDAContext> valueSizes_;
-
-  Tensor<CPUContext> hostMasks_;
-  Tensor<CPUContext> hostValues_;
-  Tensor<CPUContext> hostValueSizes_;
+  Tensor indices_{CUDA};
+  Tensor masks_{CUDA};
+  Tensor values_{CUDA};
+  Tensor valueSizes_{CUDA};
+
+  Tensor hostMasks_{CPU};
+  Tensor hostValues_{CPU};
+  Tensor hostValueSizes_{CPU};
 };
 
 REGISTER_CUDA_OPERATOR(BooleanUnmask, BooleanUnmaskOp<CUDAContext>);
diff --git a/caffe2/operators/boolean_unmask_ops_test.cc b/caffe2/operators/boolean_unmask_ops_test.cc
index 05c588f36aa0a..2972cee495747 100644
--- a/caffe2/operators/boolean_unmask_ops_test.cc
+++ b/caffe2/operators/boolean_unmask_ops_test.cc
@@ -16,13 +16,13 @@ static void AddScalarInput(
     Workspace* ws,
     bool isEmpty = false) {
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   if (!isEmpty) {
     tensor->Resize(vector<TIndex>{1});
-    *(tensor->mutable_data<DataT>()) = value;
+    *(tensor->template mutable_data<DataT>()) = value;
   } else {
     tensor->Resize(vector<TIndex>{0});
-    tensor->mutable_data<DataT>();
+    tensor->template mutable_data<DataT>();
   }
   return;
 }
diff --git a/caffe2/operators/box_with_nms_limit_op.cc b/caffe2/operators/box_with_nms_limit_op.cc
index 9a3f45f85b85d..8c21cbd77a7be 100644
--- a/caffe2/operators/box_with_nms_limit_op.cc
+++ b/caffe2/operators/box_with_nms_limit_op.cc
@@ -77,8 +77,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
   out_boxes->Resize(0, box_dim);
   out_classes->Resize(0);
 
-  TensorCPU* out_keeps = nullptr;
-  TensorCPU* out_keeps_size = nullptr;
+  Tensor* out_keeps = nullptr;
+  Tensor* out_keeps_size = nullptr;
   if (OutputSize() > 4) {
     out_keeps = Output(4);
     out_keeps_size = Output(5);
@@ -194,7 +194,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
       auto cur_boxes = boxes.block(0, j * box_dim, boxes.rows(), box_dim);
       auto& cur_keep = keeps[j];
       Eigen::Map<EArrXf> cur_out_scores(
-          out_scores->mutable_data<float>() + cur_start_idx + cur_out_idx,
+          out_scores->template mutable_data<float>() + cur_start_idx +
+              cur_out_idx,
           cur_keep.size());
       Eigen::Map<ERArrXXf> cur_out_boxes(
           out_boxes->mutable_data<float>() +
@@ -202,7 +203,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
           cur_keep.size(),
           box_dim);
       Eigen::Map<EArrXf> cur_out_classes(
-          out_classes->mutable_data<float>() + cur_start_idx + cur_out_idx,
+          out_classes->template mutable_data<float>() + cur_start_idx +
+              cur_out_idx,
           cur_keep.size());
 
       utils::GetSubArray(
@@ -220,9 +222,11 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
       out_keeps->Extend(total_keep_count, 50, &context_);
 
       Eigen::Map<EArrXi> out_keeps_arr(
-          out_keeps->mutable_data<int>() + cur_start_idx, total_keep_count);
+          out_keeps->template mutable_data<int>() + cur_start_idx,
+          total_keep_count);
       Eigen::Map<EArrXi> cur_out_keeps_size(
-          out_keeps_size->mutable_data<int>() + b * num_classes, num_classes);
+          out_keeps_size->template mutable_data<int>() + b * num_classes,
+          num_classes);
 
       cur_out_idx = 0;
       for (int j = 0; j < num_classes; j++) {
@@ -240,7 +244,7 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
     auto* batch_splits_out = Output(3);
     batch_splits_out->Resize(batch_size);
     Eigen::Map<EArrXf> batch_splits_out_map(
-        batch_splits_out->mutable_data<float>(), batch_size);
+        batch_splits_out->template mutable_data<float>(), batch_size);
     batch_splits_out_map =
         Eigen::Map<const EArrXi>(total_keep_per_batch.data(), batch_size)
             .cast<float>();
diff --git a/caffe2/operators/ceil_op.cu b/caffe2/operators/ceil_op.cu
index de382ada4186d..651b0020eddd9 100644
--- a/caffe2/operators/ceil_op.cu
+++ b/caffe2/operators/ceil_op.cu
@@ -22,7 +22,7 @@ bool CeilOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->mutable_data<float>());
+      X.size(), X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/channel_backprop_stats_op.cc b/caffe2/operators/channel_backprop_stats_op.cc
index bee287d29cef9..90b0b38ceceef 100644
--- a/caffe2/operators/channel_backprop_stats_op.cc
+++ b/caffe2/operators/channel_backprop_stats_op.cc
@@ -26,8 +26,10 @@ bool ChannelBackpropStatsOp<CPUContext>::RunOnDevice() {
   ConstEigenVectorArrayMap<float> mean_arr(Input(SAVED_MEAN).data<float>(), C);
   ConstEigenVectorArrayMap<float> inv_stddev_arr(
       Input(SAVED_INV_STDDEV).data<float>(), C);
-  EigenVectorArrayMap<float> dBias_arr(dBias->mutable_data<float>(), C);
-  EigenVectorArrayMap<float> dScale_arr(dScale->mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dBias_arr(
+      dBias->template mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dScale_arr(
+      dScale->template mutable_data<float>(), C);
 
   dBias_arr.setZero();
   dScale_arr.setZero();
diff --git a/caffe2/operators/channel_backprop_stats_op.cu b/caffe2/operators/channel_backprop_stats_op.cu
index 3726773843113..19999a2360286 100644
--- a/caffe2/operators/channel_backprop_stats_op.cu
+++ b/caffe2/operators/channel_backprop_stats_op.cu
@@ -199,8 +199,8 @@ bool ChannelBackpropStatsOp<CUDAContext>::RunOnDevice() {
           numBlocksPerChannel,
           dBiasScratch_.data<float>(),
           dScaleScratch_.data<float>(),
-          dBias->mutable_data<float>(),
-          dScale->mutable_data<float>());
+          dBias->template mutable_data<float>(),
+          dScale->template mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/channel_backprop_stats_op.h b/caffe2/operators/channel_backprop_stats_op.h
index 7678c00d96f87..ce0e08927108b 100644
--- a/caffe2/operators/channel_backprop_stats_op.h
+++ b/caffe2/operators/channel_backprop_stats_op.h
@@ -23,8 +23,8 @@ class ChannelBackpropStatsOp : public Operator<Context> {
   INPUT_TAGS(INPUT, SAVED_MEAN, SAVED_INV_STDDEV, OUTPUT_GRAD);
   OUTPUT_TAGS(SCALE_GRAD, BIAS_GRAD);
 
-  Tensor<Context> dBiasScratch_;
-  Tensor<Context> dScaleScratch_;
+  Tensor dBiasScratch_{Context::GetDeviceType()};
+  Tensor dScaleScratch_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/channel_shuffle_op_gpu.cu b/caffe2/operators/channel_shuffle_op_gpu.cu
index ad9a0ab4e79d7..120947c21af31 100644
--- a/caffe2/operators/channel_shuffle_op_gpu.cu
+++ b/caffe2/operators/channel_shuffle_op_gpu.cu
@@ -56,7 +56,7 @@ bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), S, C, G, K, X.data<float>(), Y->mutable_data<float>());
+      X.size(), S, C, G, K, X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
@@ -74,7 +74,7 @@ bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), G, K, X.data<float>(), Y->mutable_data<float>());
+      X.size(), G, K, X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
@@ -93,7 +93,13 @@ bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      dY.size(), S, C, K, G, dY.data<float>(), dX->mutable_data<float>());
+      dY.size(),
+      S,
+      C,
+      K,
+      G,
+      dY.data<float>(),
+      dX->template mutable_data<float>());
   return true;
 }
 
@@ -111,7 +117,7 @@ bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      dY.size(), K, G, dY.data<float>(), dX->mutable_data<float>());
+      dY.size(), K, G, dY.data<float>(), dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/channel_stats_op.cc b/caffe2/operators/channel_stats_op.cc
index 442ab48d764de..b9cd19f75909e 100644
--- a/caffe2/operators/channel_stats_op.cc
+++ b/caffe2/operators/channel_stats_op.cc
@@ -17,8 +17,10 @@ bool ChannelStatsOp<CPUContext>::RunOnDevice() {
 
   Output(SUM)->Resize(C);
   Output(SUMSQ)->Resize(C);
-  EigenVectorArrayMap<float> sum(Output(SUM)->mutable_data<float>(), C);
-  EigenVectorArrayMap<float> sumsq(Output(SUMSQ)->mutable_data<float>(), C);
+  EigenVectorArrayMap<float> sum(
+      Output(SUM)->template mutable_data<float>(), C);
+  EigenVectorArrayMap<float> sumsq(
+      Output(SUMSQ)->template mutable_data<float>(), C);
 
   sum.setZero();
   sumsq.setZero();
diff --git a/caffe2/operators/channel_stats_op.cu b/caffe2/operators/channel_stats_op.cu
index fff23ffe46102..7f129ad1ea47e 100644
--- a/caffe2/operators/channel_stats_op.cu
+++ b/caffe2/operators/channel_stats_op.cu
@@ -185,8 +185,8 @@ bool ChannelStatsOp<CUDAContext>::RunOnDevice() {
           numBlocksPerChannel,
           sumScratch_.data<float>(),
           sumsqScratch_.data<float>(),
-          sum->mutable_data<float>(),
-          sumsq->mutable_data<float>());
+          sum->template mutable_data<float>(),
+          sumsq->template mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/channel_stats_op.h b/caffe2/operators/channel_stats_op.h
index eb6b062068c09..0ccb885285760 100644
--- a/caffe2/operators/channel_stats_op.h
+++ b/caffe2/operators/channel_stats_op.h
@@ -23,8 +23,8 @@ class ChannelStatsOp : public Operator<Context> {
   INPUT_TAGS(INPUT);
   OUTPUT_TAGS(SUM, SUMSQ);
 
-  Tensor<Context> sumScratch_;
-  Tensor<Context> sumsqScratch_;
+  Tensor sumScratch_{Context::GetDeviceType()};
+  Tensor sumsqScratch_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/clip_op.cc b/caffe2/operators/clip_op.cc
index 02e80bd131beb..789a44d61cee4 100644
--- a/caffe2/operators/clip_op.cc
+++ b/caffe2/operators/clip_op.cc
@@ -8,7 +8,7 @@ bool ClipOp<float, CPUContext>::RunOnDevice() {
   auto& X = Input(0);
   auto* Y = Output(0);
   Y->ResizeLike(X);
-  EigenVectorMap<float>(Y->mutable_data<float>(), Y->size()) =
+  EigenVectorMap<float>(Y->template mutable_data<float>(), Y->size()) =
       ConstEigenVectorMap<float>(X.data<float>(), X.size())
           .cwiseMax(min_)
           .cwiseMin(max_);
@@ -25,7 +25,7 @@ bool ClipGradientOp<float, CPUContext>::RunOnDevice() {
   dX->ResizeLike(Y);
   const float* Ydata = Y.data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   for (int i = 0; i < Y.size(); ++i) {
     dXdata[i] = dYdata[i] * (Ydata[i] > min_ && Ydata[i] < max_);
   }
diff --git a/caffe2/operators/clip_op.cu b/caffe2/operators/clip_op.cu
index 91b6dca882f39..167ef21492f50 100644
--- a/caffe2/operators/clip_op.cu
+++ b/caffe2/operators/clip_op.cu
@@ -46,9 +46,12 @@ bool ClipOp<float, CUDAContext>::RunOnDevice() {
   auto* Y = Output(0);
   CAFFE_ENFORCE_GT(X.size(), 0);
   Y->ResizeLike(X);
-  ClipKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
-               0, context_.cuda_stream()>>>(
-      X.size(), min_, max_, X.data<float>(), Y->mutable_data<float>());
+  ClipKernel<<<
+      CAFFE_GET_BLOCKS(X.size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      X.size(), min_, max_, X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
@@ -60,10 +63,17 @@ bool ClipGradientOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_GT(Y.size(), 0);
   CAFFE_ENFORCE_EQ(dY.size(), Y.size());
   dX->ResizeLike(Y);
-  ClipGradientKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS,
-                       0, context_.cuda_stream()>>>(
-      Y.size(), min_, max_, Y.data<float>(), dY.data<float>(),
-      dX->mutable_data<float>());
+  ClipGradientKernel<<<
+      CAFFE_GET_BLOCKS(Y.size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      Y.size(),
+      min_,
+      max_,
+      Y.data<float>(),
+      dY.data<float>(),
+      dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc
index 256e2109504ab..006f6212a31d8 100644
--- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc
+++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc
@@ -180,8 +180,8 @@ bool CollectAndDistributeFpnRpnProposalsOp<CPUContext>::RunOnDevice() {
   //   outputs[0].data[...] = rois
   auto* rois_out = Output(0);
   rois_out->Resize(rois.rows(), rois.cols());
-  Eigen::Map<ERArrXXf> rois_out_mat(rois_out->mutable_data<float>(),
-                                    rois.rows(), rois.cols());
+  Eigen::Map<ERArrXXf> rois_out_mat(
+      rois_out->template mutable_data<float>(), rois.rows(), rois.cols());
   rois_out_mat = rois;
 
   // Create new roi blobs for each FPN level
@@ -207,9 +207,10 @@ bool CollectAndDistributeFpnRpnProposalsOp<CPUContext>::RunOnDevice() {
     // Output blob_roi_level
     auto* roi_out = Output(i + 1);
     roi_out->Resize(blob_roi_level.rows(), blob_roi_level.cols());
-    Eigen::Map<ERArrXXf> roi_out_mat(roi_out->mutable_data<float>(),
-                                     blob_roi_level.rows(),
-                                     blob_roi_level.cols());
+    Eigen::Map<ERArrXXf> roi_out_mat(
+        roi_out->template mutable_data<float>(),
+        blob_roi_level.rows(),
+        blob_roi_level.cols());
     roi_out_mat = blob_roi_level;
 
     // Append indices from idx_lvl to rois_idx_restore
@@ -219,8 +220,9 @@ bool CollectAndDistributeFpnRpnProposalsOp<CPUContext>::RunOnDevice() {
   utils::ArgSort(rois_idx_restore);
   auto* rois_idx_restore_out = Output(OutputSize() - 1);
   rois_idx_restore_out->Resize(rois_idx_restore.size());
-  Eigen::Map<EArrXi> rois_idx_restore_out_mat(rois_idx_restore_out->mutable_data<int>(),
-                                              rois_idx_restore.size());
+  Eigen::Map<EArrXi> rois_idx_restore_out_mat(
+      rois_idx_restore_out->template mutable_data<int>(),
+      rois_idx_restore.size());
   rois_idx_restore_out_mat = rois_idx_restore;
 
   return true;
diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h
index 08e87db7af33c..35c170c320e9a 100644
--- a/caffe2/operators/concat_split_op.h
+++ b/caffe2/operators/concat_split_op.h
@@ -81,8 +81,8 @@ class SplitByLengthsOp final : public Operator<Context> {
 
  protected:
   int axis_;
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{Context::GetDeviceType()};
+  Tensor inclusive_scan_length_buffer_{Context::GetDeviceType()};
   // Input: X, optionally split
   // The split tensor is stored in CPU.
 };
@@ -134,7 +134,7 @@ bool SplitOp<Context>::RunOnDevice() {
         0,
         "If you set split with an input blob, do not pass in "
         "split in the argument.");
-    auto& split_tensor = OperatorBase::Input<TensorCPU>(1);
+    auto& split_tensor = OperatorBase::Input<Tensor>(1, CPU);
     CAFFE_ENFORCE_EQ(split_tensor.size(), OutputSize());
     axis_data = split_tensor.template data<int>();
   } else if (split_.size() == 0) {
@@ -199,7 +199,7 @@ bool SplitOp<Context>::RunOnDevice() {
 template <class Context>
 bool SplitByLengthsOp<Context>::RunOnDevice() {
   auto& input = Input(0);
-  auto& length = OperatorBase::Input<TensorCPU>(1);
+  auto& length = OperatorBase::Input<Tensor>(1, CPU);
   auto length_length = length.size();
   CAFFE_ENFORCE_EQ(
       length_length % OutputSize(),
@@ -244,7 +244,7 @@ bool SplitByLengthsOp<Context>::RunOnDevice() {
 template <class Context>
 bool ConcatOp<Context>::RunOnDevice() {
   auto* output = Output(0);
-  TensorCPU* split = OperatorBase::Output<TensorCPU>(1);
+  Tensor* split = OperatorBase::Output<Tensor>(1, CPU);
   split->Resize(vector<TIndex>(1, InputSize()));
   int* axis_data = split->template mutable_data<int>();
   auto& input_zero = Input(0);
diff --git a/caffe2/operators/conditional_op.cc b/caffe2/operators/conditional_op.cc
index 9c53f3f4cb9bd..e202ea2e9881e 100644
--- a/caffe2/operators/conditional_op.cc
+++ b/caffe2/operators/conditional_op.cc
@@ -34,10 +34,10 @@ bool ConditionalOp<CPUContext>::RunOnDevice() {
   for (TIndex i = 0; i < condition.size(); i++) {
     auto* dst = outPtr + i * innerSizeBytes;
     if (condPtr[i]) {
-      context_.template CopyItems<CPUContext, CPUContext>(
+      context_.CopyItemsSameDevice(
           dataT.meta(), innerSize, ptrT + i * innerSizeBytes, dst);
     } else {
-      context_.template CopyItems<CPUContext, CPUContext>(
+      context_.CopyItemsSameDevice(
           dataF.meta(), innerSize, ptrF + i * innerSizeBytes, dst);
     }
   }
diff --git a/caffe2/operators/conv_op.h b/caffe2/operators/conv_op.h
index efdc30f161a31..019e91248b9d8 100644
--- a/caffe2/operators/conv_op.h
+++ b/caffe2/operators/conv_op.h
@@ -54,10 +54,10 @@ class ConvOp final : public ConvPoolOpBase<Context> {
       const T* bias,
       T* Y);
 
-  Tensor<Context> col_buffer_;
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> img_shape_device_;
-  Tensor<Context> col_buffer_shape_device_;
+  Tensor col_buffer_{Context::GetDeviceType()};
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor img_shape_device_{Context::GetDeviceType()};
+  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
   // Input: X, W, b
   // Output: Y
   INPUT_TAGS(INPUT, FILTER, BIAS);
@@ -83,10 +83,10 @@ class ConvGradientOp final : public ConvPoolOpBase<Context> {
   bool RunOnDeviceWithOrderNHWC() override;
 
  private:
-  Tensor<Context> col_buffer_;
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> img_shape_device_;
-  Tensor<Context> col_buffer_shape_device_;
+  Tensor col_buffer_{Context::GetDeviceType()};
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor img_shape_device_{Context::GetDeviceType()};
+  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
   bool no_bias_;
   // input: X, W, dY
   // output: dW, db, and optionally dX
diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h
index f8ad628c0ca72..9975c04cb6ece 100644
--- a/caffe2/operators/conv_op_impl.h
+++ b/caffe2/operators/conv_op_impl.h
@@ -19,9 +19,9 @@ namespace caffe2 {
 
 template <typename T, class Context>
 bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor<Context>& X = Input(INPUT);
+  const Tensor& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor<Context>* Y = Output(0);
+  Tensor* Y = Output(0);
   const int N = X.dim32(0), C = X.dim32(1);
   CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
   const int M = filter.dim32(0);
@@ -96,7 +96,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
         N, C, HxW, M, X_data, filter_data, bias_data, Y_data);
   }
 
-  auto f = [&](Tensor<Context>* col_buffer) {
+  auto f = [&](Tensor* col_buffer) {
     col_buffer->Resize(buffer_shape);
     T* col_buffer_data = col_buffer->template mutable_data<T>();
     // Im2Col, followed by gemm.
@@ -180,9 +180,9 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
 // The implementations.
 template <typename T, class Context>
 bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
-  const Tensor<Context>& X = Input(INPUT);
+  const Tensor& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor<Context>* Y = Output(0);
+  Tensor* Y = Output(0);
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
 
   CAFFE_ENFORCE_EQ(
@@ -233,7 +233,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
     ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
         output_image_size, &bias_multiplier_);
   }
-  auto f = [&](Tensor<Context>* col_buffer) {
+  auto f = [&](Tensor* col_buffer) {
     col_buffer->Resize(
         vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
     T* col_buffer_data = col_buffer->template mutable_data<T>();
diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc
index b0ad152a56f9c..b9f54b6d55be7 100644
--- a/caffe2/operators/conv_op_shared.cc
+++ b/caffe2/operators/conv_op_shared.cc
@@ -19,16 +19,16 @@ void createSharedBuffer<CPUContext>(Workspace* ws) {
 }
 
 template <>
-void runWithSharedBuffer(
+void runWithSharedBuffer<CPUContext>(
     Workspace* ws,
-    std::function<void(Tensor<CPUContext>* buffer)> f) {
+    std::function<void(Tensor* buffer)> f) {
   auto* mutexBlob = ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU_MUTEX__");
   CAFFE_ENFORCE(mutexBlob, "Must call createSharedBuffer() first");
 
   auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
   std::lock_guard<std::mutex> g(**mutexPtr);
   auto* buffer =
-      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutable<TensorCPU>();
+      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutableTensor(CPU);
   f(buffer);
 }
 }
diff --git a/caffe2/operators/conv_op_shared.h b/caffe2/operators/conv_op_shared.h
index 18f7aa898a6b5..34ccee5fa2c45 100644
--- a/caffe2/operators/conv_op_shared.h
+++ b/caffe2/operators/conv_op_shared.h
@@ -19,9 +19,7 @@ void createSharedBuffer(Workspace* ws);
  * access to shared buffer.
  */
 template <typename Context>
-void runWithSharedBuffer(
-    Workspace* ws,
-    std::function<void(Tensor<Context>* buffer)> f);
+void runWithSharedBuffer(Workspace* ws, std::function<void(Tensor* buffer)> f);
 } // namespace caffe2
 
 #endif // CAFFE2_OPERATORS_CONV_OP_SHARED_H_
diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc
index eb5a762cbd3e3..f80d15a5d9054 100644
--- a/caffe2/operators/conv_op_shared_gpu.cc
+++ b/caffe2/operators/conv_op_shared_gpu.cc
@@ -12,16 +12,16 @@ void createSharedBuffer<CUDAContext>(Workspace* ws) {
 }
 
 template <>
-void runWithSharedBuffer(
+void runWithSharedBuffer<CUDAContext>(
     Workspace* ws,
-    std::function<void(Tensor<CUDAContext>* buffer)> f) {
+    std::function<void(Tensor* buffer)> f) {
   auto* mutexBlob = ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA_MUTEX__");
   CAFFE_ENFORCE(mutexBlob, "Must call createSharedBuffer() first");
 
   auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
   std::lock_guard<std::mutex> g(**mutexPtr);
-  auto* buffer = ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")
-                     ->GetMutable<TensorCUDA>();
+  auto* buffer =
+      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")->GetMutableTensor(CUDA);
   f(buffer);
 }
 }
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index 723304994c4e9..0cea8f9645a71 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -168,7 +168,7 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
   // Returns the input image dimensions for the current storage order type.
-  vector<int> GetDims(const Tensor<Context>& input) {
+  vector<int> GetDims(const Tensor& input) {
     vector<int> dims;
     switch (order_) {
       case StorageOrder::NCHW:
@@ -184,7 +184,7 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
   // Returns the size of the input image for the current storage type.
-  int GetDimsSize(const Tensor<Context>& input) {
+  int GetDimsSize(const Tensor& input) {
     int size = 0;
     switch (order_) {
       case StorageOrder::NCHW:
@@ -214,12 +214,8 @@ class ConvPoolOpBase : public Operator<Context> {
   // Note(jiayq): the templatization of this function is mainly to help
   // implementations that do not use first-class Tensor objects, such as the
   // MKL operator. One can still call this function with dummy
-  // Tensor<CPUContext> objects in order to obtain the sizes.
-  template <typename AlternativeContext>
-  void SetOutputSize(
-      const Tensor<AlternativeContext>& input,
-      Tensor<AlternativeContext>* output,
-      int output_channel) {
+  // Tensor objects in order to obtain the sizes.
+  void SetOutputSize(const Tensor& input, Tensor* output, int output_channel) {
     CAFFE_ENFORCE(input.size() > 0);
     vector<int> output_dims;
     int N = input.dim32(0);
@@ -335,7 +331,7 @@ class ConvPoolOpBase : public Operator<Context> {
         stride_.cbegin(), stride_.cend(), [](const int x) { return x > 1; });
   }
 
-  void SetDeviceTensor(const std::vector<int>& data, Tensor<Context>* tensor) {
+  void SetDeviceTensor(const std::vector<int>& data, Tensor* tensor) {
     bool reset_tensor_device_ = false;
 
     if (tensor->size() != data.size()) {
@@ -358,7 +354,7 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
   template <typename T>
-  void SetBiasMultiplier(const int size, Tensor<Context>* bias_multiplier_) {
+  void SetBiasMultiplier(const int size, Tensor* bias_multiplier_) {
     if (bias_multiplier_->size() != size) {
       // If the helper bias multiplier is not image size, reshape and fill it
       // with one.
@@ -735,9 +731,9 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
  private:
-  inline void AllocateAndCopy(const vector<int>& vec, Tensor<Context>& tensor) {
+  inline void AllocateAndCopy(const vector<int>& vec, Tensor& tensor) {
     tensor.Resize(vec.size());
-    context_.template Copy<int, CPUContext, Context>(
+    context_.template CopyFromCPU<int>(
         vec.size(), vec.data(), tensor.template mutable_data<int>());
   }
 
diff --git a/caffe2/operators/conv_transpose_op.h b/caffe2/operators/conv_transpose_op.h
index 6dcdbb81b1ced..8204bb02befe4 100644
--- a/caffe2/operators/conv_transpose_op.h
+++ b/caffe2/operators/conv_transpose_op.h
@@ -18,8 +18,8 @@ class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
   bool RunOnDeviceWithOrderNHWC() override;
 
  private:
-  Tensor<Context> col_buffer_;
-  Tensor<Context> bias_multiplier_;
+  Tensor col_buffer_{Context::GetDeviceType()};
+  Tensor bias_multiplier_{Context::GetDeviceType()};
   // Input: X, W, b
   // Output: Y
   INPUT_TAGS(INPUT, FILTER, BIAS);
@@ -41,8 +41,8 @@ class ConvTransposeGradientOp final : public ConvTransposeUnpoolBase<Context> {
   bool RunOnDeviceWithOrderNHWC() override;
 
  private:
-  Tensor<Context> col_buffer_;
-  Tensor<Context> bias_multiplier_;
+  Tensor col_buffer_{Context::GetDeviceType()};
+  Tensor bias_multiplier_{Context::GetDeviceType()};
   const bool no_bias_;
   // input: X, W, dY
   // output: dW, optionally db and dX
diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h
index 808433939c785..23def95ea9bd1 100644
--- a/caffe2/operators/conv_transpose_op_impl.h
+++ b/caffe2/operators/conv_transpose_op_impl.h
@@ -17,9 +17,9 @@ namespace caffe2 {
 
 template <typename T, class Context>
 bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor<Context>& X = Input(INPUT);
+  const Tensor& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor<Context>* Y = Output(0);
+  Tensor* Y = Output(0);
   const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -59,7 +59,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
   const T* filter_data = filter.template data<T>();
   T* Ydata = Y->template mutable_data<T>();
 
-  auto f = [&](Tensor<Context>* col_buffer) {
+  auto f = [&](Tensor* col_buffer) {
     col_buffer->Resize(
         vector<TIndex>{C, this->kernel_h(), this->kernel_w(), H, W});
     T* col_buffer_data = col_buffer->template mutable_data<T>();
@@ -139,9 +139,9 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
 
 template <typename T, class Context>
 bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
-  const Tensor<Context>& X = Input(INPUT);
+  const Tensor& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor<Context>* Y = Output(0);
+  Tensor* Y = Output(0);
   const auto N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -180,7 +180,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
   const T* filter_data = filter.template data<T>();
   T* Ydata = Y->template mutable_data<T>();
 
-  auto f = [&](Tensor<Context>* /*col_buffer*/) {
+  auto f = [&](Tensor* /*col_buffer*/) {
     col_buffer_.Resize(
         vector<TIndex>{H, W, this->kernel_h(), this->kernel_w(), C});
     T* col_buffer_data = col_buffer_.template mutable_data<T>();
diff --git a/caffe2/operators/conv_transpose_op_mobile.h b/caffe2/operators/conv_transpose_op_mobile.h
index ddfe365e678dc..568dd34c1c844 100644
--- a/caffe2/operators/conv_transpose_op_mobile.h
+++ b/caffe2/operators/conv_transpose_op_mobile.h
@@ -35,7 +35,7 @@ class ConvTransposeMobileOp final : public ConvTransposeUnpoolBase<Context> {
  private:
   // We store a numThreasds per-worker  tiles of Y, and numThreads per-worker threadBuffer for the
   // gemm output, laid out in that order.
-  TensorCPU threadBuffer_;
+  Tensor threadBuffer_{CPU};
 
   // Input: X, W, b
   // Output: Y
diff --git a/caffe2/operators/conv_transpose_op_mobile_impl.h b/caffe2/operators/conv_transpose_op_mobile_impl.h
index d434ec49e3e5b..c724c907f13c8 100644
--- a/caffe2/operators/conv_transpose_op_mobile_impl.h
+++ b/caffe2/operators/conv_transpose_op_mobile_impl.h
@@ -529,9 +529,9 @@ void sumInto(float* acc, std::vector<float*>& toSum, size_t size) {
 
 template <typename T, class Context>
 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor<Context>& X = Input(INPUT);
+  const Tensor& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor<Context>* Y = Output(0);
+  Tensor* Y = Output(0);
   const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -606,7 +606,7 @@ bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
         &context_);
   };
 
-  auto f = [&](Tensor<Context>* threadBuffer) {
+  auto f = [&](Tensor* threadBuffer) {
     threadBuffer->Resize(
         numThreads * threadYBufferSizeAligned +
         numThreads * threadColBufferSize);
diff --git a/caffe2/operators/conv_transpose_op_mobile_test.cc b/caffe2/operators/conv_transpose_op_mobile_test.cc
index b9282e767d060..da443928a9745 100644
--- a/caffe2/operators/conv_transpose_op_mobile_test.cc
+++ b/caffe2/operators/conv_transpose_op_mobile_test.cc
@@ -17,11 +17,10 @@ void AddConstInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
-  math::Set<float, CPUContext>(tensor->size(), value,
-                               tensor->mutable_data<float>(),
-                               &context);
+  math::Set<float, CPUContext>(
+      tensor->size(), value, tensor->template mutable_data<float>(), &context);
 }
 
 void AddNoiseInput(const vector<TIndex>& shape,
@@ -30,14 +29,15 @@ void AddNoiseInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
-    tensor->size(),
-    0.0f, 10.0f,
-    tensor->mutable_data<float>(),
-    &context);
+      tensor->size(),
+      0.0f,
+      10.0f,
+      tensor->template mutable_data<float>(),
+      &context);
 }
 
 inline float relativeError(float a, float b) {
diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h
index bf2708d22733e..e3e253e150c46 100644
--- a/caffe2/operators/conv_transpose_unpool_op_base.h
+++ b/caffe2/operators/conv_transpose_unpool_op_base.h
@@ -131,10 +131,7 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
     }
   }
   // Sets the output size. The output channel is manually specified.
-  void SetOutputSize(
-      const Tensor<Context>& input,
-      Tensor<Context>* output,
-      int output_channel) {
+  void SetOutputSize(const Tensor& input, Tensor* output, int output_channel) {
     CAFFE_ENFORCE(4 == input.ndim());
     CAFFE_ENFORCE(input.size() > 0);
     int N = input.dim32(0);
diff --git a/caffe2/operators/cosine_embedding_criterion_op.cc b/caffe2/operators/cosine_embedding_criterion_op.cc
index 26f477fe62cd0..b99f3f33572ea 100644
--- a/caffe2/operators/cosine_embedding_criterion_op.cc
+++ b/caffe2/operators/cosine_embedding_criterion_op.cc
@@ -18,7 +18,7 @@ bool CosineEmbeddingCriterionOp<CPUContext>::RunOnDevice() {
 
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output_data = output->mutable_data<float>();
+  float* output_data = output->template mutable_data<float>();
   for (int i = 0; i < S.size(); ++i) {
     output_data[i] =
         Ydata[i] == 1 ? (1.f - Sdata[i]) : std::max(0.f, Sdata[i] - margin_);
@@ -38,7 +38,7 @@ bool CosineEmbeddingCriterionGradientOp<CPUContext>::RunOnDevice() {
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
   const float* dOutput_data = dOutput.data<float>();
-  float* dSdata = dS->mutable_data<float>();
+  float* dSdata = dS->template mutable_data<float>();
   for (int i = 0; i < S.size(); ++i) {
     dSdata[i] = dOutput_data[i] *
         (Ydata[i] == 1 ? -1.f : static_cast<float>(Sdata[i] >= margin_));
diff --git a/caffe2/operators/cosine_embedding_criterion_op.cu b/caffe2/operators/cosine_embedding_criterion_op.cu
index 69a37ff329445..e720f95efc683 100644
--- a/caffe2/operators/cosine_embedding_criterion_op.cu
+++ b/caffe2/operators/cosine_embedding_criterion_op.cu
@@ -33,8 +33,8 @@ bool CosineEmbeddingCriterionOp<CUDAContext>::RunOnDevice() {
 
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output_data = output->mutable_data<float>();
- 
+  float* output_data = output->template mutable_data<float>();
+
   CECKernel<<<CAFFE_GET_BLOCKS(S.size()), CAFFE_CUDA_NUM_THREADS,
               0, context_.cuda_stream()>>>(
       S.size(), Sdata, Ydata, margin_, output_data);
@@ -53,7 +53,7 @@ bool CosineEmbeddingCriterionGradientOp<CUDAContext>::RunOnDevice() {
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
   const float* dOutput_data = dOutput.data<float>();
-  float* dSdata = dS->mutable_data<float>();
+  float* dSdata = dS->template mutable_data<float>();
   CECGradientKernel<<<CAFFE_GET_BLOCKS(S.size()), CAFFE_CUDA_NUM_THREADS,
                       0, context_.cuda_stream()>>>(
       S.size(), Sdata, Ydata, dOutput_data, margin_, dSdata);
diff --git a/caffe2/operators/counter_ops.h b/caffe2/operators/counter_ops.h
index 3b7bf7dd82711..cf58b7cd351b2 100644
--- a/caffe2/operators/counter_ops.h
+++ b/caffe2/operators/counter_ops.h
@@ -75,7 +75,7 @@ class ResetCounterOp final : public Operator<Context> {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
     auto previous = counterPtr->reset(init_count_);
     if (OutputSize() == 1) {
-      auto* output = OperatorBase::Output<TensorCPU>(0);
+      auto* output = Output(0);
       output->Resize();
       *output->template mutable_data<T>() = previous;
     }
@@ -96,7 +96,7 @@ class CountDownOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = OperatorBase::Output<TensorCPU>(0);
+    auto* output = Output(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<bool>() = counterPtr->countDown();
     return true;
@@ -113,7 +113,7 @@ class CheckCounterDoneOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = OperatorBase::Output<TensorCPU>(0);
+    auto* output = Output(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<bool>() = counterPtr->checkIfDone();
     return true;
@@ -130,7 +130,7 @@ class CountUpOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = OperatorBase::Output<TensorCPU>(0);
+    auto* output = Output(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<T>() = counterPtr->countUp();
     return true;
@@ -147,7 +147,7 @@ class RetrieveCountOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = OperatorBase::Output<TensorCPU>(0);
+    auto* output = Output(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<T>() = counterPtr->retrieve();
     return true;
diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc
index c288eb7be69d8..584b7abd5a183 100644
--- a/caffe2/operators/cross_entropy_op.cc
+++ b/caffe2/operators/cross_entropy_op.cc
@@ -56,7 +56,7 @@ bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(N);
   const auto* Xdata = X.data<float>();
   const auto* labelData = label.data<int>();
-  auto* Ydata = Y->mutable_data<float>();
+  auto* Ydata = Y->template mutable_data<float>();
   CAFFE_ENFORCE(
       (ConstEigenVectorArrayMap<int>(labelData, N) < D).all() &&
           (ConstEigenVectorArrayMap<int>(labelData, N) >= 0).all(),
@@ -85,7 +85,7 @@ bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -123,7 +123,7 @@ bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -167,7 +167,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -201,7 +201,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -241,12 +241,12 @@ bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(dY.ndim(), 1);
   CAFFE_ENFORCE_EQ(dY.dim32(0), N);
   dX->ResizeLike(X);
-  math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data<float>(),
-                               &context_);
+  math::Set<float, CPUContext>(
+      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
   const float* Xdata = X.data<float>();
   const float* dYdata = dY.data<float>();
   const int* labelData = label.data<int>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   for (int i = 0; i < N; ++i) {
     dXdata[i * D + labelData[i]] =
         - dYdata[i] / std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD());
@@ -263,7 +263,7 @@ bool MakeTwoClassOp<float, CPUContext>::RunOnDevice() {
   TIndex N = X.size();
   Y->Resize(shape);
   const auto* Xdata = X.data<float>();
-  auto* Ydata = Y->mutable_data<float>();
+  auto* Ydata = Y->template mutable_data<float>();
   for (TIndex i = 0; i < N; ++i) {
     DCHECK_GE(Xdata[i], 0.0);
     DCHECK_LE(Xdata[i], 1.0);
@@ -283,7 +283,7 @@ bool MakeTwoClassGradientOp<float, CPUContext>::RunOnDevice() {
   shape.pop_back();
   dX->Resize(shape);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   TIndex N = dX->size();
   // use eigen?
   for (TIndex i = 0; i < N; ++i) {
@@ -311,7 +311,7 @@ bool CrossEntropyOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(vector<TIndex>{N});
   const float* Xdata = X.data<float>();
   const float* labelData = label.data<float>();
-  auto* Ydata = Y->mutable_data<float>();
+  auto* Ydata = Y->template mutable_data<float>();
   CAFFE_ENFORCE(
       (ConstEigenArrayMap<float>(labelData, D, N) <= 1.0f).all() &&
           (ConstEigenArrayMap<float>(labelData, D, N) >= 0.0f).all(),
@@ -350,11 +350,11 @@ bool CrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(dY.dim32(0), N);
   dX->ResizeLike(X);
   math::Set<float, CPUContext>(
-    dX->size(), 0.f, dX->mutable_data<float>(), &context_);
+      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
   const float* Xdata = X.data<float>();
   const float* dYdata = dY.data<float>();
   const float* labelData = label.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   EigenArrayMap<float>(dXdata, D, N) =
       (ConstEigenArrayMap<float>(labelData, D, N) /
        ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()))
diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu
index 70bfbe4e9e3bc..cab3c9692a42f 100644
--- a/caffe2/operators/cross_entropy_op.cu
+++ b/caffe2/operators/cross_entropy_op.cu
@@ -43,10 +43,17 @@ bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
       (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
   CAFFE_ENFORCE_EQ(label.dim32(0), N);
   Y->Resize(vector<TIndex>(size_t(1), N));
-  LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
-                            0, context_.cuda_stream()>>>(
-      N, D, X.data<float>(), label.data<int>(), kLOG_THRESHOLD(),
-      Y->mutable_data<float>());
+  LabelCrossEntropyKernel<<<
+      CAFFE_GET_BLOCKS(N),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N,
+      D,
+      X.data<float>(),
+      label.data<int>(),
+      kLOG_THRESHOLD(),
+      Y->template mutable_data<float>());
   return true;
 }
 
@@ -71,11 +78,19 @@ bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(dY.dim32(0), N);
   dX->ResizeLike(X);
   math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
-  LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
-                                    0, context_.cuda_stream()>>>(
-      N, D, X.data<float>(), label.data<int>(), dY.data<float>(),
-      kLOG_THRESHOLD(), dX->mutable_data<float>());
+      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
+  LabelCrossEntropyGradientKernel<<<
+      CAFFE_GET_BLOCKS(N),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N,
+      D,
+      X.data<float>(),
+      label.data<int>(),
+      dY.data<float>(),
+      kLOG_THRESHOLD(),
+      dX->template mutable_data<float>());
   return true;
 }
 
@@ -104,9 +119,12 @@ bool MakeTwoClassOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_LT(X.size(), std::numeric_limits<int>::max() / 2);
   Y->Resize(shape);
   int N = X.size();
-  MakeTwoClassKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
-                       0, context_.cuda_stream()>>>(
-      N, X.data<float>(), Y->mutable_data<float>());
+  MakeTwoClassKernel<<<
+      CAFFE_GET_BLOCKS(N),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N, X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
@@ -121,9 +139,12 @@ bool MakeTwoClassGradientOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_LT(dY.size(), std::numeric_limits<int>::max());
   dX->Resize(shape);
   int N = dX->size();
-  MakeTwoClassGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
-                               0, context_.cuda_stream()>>>(
-      N, dY.data<float>(), dX->mutable_data<float>());
+  MakeTwoClassGradientKernel<<<
+      CAFFE_GET_BLOCKS(N),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N, dY.data<float>(), dX->template mutable_data<float>());
   return true;
 }
 
@@ -234,7 +255,7 @@ bool SigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::RunOnDevice() {
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -272,7 +293,7 @@ bool SigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>::
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -356,7 +377,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -386,7 +407,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>::
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->mutable_data<float>();
+  auto* out_ptr = out->template mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
diff --git a/caffe2/operators/ctc_beam_search_decoder_op.cc b/caffe2/operators/ctc_beam_search_decoder_op.cc
index 9dd426978b257..e299950e9d946 100644
--- a/caffe2/operators/ctc_beam_search_decoder_op.cc
+++ b/caffe2/operators/ctc_beam_search_decoder_op.cc
@@ -4,8 +4,7 @@ namespace caffe2 {
 
 namespace {
 
-template <class Context>
-const float* getTensorDataPtr(const Tensor<Context>& tensor, int t, int n) {
+const float* getTensorDataPtr(const Tensor& tensor, int t, int n) {
   const auto& dims = tensor.dims();
   CAFFE_ENFORCE_EQ(dims.size(), 3);
   int offset = (t * dims[1] + n) * dims[2];
diff --git a/caffe2/operators/ctc_greedy_decoder_op.cc b/caffe2/operators/ctc_greedy_decoder_op.cc
index 1a9c415aac74b..8a5e0932defd6 100644
--- a/caffe2/operators/ctc_greedy_decoder_op.cc
+++ b/caffe2/operators/ctc_greedy_decoder_op.cc
@@ -4,8 +4,7 @@ namespace caffe2 {
 
 namespace {
 
-template <class Context>
-const float* getTensorDataPtr(const Tensor<Context>& tensor, int t, int n) {
+const float* getTensorDataPtr(const Tensor& tensor, int t, int n) {
   const auto& dims = tensor.dims();
   CAFFE_ENFORCE_EQ(dims.size(), 3);
   int offset = (t * dims[1] + n) * dims[2];
@@ -34,7 +33,7 @@ bool CTCGreedyDecoderOp<CPUContext>::RunOnDevice() {
 
   vector<int> values_cach;
   output_len->Resize(vector<TIndex>{batch_size});
-  int* output_len_data = output_len->mutable_data<int>();
+  int* output_len_data = output_len->template mutable_data<int>();
 
   for (int32_t i = 0; i < batch_size; ++i) {
     int previous_label = 0, t_dec = 0;
diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc
index e7291476e6076..cb7e108748a6d 100644
--- a/caffe2/operators/dataset_ops.cc
+++ b/caffe2/operators/dataset_ops.cc
@@ -12,7 +12,7 @@
 namespace caffe2 {
 
 CAFFE_KNOWN_TYPE(std::unique_ptr<dataset_ops::TreeCursor>);
-CAFFE_KNOWN_TYPE(dataset_ops::TensorVectorPtr<CPUContext>);
+CAFFE_KNOWN_TYPE(dataset_ops::TensorVectorPtr);
 CAFFE_KNOWN_TYPE(dataset_ops::SharedTensorVectorPtr);
 
 namespace dataset_ops {
@@ -215,7 +215,7 @@ class GetCursorOffsetOp : public Operator<CPUContext> {
   bool RunOnDevice() override {
     auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
     Output(0)->Resize(cursor->offsets.size());
-    auto* output = Output(0)->mutable_data<int>();
+    auto* output = Output(0)->template mutable_data<int>();
     for (size_t i = 0; i < cursor->offsets.size(); ++i) {
       output[i] = cursor->offsets[i];
     }
@@ -314,16 +314,16 @@ class PackRecordsOp : public Operator<CPUContext> {
     Output(0)->Resize(walker.size());
 
     // Output(0)->raw_mutable_data(TypeMeta::Make<SharedTensorVectorPtr>()));
-    auto* dst = Output(0)->mutable_data<SharedTensorVectorPtr>();
+    auto* dst = Output(0)->template mutable_data<SharedTensorVectorPtr>();
 
     for (int batchId = 0; batchId < walker.size(); ++batchId) {
       dst[batchId] = std::make_shared<std::vector<TensorCPU>>();
       dst[batchId]->reserve(walker.fields().size());
 
       for (const auto& field : walker.fields()) {
-        dst[batchId]->emplace_back(field.dim());
+        dst[batchId]->emplace_back(field.dim(), CPU);
         auto& tensor = dst[batchId]->back();
-        context_.template CopyItems<CPUContext, CPUContext>(
+        context_.CopyItemsSameDevice(
             field.meta(),
             tensor.size(),
             field.ptr() /* src */,
@@ -397,7 +397,7 @@ class UnPackRecordsOp : public Operator<CPUContext> {
       for (int j = 0; j < numTensors; ++j) {
         const auto& input = inputs[i]->at(j);
 
-        context_.CopyItems<CPUContext, CPUContext>(
+        context_.CopyItemsSameDevice(
             *metas[j],
             input.size(),
             input.raw_data() /* src */,
@@ -518,8 +518,7 @@ class ReadNextBatchOp : public Operator<CPUContext> {
       if (out->size() == 0) {
         continue;
       }
-      context_.template CopyItems<CPUContext, CPUContext>(
-          in.meta(), out->size(), src, dst);
+      context_.CopyItemsSameDevice(in.meta(), out->size(), src, dst);
     }
     return true;
   }
@@ -560,7 +559,7 @@ class ComputeOffsetOp : public Operator<CPUContext> {
           std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
     }
     out->Resize(limits.at(0) + 1, sizes.size());
-    auto* out_data = out->mutable_data<int64_t>();
+    auto* out_data = out->template mutable_data<int64_t>();
     for (int k = 0; k <= limits.at(0); k++) {
       // advance cursor
       if (cursor->offsets.empty()) {
@@ -609,7 +608,7 @@ class SortAndShuffleOp : public Operator<CPUContext> {
     int num_batch = size / batch_size_;
     auto* out = Output(0);
     out->Resize(size);
-    auto* out_data = out->mutable_data<int64_t>();
+    auto* out_data = out->template mutable_data<int64_t>();
 
     vector<int> shuffle_idx(size);
     iota(shuffle_idx.begin(), shuffle_idx.end(), 0);
@@ -739,7 +738,7 @@ class ReadRandomBatchOp : public Operator<CPUContext> {
         auto size = *(offsetptr + offsetdim[1]) - offset;
         // copy data
         auto src = src_base + offset * block_bytesize;
-        context_.template CopyItems<CPUContext, CPUContext>(
+        context_.CopyItemsSameDevice(
             in.meta(), size * block_size, src, dst + start * block_bytesize);
         start += size;
         idx++;
@@ -779,8 +778,7 @@ class AppendOp final : public Operator<Context> {
     auto oldSize = c->size();
     c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
     auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
-    context_.template CopyItems<Context, Context>(
-        b.meta(), b.size(), b.raw_data(), dst);
+    context_.CopyItemsSameDevice(b.meta(), b.size(), b.raw_data(), dst);
     return true;
   }
 };
@@ -830,8 +828,7 @@ class AtomicAppendOp final : public Operator<Context> {
       auto oldSize = c->size();
       c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
       auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
-      context_.template CopyItems<Context, Context>(
-          b.meta(), b.size(), b.raw_data(), dst);
+      context_.CopyItemsSameDevice(b.meta(), b.size(), b.raw_data(), dst);
     }
     return true;
   }
@@ -844,9 +841,8 @@ class CreateTensorVectorOp final : public Operator<Context> {
   using Operator<Context>::Operator;
 
   bool RunOnDevice() override {
-    auto ptr = make_unique<std::vector<Tensor<Context>>>();
-    *OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR) =
-        std::move(ptr);
+    auto ptr = make_unique<std::vector<Tensor>>();
+    *OperatorBase::Output<TensorVectorPtr>(TENSOR_VECTOR) = std::move(ptr);
     return true;
   }
 
@@ -861,8 +857,7 @@ class TensorVectorSizeOp final : public Operator<Context> {
   USE_SIMPLE_CTOR_DTOR(TensorVectorSizeOp);
 
   bool RunOnDevice() override {
-    auto& vector_ptr =
-        OperatorBase::Input<TensorVectorPtr<Context>>(TENSOR_VECTOR);
+    auto& vector_ptr = OperatorBase::Input<TensorVectorPtr>(TENSOR_VECTOR);
     auto* size = Output(SIZE);
     size->Resize();
     // 32-bit should be enough here
@@ -882,8 +877,8 @@ class ConcatTensorVectorOp final : public Operator<Context> {
   using Operator<Context>::Operator;
 
   bool RunOnDevice() override {
-    const TensorVectorPtr<Context>& tensorVector =
-        OperatorBase::Input<TensorVectorPtr<Context>>(TENSOR_VECTOR);
+    const TensorVectorPtr& tensorVector =
+        OperatorBase::Input<TensorVectorPtr>(TENSOR_VECTOR);
 
     auto* tensor = Output(TENSOR);
     CAFFE_ENFORCE(!tensorVector->empty());
@@ -904,7 +899,7 @@ class ConcatTensorVectorOp final : public Operator<Context> {
     auto* dst = (char*)tensor->raw_mutable_data(tensorVector->at(0).meta());
 
     for (const auto& t : *tensorVector) {
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           t.meta(), t.size(), t.raw_data(), dst + offset);
       offset += t.nbytes();
     }
@@ -947,8 +942,7 @@ class CollectTensorOp final : public Operator<Context> {
 
     for (int i = 0; i < OutputSize(); ++i) {
       // TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
-      TensorVectorPtr<Context>& tensorVector =
-          *OperatorBase::Output<TensorVectorPtr<Context>>(i);
+      TensorVectorPtr& tensorVector = *OperatorBase::Output<TensorVectorPtr>(i);
 
       if (numVisited_ >= numToCollect_) {
         CAFFE_ENFORCE(
@@ -966,13 +960,11 @@ class CollectTensorOp final : public Operator<Context> {
         CAFFE_ENFORCE(numVisited_ >= numToCollect_);
       } else if (pos >= tensorVector->size()) {
         // append
-        tensorVector->push_back(Tensor<Context>());
-        tensorVector->back().template CopyFrom<Context, Context>(
-            tensor, &context_);
+        tensorVector->emplace_back(Context::GetDeviceType());
+        tensorVector->back().CopyFrom(tensor, &context_);
       } else {
         // replace
-        tensorVector->at(pos).template CopyFrom<Context, Context>(
-            tensor, &context_);
+        tensorVector->at(pos).CopyFrom(tensor, &context_);
       }
     }
 
@@ -1436,13 +1428,13 @@ class TreeCursorSerializer : public BlobSerializerBase {
     // serialize offsets as a tensor
     if (cursor->offsets.size() > 0) {
       Blob offsets_blob;
-      auto* offsets = offsets_blob.template GetMutable<Tensor<CPUContext>>();
+      auto* offsets = offsets_blob.GetMutableTensor(CPU);
       offsets->Resize(cursor->offsets.size());
       std::copy(
           cursor->offsets.begin(),
           cursor->offsets.end(),
-          offsets->mutable_data<TOffset>());
-      TensorSerializer<CPUContext> ser;
+          offsets->template mutable_data<TOffset>());
+      TensorSerializer ser;
       ser.Serialize(
           *offsets, name, blob_proto.mutable_tensor(), 0, offsets->size());
     }
@@ -1464,10 +1456,10 @@ class TreeCursorDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override {
     // deserialize the offsets
-    TensorDeserializer<CPUContext> deser;
+    TensorDeserializer deser;
     Blob offset_blob;
     deser.Deserialize(proto, &offset_blob);
-    auto& offsets = offset_blob.template Get<Tensor<CPUContext>>();
+    auto& offsets = offset_blob.template Get<Tensor>();
     auto* offsets_ptr = offsets.data<TOffset>();
 
     // deserialize the field names
diff --git a/caffe2/operators/dataset_ops.h b/caffe2/operators/dataset_ops.h
index 161a82b0d101f..809e570ba3c00 100644
--- a/caffe2/operators/dataset_ops.h
+++ b/caffe2/operators/dataset_ops.h
@@ -191,8 +191,7 @@ class TreeWalker {
 
 using SharedTensorVectorPtr = std::shared_ptr<std::vector<TensorCPU>>;
 
-template <class Context>
-using TensorVectorPtr = std::unique_ptr<std::vector<Tensor<Context>>>;
+using TensorVectorPtr = std::unique_ptr<std::vector<Tensor>>;
 
 class SharedTensorVectorPtrSerializer : public BlobSerializerBase {
  public:
diff --git a/caffe2/operators/deform_conv_op.h b/caffe2/operators/deform_conv_op.h
index 56b4d3228b1db..a0e4882abdbce 100644
--- a/caffe2/operators/deform_conv_op.h
+++ b/caffe2/operators/deform_conv_op.h
@@ -70,10 +70,10 @@ class DeformConvOp final : public DeformConvOpBase<T, Context> {
   bool RunOnDeviceWithOrderNCHW() override;
 
  private:
-  Tensor<Context> col_buffer_;
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> img_shape_device_;
-  Tensor<Context> col_buffer_shape_device_;
+  Tensor col_buffer_{Context::GetDeviceType()};
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor img_shape_device_{Context::GetDeviceType()};
+  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
   // Input: X, o, W, b
   // Output: Y
   INPUT_TAGS(INPUT, OFFSET, FILTER, BIAS);
@@ -96,10 +96,10 @@ class DeformConvGradientOp final : public DeformConvOpBase<T, Context> {
   bool RunOnDeviceWithOrderNCHW() override;
 
  private:
-  Tensor<Context> col_buffer_;
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> img_shape_device_;
-  Tensor<Context> col_buffer_shape_device_;
+  Tensor col_buffer_{Context::GetDeviceType()};
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor img_shape_device_{Context::GetDeviceType()};
+  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
   bool no_bias_;
   // input: X, W, dY
   // output: dO, dW, db, and optionally dX
diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h
index 072c156cf5bae..5d84d5905fd9a 100644
--- a/caffe2/operators/deform_conv_op_impl.h
+++ b/caffe2/operators/deform_conv_op_impl.h
@@ -14,10 +14,10 @@ namespace caffe2 {
 
 template <typename T, class Context>
 bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor<Context>& X = Input(INPUT);
-  const Tensor<Context>& offset = Input(OFFSET);
+  const Tensor& X = Input(INPUT);
+  const Tensor& offset = Input(OFFSET);
   auto& filter = Input(FILTER);
-  Tensor<Context>* Y = Output(0);
+  Tensor* Y = Output(0);
   const int N = X.dim32(0), C = X.dim32(1);
   CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
   const int M = filter.dim32(0);
@@ -133,7 +133,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
     bias_data = Input(BIAS).template data<T>();
   }
 
-  auto f = [&](Tensor<Context>* col_buffer) {
+  auto f = [&](Tensor* col_buffer) {
     col_buffer->Resize(buffer_shape);
     T* col_buffer_data = col_buffer->template mutable_data<T>();
     // Im2col, followed by gemm.
diff --git a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu
index 6868a482ff121..e514e5e2ed6fc 100644
--- a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu
+++ b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu
@@ -286,9 +286,9 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CUDAContext> {
   }
 
   bool RunOnDeviceWithOrderNCHW() override {
-    const Tensor<CUDAContext>& X = Input(0);
+    const Tensor& X = Input(0);
     auto& filter = Input(1);
-    Tensor<CUDAContext>* Y = Output(0);
+    Tensor* Y = Output(0);
     const int N = X.dim32(0), C = X.dim32(1);
     CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
     const int M = filter.dim32(0);
diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc
index 4e00cd4396726..448172a5f8699 100644
--- a/caffe2/operators/distance_op.cc
+++ b/caffe2/operators/distance_op.cc
@@ -15,7 +15,7 @@ bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
   int N = X.ndim() > 0 ? X.dim32(0) : 1;
   distance->Resize(N);
   int D = N > 0 ? X.size() / N : 0;
-  float* distance_data = distance->mutable_data<float>();
+  float* distance_data = distance->template mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   for (int i = 0; i < N; ++i) {
@@ -48,7 +48,7 @@ bool L1DistanceOp<float, CPUContext>::RunOnDevice() {
   const float* Y_data = Y.data<float>();
 
   for (int i = 0; i < N; ++i) {
-    (distance->mutable_data<float>())[i] =
+    (distance->template mutable_data<float>())[i] =
         (ConstEigenVectorMap<float>(X_data + i * D, D).array() -
          ConstEigenVectorMap<float>(Y_data + i * D, D).array())
             .abs()
@@ -86,14 +86,18 @@ bool L1DistanceGradientOp<float, CPUContext>::RunOnDevice() {
           (X.data<float>())[offset + j] - (Y.data<float>())[offset + j];
       const float kEps = 1e-12f;
       if (temp < -kEps) {
-        dX->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
-        dY->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
+        dX->template mutable_data<float>()[offset + j] =
+            -(dDistance.data<float>())[i];
+        dY->template mutable_data<float>()[offset + j] =
+            (dDistance.data<float>())[i];
       } else if (temp > kEps) {
-        dX->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
-        dY->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
+        dX->template mutable_data<float>()[offset + j] =
+            (dDistance.data<float>())[i];
+        dY->template mutable_data<float>()[offset + j] =
+            -(dDistance.data<float>())[i];
       } else {
-        dX->mutable_data<float>()[offset + j] = 0;
-        dY->mutable_data<float>()[offset + j] = 0;
+        dX->template mutable_data<float>()[offset + j] = 0;
+        dY->template mutable_data<float>()[offset + j] = 0;
       }
     }
   }
@@ -112,7 +116,7 @@ bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
   const int N = X.ndim() > 0 ? X.dim32(0) : 1;
   const int D = X.size_from_dim(1);
   result->Resize(N);
-  float* result_data = result->mutable_data<float>();
+  float* result_data = result->template mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   float X2, Y2;
@@ -308,7 +312,7 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
   D = std::min(DX, DY);
   restD = std::max(DX, DY) - D;
   result->Resize(N);
-  float* result_data = result->mutable_data<float>();
+  float* result_data = result->template mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   for (int i = 0; i < N; ++i) { // TODO: multithreading
diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu
index e1a56399a2f94..d9ecad7f329fe 100644
--- a/caffe2/operators/distance_op.cu
+++ b/caffe2/operators/distance_op.cu
@@ -55,7 +55,11 @@ bool SquaredL2DistanceOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      N, D, X.data<float>(), Y.data<float>(), distance->mutable_data<float>());
+      N,
+      D,
+      X.data<float>(),
+      Y.data<float>(),
+      distance->template mutable_data<float>());
   return true;
 }
 
@@ -97,23 +101,27 @@ bool SquaredL2DistanceGradientOp<float, CUDAContext>::RunOnDevice() {
       X.size(),
       X.data<float>(),
       Y.data<float>(),
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
       &context_);
 
-  StripedScaleKernel<float><<<
-      CAFFE_GET_BLOCKS(N * D),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N,
-      D,
-      dDistance.data<float>(),
-      dX->data<float>(),
-      dX->mutable_data<float>());
+  StripedScaleKernel<float>
+      <<<CAFFE_GET_BLOCKS(N * D),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          N,
+          D,
+          dDistance.data<float>(),
+          dX->data<float>(),
+          dX->template mutable_data<float>());
 
   // The gradient of the other side is basically the negative.
   math::Scale<float, CUDAContext>(
-      X.size(), -1, dX->data<float>(), dY->mutable_data<float>(), &context_);
+      X.size(),
+      -1,
+      dX->data<float>(),
+      dY->template mutable_data<float>(),
+      &context_);
   return true;
 }
 
@@ -162,7 +170,11 @@ bool L1DistanceOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      N, D, X.data<float>(), Y.data<float>(), distance->mutable_data<float>());
+      N,
+      D,
+      X.data<float>(),
+      Y.data<float>(),
+      distance->template mutable_data<float>());
 
   return true;
 }
@@ -228,8 +240,8 @@ bool L1DistanceGradientOp<float, CUDAContext>::RunOnDevice() {
       X.data<float>(),
       Y.data<float>(),
       dDistance.data<float>(),
-      dX->mutable_data<float>(),
-      dY->mutable_data<float>());
+      dX->template mutable_data<float>(),
+      dY->template mutable_data<float>());
 
   return true;
 }
@@ -300,7 +312,7 @@ bool CosineSimilarityOp<float, CUDAContext>::RunOnDevice() {
   const int N = X.ndim() > 0 ? X.dim32(0) : 1;
   const int D = X.size_from_dim(1);
   result->Resize(N);
-  float* result_data = result->mutable_data<float>();
+  float* result_data = result->template mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   // Auxiliary arrays, one allocation of memory
@@ -355,8 +367,8 @@ bool CosineSimilarityGradientOp<float, CUDAContext>::RunOnDevice() {
   const auto* X_data = X.data<float>();
   const auto* Y_data = Y.data<float>();
   const auto* dCos_data = dCos.data<float>();
-  auto* dX_data = dX->mutable_data<float>();
-  auto* dY_data = dY->mutable_data<float>();
+  auto* dX_data = dX->template mutable_data<float>();
+  auto* dY_data = dY->template mutable_data<float>();
 
   // one memory allocation, a few arrays
   aux_.Resize(6 * N);
@@ -454,7 +466,11 @@ bool DotProductOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      N, D, X.data<float>(), Y.data<float>(), result->mutable_data<float>());
+      N,
+      D,
+      X.data<float>(),
+      Y.data<float>(),
+      result->template mutable_data<float>());
 
   return true;
 }
@@ -510,8 +526,8 @@ bool DotProductGradientOp<float, CUDAContext>::RunOnDevice() {
       X.data<float>(),
       Y.data<float>(),
       dDot.data<float>(),
-      dX->mutable_data<float>(),
-      dY->mutable_data<float>());
+      dX->template mutable_data<float>(),
+      dY->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/distance_op.h b/caffe2/operators/distance_op.h
index aad57e955dac2..be95b58145d1a 100644
--- a/caffe2/operators/distance_op.h
+++ b/caffe2/operators/distance_op.h
@@ -157,7 +157,7 @@ class CosineSimilarityOp : public Operator<Context> {
   OUTPUT_TAGS(COS_OUT);
 
  private:
-  Tensor<Context> aux_;
+  Tensor aux_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -174,7 +174,7 @@ class CosineSimilarityGradientOp final : public Operator<Context> {
   OUTPUT_TAGS(DER_X_OUT, DER_Y_OUT);
 
  private:
-  Tensor<Context> aux_;
+  Tensor aux_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc
index bd9178e9f4ae2..be5b8224aa40c 100644
--- a/caffe2/operators/dropout_op.cc
+++ b/caffe2/operators/dropout_op.cc
@@ -9,8 +9,8 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(X.dims());
   if (is_test_) {
     if (Y != &X) {
-      context_.Copy<float, CPUContext, CPUContext>(
-          X.size(), X.data<float>(), Y->mutable_data<float>());
+      context_.CopyFromCPU<float>(
+          X.size(), X.data<float>(), Y->template mutable_data<float>());
     }
     return true;
   } else {
@@ -19,10 +19,10 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
     // generate probability depending on 1-ratio.
     std::bernoulli_distribution dist(1. - ratio_);
     const float* Xdata = X.data<float>();
-    float* Ydata = Y->mutable_data<float>();
+    float* Ydata = Y->template mutable_data<float>();
     auto mask = Output(1);
     mask->Resize(X.dims());
-    bool* mask_data = mask->mutable_data<bool>();
+    bool* mask_data = mask->template mutable_data<bool>();
     auto& gen = context_.RandGenerator();
     for (int i = 0; i < X.size(); ++i) {
       mask_data[i] = dist(gen);
@@ -39,8 +39,8 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
   dX->Resize(dY.dims());
   if (is_test_) {
     if (dX != &dY) {
-      context_.Copy<float, CPUContext, CPUContext>(
-          dY.size(), dY.data<float>(), dX->mutable_data<float>());
+      context_.CopyFromCPU<float>(
+          dY.size(), dY.data<float>(), dX->template mutable_data<float>());
     }
     return true;
   } else {
@@ -48,7 +48,7 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
     CAFFE_ENFORCE_EQ(dY.size(), mask.size());
     const float* dYdata = dY.data<float>();
     const bool* mask_data = mask.data<bool>();
-    float* dXdata = dX->mutable_data<float>();
+    float* dXdata = dX->template mutable_data<float>();
     float scale = 1. / (1. - ratio_);
     for (int i = 0; i < dY.size(); ++i) {
       dXdata[i] = dYdata[i] * mask_data[i] * scale;
@@ -144,7 +144,9 @@ mask: [[False False False  True  True]
 </details>
 
 )DOC")
-    .Arg("ratio", "*(type: float; default: 0.5)* Probability of an element to be zeroed.")
+    .Arg(
+        "ratio",
+        "*(type: float; default: 0.5)* Probability of an element to be zeroed.")
     .ArgIsTest(
         "*(type: int; default: 0)* If zero (train mode), perform dropout. If non-zero"
         "(test mode), Y = X.")
@@ -154,7 +156,7 @@ mask: [[False False False  True  True]
         1,
         "mask",
         "*(type: Tensor`<bool>`)* The output mask containing boolean values for"
-        "each element, signifying which elements are dropped out. If `is_test` is" 
+        "each element, signifying which elements are dropped out. If `is_test` is"
         "nonzero, this output is not filled.")
     .InheritOnnxSchema("Dropout");
 
diff --git a/caffe2/operators/dropout_op.cu b/caffe2/operators/dropout_op.cu
index 745840e82affc..6489ada5927ba 100644
--- a/caffe2/operators/dropout_op.cu
+++ b/caffe2/operators/dropout_op.cu
@@ -25,15 +25,15 @@ bool DropoutOp<float, CUDAContext>::RunOnDevice() {
   Y->Resize(X.dims());
   if (is_test_) {
     if (Y != &X) {
-      context_.Copy<float, CUDAContext, CUDAContext>(
-          X.size(), X.data<float>(), Y->mutable_data<float>());
+      context_.CopySameDevice<float>(
+          X.size(), X.data<float>(), Y->template mutable_data<float>());
     }
     return true;
   } else {
     // We do a simple trick here: since curand cannot generate random
     // boolean numbers, we will generate into dY and write the result to
     // mask.
-    float* Ydata = Y->mutable_data<float>();
+    float* Ydata = Y->template mutable_data<float>();
     auto* mask = Output(1);
     mask->Resize(X.dims());
     CAFFE_ENFORCE(X.data<float>() != Ydata, "In-place GPU dropout is broken");
@@ -44,7 +44,11 @@ bool DropoutOp<float, CUDAContext>::RunOnDevice() {
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
-        X.size(), ratio_, X.data<float>(), Ydata, mask->mutable_data<bool>());
+        X.size(),
+        ratio_,
+        X.data<float>(),
+        Ydata,
+        mask->template mutable_data<bool>());
     return true;
   }
 }
@@ -69,8 +73,8 @@ bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
   dX->Resize(dY.dims());
   if (is_test_) {
     if (dX != &dY) {
-      context_.Copy<float, CUDAContext, CUDAContext>(
-          dY.size(), dY.data<float>(), dX->mutable_data<float>());
+      context_.CopySameDevice<float>(
+          dY.size(), dY.data<float>(), dX->template mutable_data<float>());
     }
     return true;
   } else {
@@ -86,7 +90,7 @@ bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
         dY.data<float>(),
         mask.data<bool>(),
         scale,
-        dX->mutable_data<float>());
+        dX->template mutable_data<float>());
     return true;
   }
 }
diff --git a/caffe2/operators/dropout_op_cudnn.cc b/caffe2/operators/dropout_op_cudnn.cc
index 906c2563aa0e0..8e66f03aa6d14 100644
--- a/caffe2/operators/dropout_op_cudnn.cc
+++ b/caffe2/operators/dropout_op_cudnn.cc
@@ -141,7 +141,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
   // now actually run the computation
   if (is_test_) {
     if (Y != &X) {
-      context_.Copy<T, CUDAContext, CUDAContext>(
+      context_.CopySameDevice<T>(
           X.size(), X.template data<T>(), Y->template mutable_data<T>());
     }
     return true;
@@ -150,8 +150,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
     // Reshape tensor descriptors if necessary
     if (X.dims() != cudnn_input_dims_ && !is_test_) {
       CAFFE_ENFORCE(scratch_blob_);
-      Tensor<CUDAContext>* states =
-          scratch_blob_->GetMutable<Tensor<CUDAContext>>();
+      Tensor* states = scratch_blob_->GetMutableTensor(CUDA);
       cudnn_input_dims_ = X.dims();
       CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
           data_desc_,
@@ -172,7 +171,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
       if (!states_initialized_) {
         // set the dropout descriptor (note: need to allocate the states data
         // before acquiring the mutex)
-        uint8_t* states_data = states->mutable_data<uint8_t>();
+        uint8_t* states_data = states->template mutable_data<uint8_t>();
         {
           // Need to protect  as clashes with NCCL
           std::lock_guard<std::mutex> lk(CUDAContext::mutex());
@@ -195,7 +194,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
         X.template data<T>(),
         data_desc_,
         Y->template mutable_data<T>(),
-        mask->mutable_data<uint8_t>(),
+        mask->template mutable_data<uint8_t>(),
         reserve_space_size_in_bytes_));
   }
   return true;
@@ -219,7 +218,7 @@ template <typename T, typename M>
 bool CuDNNDropoutGradientOp::DoRunWithType() {
   const auto& dY = Input(0);
   const auto& mask = Input(1);
-  const Tensor<CUDAContext>& states = scratch_blob_->Get<Tensor<CUDAContext>>();
+  const Tensor& states = scratch_blob_->Get<Tensor>();
   auto* dX = Output(0);
 
   auto size_prod = 1;
diff --git a/caffe2/operators/elementwise_linear_op.cc b/caffe2/operators/elementwise_linear_op.cc
index e935136905fba..d68bfbc5a0eb9 100644
--- a/caffe2/operators/elementwise_linear_op.cc
+++ b/caffe2/operators/elementwise_linear_op.cc
@@ -23,7 +23,7 @@ bool ElementwiseLinearOp<float, CPUContext>::RunOnDevice(){
   const float* X_data = X.data<float>();
   const float* a_data = a.data<float>();
   const float* b_data = b.data<float>();
-  float* Y_data = Y->mutable_data<float>();
+  float* Y_data = Y->template mutable_data<float>();
 
   int p = 0;
   for (int n = 0; n < N; ++n) {
@@ -48,7 +48,7 @@ bool ElementwiseLinearGradientOp<float, CPUContext>::RunOnDevice(){
   CAFFE_ENFORCE_EQ(a.ndim(), 1, a.ndim());
   CAFFE_ENFORCE_EQ(a.dim(0), D, a.ndim());
 
-  auto *g_X = Output(0);
+  auto* g_X = Output(0);
   auto *g_a = Output(1);
   auto *g_b = Output(2);
   g_X->ResizeLike(X);
@@ -58,9 +58,9 @@ bool ElementwiseLinearGradientOp<float, CPUContext>::RunOnDevice(){
   const float* g_o_data = g_o.data<float>();
   const float* X_data = X.data<float>();
   const float* a_data = a.data<float>();
-  float* g_X_data = g_X->mutable_data<float>();
-  float* g_a_data = g_a->mutable_data<float>();
-  float* g_b_data = g_b->mutable_data<float>();
+  float* g_X_data = g_X->template mutable_data<float>();
+  float* g_a_data = g_a->template mutable_data<float>();
+  float* g_b_data = g_b->template mutable_data<float>();
 
   math::Set<float, CPUContext>(g_a->size(), 0.f, g_a_data, &context_);
   math::Set<float, CPUContext>(g_b->size(), 0.f, g_b_data, &context_);
diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu
index e4cd235eeffa3..efbf52a86a18f 100644
--- a/caffe2/operators/elementwise_linear_op.cu
+++ b/caffe2/operators/elementwise_linear_op.cu
@@ -67,10 +67,17 @@ bool ElementwiseLinearOp<float, CUDAContext>::RunOnDevice(){
 
   Y->ResizeLike(X);
 
-  ElementwiseLinearKernel<<<CAFFE_GET_BLOCKS(N * D), CAFFE_CUDA_NUM_THREADS,
-                          0, context_.cuda_stream()>>>(
-    N, D, X.data<float>(), a.data<float>(), b.data<float>(),
-    Y->mutable_data<float>());
+  ElementwiseLinearKernel<<<
+      CAFFE_GET_BLOCKS(N * D),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N,
+      D,
+      X.data<float>(),
+      a.data<float>(),
+      b.data<float>(),
+      Y->template mutable_data<float>());
   return true;
 }
 
@@ -88,15 +95,15 @@ bool ElementwiseLinearGradientOp<float, CUDAContext>::RunOnDevice(){
   CAFFE_ENFORCE_EQ(a.ndim(), 1, a.ndim());
   CAFFE_ENFORCE_EQ(a.dim(0), D, a.ndim());
 
-  auto *g_X = Output(0);
+  auto* g_X = Output(0);
   auto *g_a = Output(1);
   auto *g_b = Output(2);
   g_X->ResizeLike(X);
   g_a->ResizeLike(a);
   g_b->ResizeLike(a);
 
-  float* g_a_data = g_a->mutable_data<float>();
-  float* g_b_data = g_b->mutable_data<float>();
+  float* g_a_data = g_a->template mutable_data<float>();
+  float* g_b_data = g_b->template mutable_data<float>();
 
   ElementwiseLinearGradientKernel<<<
       D,
@@ -108,7 +115,7 @@ bool ElementwiseLinearGradientOp<float, CUDAContext>::RunOnDevice(){
       g_o.data<float>(),
       X.data<float>(),
       a.data<float>(),
-      g_X->mutable_data<float>(),
+      g_X->template mutable_data<float>(),
       g_a_data,
       g_b_data);
   return true;
diff --git a/caffe2/operators/elementwise_logical_ops.cc b/caffe2/operators/elementwise_logical_ops.cc
index 9da98a83b78ae..5ddd4570356e9 100644
--- a/caffe2/operators/elementwise_logical_ops.cc
+++ b/caffe2/operators/elementwise_logical_ops.cc
@@ -12,8 +12,8 @@ OPERATOR_SCHEMA(Where)
     .AllowInplace({{1, 2}})
     .IdenticalTypeAndShapeOfInput(1)
     .SetDoc(R"DOC(
-Operator Where takes three input data (Tensor<bool>, Tensor<T>, Tensor<T>) and
-produces one output data (Tensor<T>) where z = c ? x : y is applied elementwise.
+Operator Where takes three input data (Tensor, Tensor, Tensor) and
+produces one output data (Tensor) where z = c ? x : y is applied elementwise.
 )DOC")
     .Input(0, "C", "input tensor containing booleans")
     .Input(1, "X", "input tensor")
diff --git a/caffe2/operators/elementwise_logical_ops.h b/caffe2/operators/elementwise_logical_ops.h
index 99b84c5830397..a90e3332d861d 100644
--- a/caffe2/operators/elementwise_logical_ops.h
+++ b/caffe2/operators/elementwise_logical_ops.h
@@ -53,13 +53,13 @@ class WhereOp final : public Operator<Context> {
       for (int i = 0; i < select.size(); i++) {
         size_t offset = i * block_size;
         if (select_data[i]) {
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               output->meta(),
               block_size,
               left_data + offset,
               output_data + offset);
         } else {
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               output->meta(),
               block_size,
               right_data + offset,
diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h
index 6b3151ecc4990..9afb154d9bddc 100644
--- a/caffe2/operators/elementwise_op_test.h
+++ b/caffe2/operators/elementwise_op_test.h
@@ -19,7 +19,7 @@ void FillTensor(
     const std::vector<caffe2::TIndex>& shape,
     const std::vector<I_Type>& values) {
   auto* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<caffe2::Tensor<Context>>();
+  auto* tensor = blob->GetMutableTensor(Context::GetDeviceType());
   tensor->Resize(shape);
   auto* mutable_data = tensor->template mutable_data<O_Type>();
   const O_Type* data = reinterpret_cast<const O_Type*>(values.data());
@@ -59,8 +59,7 @@ void elementwiseAnd() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{true, false, false, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -80,8 +79,7 @@ void elementwiseAnd() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{
         true, false, false, false, true, false, false, false};
@@ -107,8 +105,7 @@ void elementwiseOr() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{true, true, true, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -128,8 +125,7 @@ void elementwiseOr() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{true, true, true, false, true, true, true, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -154,8 +150,7 @@ void elementwiseXor() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{false, true, true, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -175,8 +170,7 @@ void elementwiseXor() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{
         false, true, true, false, false, true, true, false};
@@ -201,8 +195,7 @@ void elementwiseNot() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, blob);
-  caffe2::CPUContext context;
-  caffe2::TensorCPU Y(blob->Get<caffe2::Tensor<Context>>(), &context);
+  caffe2::Tensor Y(blob->Get<caffe2::Tensor>(), caffe2::CPU);
   EXPECT_EQ(Y.size(), N);
   std::vector<bool> result{false, true};
   for (size_t i = 0; i < Y.size(); ++i) {
@@ -224,8 +217,7 @@ void elementwiseEQ() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{false, true, false, true};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -242,8 +234,7 @@ void elementwiseEQ() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{true, true, false, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -262,8 +253,7 @@ void elementwiseEQ() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::CPUContext context;
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
+    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{
         true, false, false, true, false, true, true, false};
diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu
index 1dee0d6272470..2bd91f191229f 100644
--- a/caffe2/operators/elementwise_ops.cu
+++ b/caffe2/operators/elementwise_ops.cu
@@ -88,7 +88,7 @@ void device_reduce(
     const T* d_in,
     T* d_out,
     int N,
-    Tensor<CUDAContext>* buffer,
+    Tensor* buffer,
     CUDAContext* context) {
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
@@ -114,7 +114,7 @@ void device_reduce<float16>(
     const float16* in,
     float16* out,
     int N,
-    Tensor<CUDAContext>* buffer,
+    Tensor* buffer,
     CUDAContext* context) {
 #if defined(__HIPCC__) && !ROCBLAS_FP16
   CAFFE_THROW("HIP rocblas doesn't fully support fp16 device_reduce yet.");
@@ -127,7 +127,7 @@ void device_reduce<float16>(
     math::Set<float16, CUDAContext>(
         N,
         convert::To<float, float16>(1.),
-        buffer->mutable_data<float16>(),
+        buffer->template mutable_data<float16>(),
         context);
   }
 
diff --git a/caffe2/operators/elementwise_ops.h b/caffe2/operators/elementwise_ops.h
index aec5ea458fff4..747d2fe0285c2 100644
--- a/caffe2/operators/elementwise_ops.h
+++ b/caffe2/operators/elementwise_ops.h
@@ -512,8 +512,8 @@ class SumReduceLikeOp final : public Operator<Context> {
   int axis_;
   string axis_str_;
   string order_;
-  Tensor<Context> ones_;
-  Tensor<Context> sum_buffer_;
+  Tensor ones_{Context::GetDeviceType()};
+  Tensor sum_buffer_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/elementwise_ops_utils.cc b/caffe2/operators/elementwise_ops_utils.cc
index 3d906c9a0f708..6a5b136831c48 100644
--- a/caffe2/operators/elementwise_ops_utils.cc
+++ b/caffe2/operators/elementwise_ops_utils.cc
@@ -3,6 +3,45 @@
 namespace caffe2 {
 namespace elementwise_ops_utils {
 
+std::tuple<size_t, size_t, size_t>
+ComputeLegacyBroadcastSizes(const Tensor& A, const Tensor& B, int axis) {
+  CAFFE_ENFORCE_GE(
+      A.ndim(),
+      B.ndim(),
+      "If you are doing broadcasting, input1 should have "
+      "a smaller or equal number of dimensions.");
+  if (axis == -1) {
+    axis = A.ndim() - B.ndim();
+  }
+  CAFFE_ENFORCE(
+      axis >= 0 && axis <= A.ndim() - B.ndim(),
+      "Broadcast axis should be in the range of"
+      "[0, A.ndim() - B.ndim()], but axis = ",
+      axis);
+
+  int b_dim_start = 0;
+  while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) {
+    ++b_dim_start;
+  }
+  int b_dim_end = B.ndim() - 1;
+  while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) {
+    --b_dim_end;
+  }
+  size_t pre = 1, n = 1, post = 1;
+  for (int i = 0; i < axis + b_dim_start; ++i) {
+    pre *= A.dim(i);
+  }
+  for (int i = b_dim_start; i <= b_dim_end; ++i) {
+    CAFFE_ENFORCE_EQ(
+        A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch.");
+    n *= B.dim(i);
+  }
+  for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) {
+    post *= A.dim(i);
+  }
+  return std::make_tuple(pre, n, post);
+}
+
 std::vector<int> ComputeBinaryBroadcastForwardDims(
     const std::vector<int>& A_dims,
     const std::vector<int>& B_dims) {
diff --git a/caffe2/operators/elementwise_ops_utils.h b/caffe2/operators/elementwise_ops_utils.h
index dd37b12076e30..f8ff47cdf4ced 100644
--- a/caffe2/operators/elementwise_ops_utils.h
+++ b/caffe2/operators/elementwise_ops_utils.h
@@ -10,48 +10,8 @@
 namespace caffe2 {
 namespace elementwise_ops_utils {
 
-template <typename Context>
-std::tuple<size_t, size_t, size_t> ComputeLegacyBroadcastSizes(
-    const Tensor<Context>& A,
-    const Tensor<Context>& B,
-    int axis) {
-  CAFFE_ENFORCE_GE(
-      A.ndim(),
-      B.ndim(),
-      "If you are doing broadcasting, input1 should have "
-      "a smaller or equal number of dimensions.");
-  if (axis == -1) {
-    axis = A.ndim() - B.ndim();
-  }
-  CAFFE_ENFORCE(
-      axis >= 0 && axis <= A.ndim() - B.ndim(),
-      "Broadcast axis should be in the range of"
-      "[0, A.ndim() - B.ndim()], but axis = ",
-      axis);
-
-  int b_dim_start = 0;
-  while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) {
-    ++b_dim_start;
-  }
-  int b_dim_end = B.ndim() - 1;
-  while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) {
-    --b_dim_end;
-  }
-  size_t pre = 1, n = 1, post = 1;
-  for (int i = 0; i < axis + b_dim_start; ++i) {
-    pre *= A.dim(i);
-  }
-  for (int i = b_dim_start; i <= b_dim_end; ++i) {
-    CAFFE_ENFORCE_EQ(
-        A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch.");
-    n *= B.dim(i);
-  }
-  for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) {
-    post *= A.dim(i);
-  }
-  return std::make_tuple(pre, n, post);
-}
-
+std::tuple<size_t, size_t, size_t>
+ComputeLegacyBroadcastSizes(const Tensor& A, const Tensor& B, int axis);
 std::vector<int> ComputeBinaryBroadcastForwardDims(
     const std::vector<int>& A_dims,
     const std::vector<int>& B_dims);
diff --git a/caffe2/operators/enforce_finite_op.cu b/caffe2/operators/enforce_finite_op.cu
index b909d70cb43cc..38f1669a40af3 100644
--- a/caffe2/operators/enforce_finite_op.cu
+++ b/caffe2/operators/enforce_finite_op.cu
@@ -7,7 +7,7 @@ namespace caffe2 {
 template <>
 template <typename T>
 bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
-  buffer_.CopyFrom<CUDAContext, CUDAContext>(Input(0), &context_);
+  buffer_.CopyFrom(Input(0), &context_);
   EnforceOnCPU<T>(buffer_);
   return true;
 }
diff --git a/caffe2/operators/enforce_finite_op.h b/caffe2/operators/enforce_finite_op.h
index d8e5a15a3ac0a..a1f63ecb7bccc 100644
--- a/caffe2/operators/enforce_finite_op.h
+++ b/caffe2/operators/enforce_finite_op.h
@@ -23,10 +23,10 @@ class EnforceFiniteOp final : public Operator<Context> {
   bool DoRunWithType();
 
  private:
-  Tensor<CPUContext> buffer_;
+  Tensor buffer_{Context::GetDeviceType()};
 
   template <typename T>
-  void EnforceOnCPU(const Tensor<CPUContext>& input) {
+  void EnforceOnCPU(const Tensor& input) {
     const T* input_data = input.template data<T>();
     auto size = input.size();
 
diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h
index 8130f42ad026c..46b820f316893 100644
--- a/caffe2/operators/ensure_cpu_output_op.h
+++ b/caffe2/operators/ensure_cpu_output_op.h
@@ -15,9 +15,9 @@ class EnsureCPUOutputOp : public Operator<Context> {
       : Operator<Context>(operator_def, ws) {}
 
   bool RunOnDevice() override {
-    if (OperatorBase::InputIsType<TensorCPU>(0)) {
+    if (OperatorBase::InputIsType<Tensor>(0, CPU)) {
       return CopyWithContext<CPUContext>();
-    } else if (OperatorBase::InputIsType<Tensor<Context>>(0)) {
+    } else if (OperatorBase::InputIsType<Tensor>(0, Context::GetDeviceType())) {
       // CUDA Context will go this branch
       return CopyWithContext<Context>();
     } else {
@@ -32,10 +32,10 @@ class EnsureCPUOutputOp : public Operator<Context> {
   template <class InputContext>
   bool CopyWithContext() {
     // Output is always on CPU
-    auto* output = OperatorBase::Output<TensorCPU>(0);
-    auto& input = OperatorBase::Input<Tensor<InputContext>>(0);
+    auto* output = OperatorBase::Output<Tensor>(0, CPU);
+    auto& input = OperatorBase::Input<Tensor>(0, InputContext::GetDeviceType());
     output->ResizeLike(input);
-    context_.template CopyItems<InputContext, CPUContext>(
+    context_.CopyItemsToCPU(
         input.meta(),
         input.size(),
         input.raw_data(),
diff --git a/caffe2/operators/expand_op.h b/caffe2/operators/expand_op.h
index 9f5406fe62447..8337862630390 100644
--- a/caffe2/operators/expand_op.h
+++ b/caffe2/operators/expand_op.h
@@ -26,7 +26,7 @@ class ExpandOp final : public Operator<Context> {
     const auto& X = Input(0);
     const auto& Y_shape_tensor = Input(1);
     std::vector<int64_t> shape_dims(Y_shape_tensor.size());
-    context_.template Copy<int64_t, Context, CPUContext>(
+    context_.template CopyToCPU<int64_t>(
         Y_shape_tensor.size(),
         Y_shape_tensor.template data<int64_t>(),
         shape_dims.data());
diff --git a/caffe2/operators/feature_maps_ops.h b/caffe2/operators/feature_maps_ops.h
index 7c9b7abeff03b..a9da8a7ebd3f8 100644
--- a/caffe2/operators/feature_maps_ops.h
+++ b/caffe2/operators/feature_maps_ops.h
@@ -198,7 +198,7 @@ class MergeSingleListFeatureTensorsOp : public Operator<Context> {
           ++outLengthsData[exampleIndex];
           outKeysData[keysOffset] = featureIDs_[inputIndex];
           outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValues.meta(),
               inLengthsData[exampleIndex],
               &inValues.template data<T>()[inValuesOffset_[inputIndex]],
@@ -268,7 +268,7 @@ class MergeSingleListOrMapFeatureTensorsGradientOp : public Operator<Context> {
             Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
         if (inPresenceData[exampleIndex]) {
           T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValuesValuesGrad.meta(),
               inLengthsData[exampleIndex],
               &inValuesValuesGradData[inValuesValuesOffset],
@@ -367,12 +367,12 @@ class MergeSingleMapFeatureTensorsOp : public Operator<Context> {
           ++outLengthsData[exampleIndex];
           outKeysData[keysOffset] = featureIDs_[inputIndex];
           outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inKeys.meta(),
               inLengthsData[exampleIndex],
               &inKeys.template data<K>()[inValuesOffset_[inputIndex]],
               &outValuesKeysData[valuesOffset]);
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValues.meta(),
               inLengthsData[exampleIndex],
               &inValues.template data<V>()[inValuesOffset_[inputIndex]],
@@ -510,7 +510,7 @@ class MergeMultiScalarFeatureTensorsGradientOp : public Operator<Context> {
             Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
         if (inLengthsData[exampleIndex] > 0) {
           T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValuesGrad.meta(),
               inLengthsData[exampleIndex],
               &inValuesGradData[inValuesOffset],
@@ -597,7 +597,7 @@ class MergeMultiListFeatureTensorsOp : public Operator<Context> {
           outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
           outValuesLengthsData[outKeysOffset] =
               inValuesLengthsData[inKeysOffset_[inputIndex]];
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValuesValues.meta(),
               inValuesLengthsData[inKeysOffset_[inputIndex]],
               &inValuesValues
@@ -703,13 +703,13 @@ class MergeMultiMapFeatureTensorsOp : public Operator<Context> {
           outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
           outValuesLengthsData[outKeysOffset] =
               inValuesLengthsData[inKeysOffset_[inputIndex]];
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValuesKeys.meta(),
               inValuesLengthsData[inKeysOffset_[inputIndex]],
               &inValuesKeys
                    .template data<K>()[inValuesValuesOffset_[inputIndex]],
               &outValuesKeysData[outValuesValuesOffset]);
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValuesValues.meta(),
               inValuesLengthsData[inKeysOffset_[inputIndex]],
               &inValuesValues
@@ -791,7 +791,7 @@ class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator<Context> {
         }
         if (valuesLengthCopy > 0) {
           T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               inValuesValuesGrad.meta(),
               valuesLengthCopy,
               &inValuesValuesGradData[inValuesValuesOffset],
diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc
index 021df2ec0abe5..ff3eac217390a 100644
--- a/caffe2/operators/filler_op.cc
+++ b/caffe2/operators/filler_op.cc
@@ -3,9 +3,8 @@
 namespace caffe2 {
 
 template <>
-bool RangeFillOp<float, CPUContext>::Fill(
-    TensorCPU* output) {
-  float* data = output->mutable_data<float>();
+bool RangeFillOp<float, CPUContext>::Fill(Tensor* output) {
+  float* data = output->template mutable_data<float>();
   for (int i = 0; i < output->size(); ++i) {
     data[i] = i;
   }
@@ -14,7 +13,7 @@ bool RangeFillOp<float, CPUContext>::Fill(
 
 template <>
 template <typename T>
-bool DiagonalFillOp<CPUContext>::FillWithType(TensorCPU* output) {
+bool DiagonalFillOp<CPUContext>::FillWithType(Tensor* output) {
   VerifyOutputShape(output);
   T value = OperatorBase::GetSingleArgument<T>("value", 0);
   auto* data = output->template mutable_data<T>();
diff --git a/caffe2/operators/filler_op.cu b/caffe2/operators/filler_op.cu
index 9df195a918b91..65918cc04b5d5 100644
--- a/caffe2/operators/filler_op.cu
+++ b/caffe2/operators/filler_op.cu
@@ -25,19 +25,19 @@ __global__ void FillDiagonalKernel(
 }
 
 template <>
-bool RangeFillOp<float, CUDAContext>::Fill(TensorCUDA* output) {
+bool RangeFillOp<float, CUDAContext>::Fill(Tensor* output) {
   int N = output->size();
   FillRangeKernel<<<
       CAFFE_GET_BLOCKS(N),
       CAFFE_CUDA_NUM_THREADS,
       0,
-      context_.cuda_stream()>>>(N, output->mutable_data<float>());
+      context_.cuda_stream()>>>(N, output->template mutable_data<float>());
   return true;
 }
 
 template <>
 template <typename T>
-bool DiagonalFillOp<CUDAContext>::FillWithType(TensorCUDA* output) {
+bool DiagonalFillOp<CUDAContext>::FillWithType(Tensor* output) {
   VerifyOutputShape(output);
   auto* data = output->template mutable_data<T>();
   int size = output->size();
diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h
index c144b70378273..659b4906cc4d4 100644
--- a/caffe2/operators/filler_op.h
+++ b/caffe2/operators/filler_op.h
@@ -56,7 +56,7 @@ class FillerOp : public Operator<Context> {
       auto shape = vector<TIndex>{};
       if (input_as_shape_) {
         // Shape input must be in CPU context
-        auto& input = OperatorBase::Input<Tensor<CPUContext>>(0);
+        auto& input = OperatorBase::Input<Tensor>(0, CPU);
         CAFFE_ENFORCE_EQ(
             input.ndim(),
             1,
@@ -76,7 +76,7 @@ class FillerOp : public Operator<Context> {
     return Fill(output);
   }
 
-  virtual bool Fill(Tensor<Context>* output) = 0;
+  virtual bool Fill(Tensor* output) = 0;
 
  protected:
   vector<TIndex> shape_;
@@ -105,7 +105,7 @@ class UniformFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     T min = min_;
     T max = max_;
     if (InputSize() == 3) {
@@ -163,7 +163,7 @@ class UniqueUniformFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     return (this->*body_)(output);
   }
 
@@ -179,7 +179,7 @@ class UniqueUniformFillOp final : public FillerOp<Context> {
   }
 
   template <typename T>
-  bool FillWithType(Tensor<Context>* output) {
+  bool FillWithType(Tensor* output) {
     T min = OperatorBase::GetSingleArgument<T>("min", 0);
     T max = OperatorBase::GetSingleArgument<T>("max", 0);
 
@@ -201,7 +201,7 @@ class UniqueUniformFillOp final : public FillerOp<Context> {
     return true;
   }
 
-  bool (UniqueUniformFillOp::*body_)(Tensor<Context>* output);
+  bool (UniqueUniformFillOp::*body_)(Tensor* output);
 };
 
 template <class Context>
@@ -268,12 +268,12 @@ class ConstantFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     return (this->*body_)(output);
   }
 
   template <typename T>
-  bool FillWithType(Tensor<Context>* output) {
+  bool FillWithType(Tensor* output) {
     T value = OperatorBase::GetSingleArgument<T>("value", 0);
     auto* data = output->template mutable_data<T>();
     if (output->size()) {
@@ -282,7 +282,7 @@ class ConstantFillOp final : public FillerOp<Context> {
     return true;
   }
 
-  bool FillWithString(Tensor<Context>* output) {
+  bool FillWithString(Tensor* output) {
     auto value = OperatorBase::GetSingleArgument<std::string>("value", "");
     auto* data = output->template mutable_data<std::string>();
     for (int i = 0; i < output->size(); ++i) {
@@ -292,7 +292,7 @@ class ConstantFillOp final : public FillerOp<Context> {
   }
 
  private:
-  bool (ConstantFillOp::*body_)(Tensor<Context>* output);
+  bool (ConstantFillOp::*body_)(Tensor* output);
 };
 
 template <class Context>
@@ -355,19 +355,19 @@ class DiagonalFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     return (this->*body_)(output);
   }
 
   template <typename T>
-  bool FillWithType(Tensor<Context>* output);
+  bool FillWithType(Tensor* output);
 
  private:
-  void VerifyOutputShape(Tensor<Context>* output) {
+  void VerifyOutputShape(Tensor* output) {
     CAFFE_ENFORCE(output->ndim() >= 2, "Input shape must be >= 2D");
   }
 
-  TIndex GetStepSize(Tensor<Context>* output) {
+  TIndex GetStepSize(Tensor* output) {
     TIndex step;
     if (output->ndim() == 2) {
       step = output->dim(1) + 1;
@@ -393,7 +393,7 @@ class DiagonalFillOp final : public FillerOp<Context> {
     return step;
   }
 
-  bool (DiagonalFillOp::*body_)(Tensor<Context>* output);
+  bool (DiagonalFillOp::*body_)(Tensor* output);
 };
 
 template <typename T, class Context>
@@ -407,7 +407,7 @@ class GaussianFillOp final : public FillerOp<Context> {
     DCHECK_GT(std_, 0) << "Standard deviation should be nonnegative.";
   }
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     math::RandGaussian<T, Context>(
         output->size(),
         mean_,
@@ -429,7 +429,7 @@ class XavierFillOp final : public FillerOp<Context> {
   XavierFillOp(const OperatorDef& operator_def, Workspace* ws)
       : FillerOp<Context>(operator_def, ws) {}
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     const int fan_in = output->size() / output->dim32(0);
     T scale = std::sqrt(T(3) / fan_in);
     math::RandUniform<T, Context>(
@@ -449,7 +449,7 @@ class MSRAFillOp final : public FillerOp<Context> {
   MSRAFillOp(const OperatorDef& operator_def, Workspace* ws)
       : FillerOp<Context>(operator_def, ws) {}
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     const int fan_out = output->size() / output->dim32(1);
     T scale = std::sqrt(T(2) / fan_out);
     math::RandGaussian<T, Context>(
@@ -472,7 +472,7 @@ class RangeFillOp final : public FillerOp<Context> {
   RangeFillOp(const OperatorDef& operator_def, Workspace* ws)
       : FillerOp<Context>(operator_def, ws) {}
 
-  bool Fill(Tensor<Context>* output) override;
+  bool Fill(Tensor* output) override;
 };
 
 template <class Context>
diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu
index 32bceda79acc8..da6061ef03315 100644
--- a/caffe2/operators/find_op.cu
+++ b/caffe2/operators/find_op.cu
@@ -38,7 +38,7 @@ bool FindOp<CUDAContext>::DoRunWithType() {
 
   const T* idx_data = idx.data<T>();
   const T* needles_data = needles.data<T>();
-  int* res_data = res_indices->mutable_data<int>();
+  int* res_data = res_indices->template mutable_data<int>();
 
   FindKernel<
       T><<<needles.size(), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
diff --git a/caffe2/operators/flatten_op.h b/caffe2/operators/flatten_op.h
index a250cd4c272b8..43851ce3aa955 100644
--- a/caffe2/operators/flatten_op.h
+++ b/caffe2/operators/flatten_op.h
@@ -20,7 +20,7 @@ class FlattenOp : public Operator<Context> {
     CAFFE_ENFORCE_GE(
         input.dims().size(), axis_, "The rank of the tensor must be >= axis.");
     output->Resize(input.size_to_dim(axis_), input.size_from_dim(axis_));
-    context_.template CopyItems<Context, Context>(
+    context_.CopyItemsSameDevice(
         input.meta(),
         input.size(),
         input.raw_data(),
diff --git a/caffe2/operators/floor_op.cu b/caffe2/operators/floor_op.cu
index a1bd383e1821f..41723d84da2b6 100644
--- a/caffe2/operators/floor_op.cu
+++ b/caffe2/operators/floor_op.cu
@@ -22,7 +22,7 @@ bool FloorOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->mutable_data<float>());
+      X.size(), X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/fully_connected_op.h b/caffe2/operators/fully_connected_op.h
index 068acfec19c30..eca665750cf05 100644
--- a/caffe2/operators/fully_connected_op.h
+++ b/caffe2/operators/fully_connected_op.h
@@ -144,7 +144,8 @@ class FullyConnectedOp final : public Operator<Context> {
   // A local vector to cache the output shape so we don't need to recreate
   // a vector object every time we run Run().
   vector<TIndex> Y_shape_cache_;
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  ;
 
   bool float16_compute_;
 };
@@ -312,7 +313,7 @@ class FullyConnectedGradientOp : public Operator<Context> {
  protected:
   size_t axis_{1};
   size_t axis_w_{1};
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
   bool float16_compute_;
 };
 
diff --git a/caffe2/operators/gather_fused_8bit_rowwise_op.h b/caffe2/operators/gather_fused_8bit_rowwise_op.h
index 621ea335a4993..3b6f549fe624c 100644
--- a/caffe2/operators/gather_fused_8bit_rowwise_op.h
+++ b/caffe2/operators/gather_fused_8bit_rowwise_op.h
@@ -14,7 +14,7 @@ class GatherFused8BitRowwiseOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(INDICES));
+        this, OperatorBase::Input<Tensor>(INDICES, CPU));
   }
 
   template <typename Index>
diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h
index 81f4fa53d5599..adc308ecdd325 100644
--- a/caffe2/operators/gather_ranges_to_dense_op.h
+++ b/caffe2/operators/gather_ranges_to_dense_op.h
@@ -30,7 +30,7 @@ class GatherRangesToDenseOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(RANGES));
+        this, OperatorBase::Input<Tensor>(RANGES, CPU));
   }
 
   template <typename Index>
@@ -88,7 +88,7 @@ class GatherRangesToDenseOp final : public Operator<Context> {
             j);
 
         if (InputSize() == 2) {
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               data.meta(),
               rangeLength,
               rawData + rangeStart * itemsize,
diff --git a/caffe2/operators/generate_proposals_op.cc b/caffe2/operators/generate_proposals_op.cc
index 0b4f3a6a9d755..2b1039b35a846 100644
--- a/caffe2/operators/generate_proposals_op.cc
+++ b/caffe2/operators/generate_proposals_op.cc
@@ -290,8 +290,8 @@ bool GenerateProposalsOp<CPUContext>::RunOnDevice() {
   }
   out_rois->Extend(roi_counts, 50, &context_);
   out_rois_probs->Extend(roi_counts, 50, &context_);
-  float* out_rois_ptr = out_rois->mutable_data<float>();
-  float* out_rois_probs_ptr = out_rois_probs->mutable_data<float>();
+  float* out_rois_ptr = out_rois->template mutable_data<float>();
+  float* out_rois_probs_ptr = out_rois_probs->template mutable_data<float>();
   for (int i = 0; i < num_images; i++) {
     const ERArrXXf& im_i_boxes = im_boxes[i];
     const EArrXf& im_i_probs = im_probs[i];
diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc
index af9214379becd..a090842205b7f 100644
--- a/caffe2/operators/generate_proposals_op_test.cc
+++ b/caffe2/operators/generate_proposals_op_test.cc
@@ -18,10 +18,10 @@ static void AddConstInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
   math::Set<float, CPUContext>(
-      tensor->size(), value, tensor->mutable_data<float>(), &context);
+      tensor->size(), value, tensor->template mutable_data<float>(), &context);
   return;
 }
 
@@ -34,10 +34,10 @@ static void AddLinSpacedInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
   EigenVectorMap<float> tensor_vec(
-      tensor->mutable_data<float>(), tensor->size());
+      tensor->template mutable_data<float>(), tensor->size());
   tensor_vec.setLinSpaced(min_val, max_val);
 
   return;
@@ -51,10 +51,10 @@ static void AddInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
   EigenVectorMap<float> tensor_vec(
-      tensor->mutable_data<float>(), tensor->size());
+      tensor->template mutable_data<float>(), tensor->size());
   tensor_vec.array() = utils::AsEArrXt(values);
 
   return;
@@ -79,7 +79,7 @@ TEST(GenerateProposalsTest, TestComputeAllAnchors) {
       79, -68, 8, 115, 103, -160, -40, 207, 151, -6, 32, 85, 79, -52, 8, 131,
       103, -144, -40, 223, 151;
 
-  TensorCPU anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()});
+  Tensor anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()}, CPU);
   Eigen::Map<ERMatXf>(
       anchors_tensor.mutable_data<float>(), anchors.rows(), anchors.cols()) =
       anchors;
@@ -143,7 +143,7 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) {
     all_anchors_gt(i, 4) = angles[i % angles.size()];
   }
 
-  TensorCPU anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()});
+  Tensor anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()}, CPU);
   Eigen::Map<ERMatXf>(
       anchors_tensor.mutable_data<float>(), anchors.rows(), anchors.cols()) =
       anchors;
diff --git a/caffe2/operators/given_tensor_fill_op.h b/caffe2/operators/given_tensor_fill_op.h
index bf2119d0f5a43..df0c27642337b 100644
--- a/caffe2/operators/given_tensor_fill_op.h
+++ b/caffe2/operators/given_tensor_fill_op.h
@@ -51,7 +51,7 @@ class GivenTensorFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor<Context>* output) override {
+  bool Fill(Tensor* output) override {
     return (this->*body_)(output);
   }
 
@@ -69,20 +69,20 @@ class GivenTensorFillOp final : public FillerOp<Context> {
   }
 
   template <typename Type>
-  bool FillWithType(Tensor<Context>* output) {
+  bool FillWithType(Tensor* output) {
     DCHECK_EQ(output->size(), values_.size())
         << "output size: " << output->size()
         << " given size: " << values_.size();
     auto* data = output->template mutable_data<Type>();
     const Type* values_data = values_.template data<Type>();
     if (output->size()) {
-      context_.template Copy<Type, CPUContext, Context>(
-          output->size(), values_data, data);
+      context_.CopyItemsFromCPU(
+          TypeMeta::Make<Type>(), output->size(), values_data, data);
     }
     return true;
   }
 
-  bool (GivenTensorFillOp::*body_)(Tensor<Context>* output);
-  TensorCPU values_;
+  bool (GivenTensorFillOp::*body_)(Tensor* output);
+  Tensor values_{CPU};
 };
 } // namespace caffe2
diff --git a/caffe2/operators/group_norm_op.h b/caffe2/operators/group_norm_op.h
index a65f57c5fb98c..8ff11353aa4f8 100644
--- a/caffe2/operators/group_norm_op.h
+++ b/caffe2/operators/group_norm_op.h
@@ -152,8 +152,8 @@ class GroupNormGradientOp final : public Operator<Context> {
   const int group_;
   const StorageOrder order_;
 
-  Tensor<Context> ds_;
-  Tensor<Context> db_;
+  Tensor ds_{Context::GetDeviceType()};
+  Tensor db_{Context::GetDeviceType()};
 
   // Input: dY, X, gamma, beta, mu, inv_sig
   // Output: dX, dgamma, dbeta
diff --git a/caffe2/operators/gru_unit_op.h b/caffe2/operators/gru_unit_op.h
index ecbaac2c3c15c..c6a85ac5fb61c 100644
--- a/caffe2/operators/gru_unit_op.h
+++ b/caffe2/operators/gru_unit_op.h
@@ -143,8 +143,9 @@ class GRUUnitOp : public Operator<Context> {
       seqLengths = Input(SEQ_LENGTHS).template data<int32_t>();
     }
 
-    const auto t = static_cast<OperatorBase*>(this)->
-      Input<Tensor<CPUContext>>(TIMESTEP).template data<int32_t>()[0];
+    const auto t = static_cast<OperatorBase*>(this)
+                       ->Input<Tensor>(TIMESTEP, CPU)
+                       .template data<int32_t>()[0];
     Output(HIDDEN_T)->ResizeLike(Input(HIDDEN_T_M_1));
     auto* H = Output(HIDDEN_T)->template mutable_data<T>();
 
@@ -194,8 +195,9 @@ class GRUUnitGradientOp : public Operator<Context> {
     CAFFE_ENFORCE_EQ(3 * D, G);
     const auto* H_prev = Input(HIDDEN_T_M_1).template data<T>();
     const auto* X = Input(GATES).template data<T>();
-    const auto t = static_cast<OperatorBase*>(this)->
-      Input<Tensor<CPUContext>>(TIMESTEP).template data<int32_t>()[0];
+    const auto t = static_cast<OperatorBase*>(this)
+                       ->Input<Tensor>(TIMESTEP, CPU)
+                       .template data<int32_t>()[0];
     const auto* H = Input(HIDDEN_T).template data<T>();
     const auto* H_diff = Input(HIDDEN_T_GRAD).template data<T>();
 
diff --git a/caffe2/operators/h_softmax_op.cc b/caffe2/operators/h_softmax_op.cc
index 1a8689d9c76ff..ff65ba1797c98 100644
--- a/caffe2/operators/h_softmax_op.cc
+++ b/caffe2/operators/h_softmax_op.cc
@@ -36,8 +36,8 @@ float HSoftmaxOp<float, CPUContext>::RunForwardSingle(const float* X,
     scale_.mutable_data<float>(), &context_);
 
   // Put the intermediate result X - max(X) into Y
-  context_.template Copy<float, CPUContext, CPUContext>(dim_out, fc_output_data,
-    softmax_output_data);
+  context_.template CopyFromCPU<float>(
+      dim_out, fc_output_data, softmax_output_data);
   // Subtract the scale
   math::Gemv<float, CPUContext>(CblasNoTrans, dim_out, 1, -1,
     sum_multiplier_.data<float>(), scale_.data<float>(), 1, softmax_output_data,
@@ -86,14 +86,14 @@ bool HSoftmaxOp<float, CPUContext>::RunOnDevice() {
   int N = W.dim32(0);
   CAFFE_ENFORCE_EQ(N, b.dim32(0));
   Y->Resize(M);
-  auto* Ydata = Y->mutable_data<float>();
+  auto* Ydata = Y->template mutable_data<float>();
   math::Set<float, CPUContext>(M, 0.f, Ydata, &context_);
   const auto* labeldata = label.data<int>();
 
   auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
   int int_output_size = getIntermediateOutputSize(labeldata, M, hierarchy);
   intermediate_output->Resize(int_output_size);
-  float * int_output_data = intermediate_output->mutable_data<float>();
+  float* int_output_data = intermediate_output->template mutable_data<float>();
   int int_output_offset = 0;
 
   if (bias_multiplier_.size() != M) {
@@ -151,7 +151,7 @@ void HSoftmaxGradientOp<float, CPUContext>::RunBackwardSingle(const float* X,
   }
 
   float* dX_softmax = dint_output + int_output_offset - dim_out;
-  context_.Copy<float, CPUContext, CPUContext>(dim_out, dX_entropy, dX_softmax);
+  context_.CopyFromCPU<float>(dim_out, dX_entropy, dX_softmax);
 
   math::Dot<float, CPUContext>(dim_out, X_entropy, dX_entropy, scaledata,
     &context_);
@@ -205,10 +205,10 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
   db->ResizeLike(b);
   dX_intermediate_output->ResizeLike(intermediate_output);
 
-  float* dX_data = dX->mutable_data<float>();
-  float* dW_data = dW->mutable_data<float>();
-  float* db_data = db->mutable_data<float>();
-  float* dOutput_data = dX_intermediate_output->mutable_data<float>();
+  float* dX_data = dX->template mutable_data<float>();
+  float* dW_data = dW->template mutable_data<float>();
+  float* db_data = db->template mutable_data<float>();
+  float* dOutput_data = dX_intermediate_output->template mutable_data<float>();
 
   math::Set<float, CPUContext>(X.size(), 0.f, dX_data, &context_);
   math::Set<float, CPUContext>(W.size(), 0.f, dW_data, &context_);
@@ -257,7 +257,7 @@ bool HSoftmaxSearchOp<float, CPUContext>::pruning(
     float parent_score,
     float beam) {
   int w_length = src_node.children_size() + src_node.word_ids_size();
-  Tensor<CPUContext> intermediate_data;
+  Tensor intermediate_data{CPU};
   intermediate_data.Resize(2 * w_length);
   float* int_output_data = intermediate_data.template mutable_data<float>();
   int int_output_offset = 0;
@@ -398,8 +398,10 @@ bool HSoftmaxSearchOp<float, CPUContext>::RunOnDevice() {
         [&](std::pair<string, float> a, std::pair<string, float> b) {
           return a.second < b.second;
         });
-    auto* y_name_data = Y_names->mutable_data<string>() + sample * top_n_;
-    auto* y_score_data = Y_scores->mutable_data<float>() + sample * top_n_;
+    auto* y_name_data =
+        Y_names->template mutable_data<string>() + sample * top_n_;
+    auto* y_score_data =
+        Y_scores->template mutable_data<float>() + sample * top_n_;
     for (int i = 0; i < top_n_; i++) {
       if (i < info.size()) {
         y_name_data[i] = info[i].first;
@@ -543,18 +545,18 @@ REGISTER_CPU_OPERATOR(
     HuffmanTreeHierarchyOp<int64_t, CPUContext>);
 
 OPERATOR_SCHEMA(HSoftmax)
-  .NumInputs(4)
-  .NumOutputs(2)
-  .SetDoc(R"DOC(
+    .NumInputs(4)
+    .NumOutputs(2)
+    .SetDoc(R"DOC(
 Hierarchical softmax is an operator which approximates the softmax operator
 while giving significant training speed gains and reasonably comparable
 performance. In this operator, instead of calculating the probabilities of all
 the classes, we calculate the probability of each step in the path from root to
 the target word in the hierarchy.
 
-The operator takes a 2-D tensor (Tensor<float>) containing a batch of layers, a
+The operator takes a 2-D tensor (Tensor) containing a batch of layers, a
 set of parameters represented by the weight matrix and bias terms, and a 1-D
-tensor (Tensor<int>) holding labels, or the indices of the target class. The
+tensor (Tensor) holding labels, or the indices of the target class. The
 hierarchy has to be specified as an argument to the operator.
 
 The operator returns a 1-D tensor holding the computed log probability of the
@@ -562,20 +564,28 @@ target class and a 2-D tensor of intermediate outputs (from the weight matrix
 and softmax from each step in the path from root to target class) which will be
 used by the gradient operator to compute gradients for all samples in the batch.
 )DOC")
-  .Arg("hierarchy", "Serialized HierarchyProto string containing list of "
-  "vocabulary words and their paths from root of hierarchy to the leaf")
-  .Input(0, "X", "Input data from previous layer")
-  .Input(1, "W", "2D blob containing 'stacked' fully connected weight "
-  "matrices. Each node in the hierarchy contributes one FC weight matrix if "
-  "it has children nodes. Dimension is N*D, D is input dimension of data (X), "
-  "N is sum of all output dimensions, or total number of nodes (excl root)")
-  .Input(2, "b", "1D blob with N parameters")
-  .Input(3, "labels", "int word_id of the target word")
-  .Output(0, "Y", "1-D of log probability outputs, one per sample")
-  .Output(1, "intermediate_output", "Extra blob to store the intermediate "
-  "FC and softmax outputs for each node in the hierarchical path of a word. "
-  "The outputs from samples are stored in consecutive blocks in the forward "
-  "pass and are used in reverse order in the backward gradientOp pass");
+    .Arg(
+        "hierarchy",
+        "Serialized HierarchyProto string containing list of "
+        "vocabulary words and their paths from root of hierarchy to the leaf")
+    .Input(0, "X", "Input data from previous layer")
+    .Input(
+        1,
+        "W",
+        "2D blob containing 'stacked' fully connected weight "
+        "matrices. Each node in the hierarchy contributes one FC weight matrix if "
+        "it has children nodes. Dimension is N*D, D is input dimension of data (X), "
+        "N is sum of all output dimensions, or total number of nodes (excl root)")
+    .Input(2, "b", "1D blob with N parameters")
+    .Input(3, "labels", "int word_id of the target word")
+    .Output(0, "Y", "1-D of log probability outputs, one per sample")
+    .Output(
+        1,
+        "intermediate_output",
+        "Extra blob to store the intermediate "
+        "FC and softmax outputs for each node in the hierarchical path of a word. "
+        "The outputs from samples are stored in consecutive blocks in the forward "
+        "pass and are used in reverse order in the backward gradientOp pass");
 
 OPERATOR_SCHEMA(HSoftmaxGradient).NumInputs(6).NumOutputs(4);
 
diff --git a/caffe2/operators/h_softmax_op.h b/caffe2/operators/h_softmax_op.h
index 423f5b7a4f2e3..10ee600d89776 100644
--- a/caffe2/operators/h_softmax_op.h
+++ b/caffe2/operators/h_softmax_op.h
@@ -25,9 +25,9 @@ class HSoftmaxOpBase : public Operator<Context> {
 
  protected:
   std::unordered_map<int, PathProto> hierarchy_all_map_;
-  Tensor<Context> scale_;
-  Tensor<Context> sum_multiplier_;
-  Tensor<Context> bias_multiplier_;
+  Tensor scale_{Context::GetDeviceType()};
+  Tensor sum_multiplier_{Context::GetDeviceType()};
+  Tensor bias_multiplier_{Context::GetDeviceType()};
   static constexpr T kLOG_THRESHOLD() {
     return 1e-20f;
   }
diff --git a/caffe2/operators/half_float_ops.cu b/caffe2/operators/half_float_ops.cu
index fb1cd16db4044..111ff3ca9fe2f 100644
--- a/caffe2/operators/half_float_ops.cu
+++ b/caffe2/operators/half_float_ops.cu
@@ -31,7 +31,7 @@ bool FloatToHalfOp<CUDAContext>::RunOnDevice() {
       context_.cuda_stream()>>>(
       X.size(),
       X.data<float>(),
-      reinterpret_cast<half*>(Y->mutable_data<float16>()));
+      reinterpret_cast<half*>(Y->template mutable_data<float16>()));
   return true;
 }
 
@@ -47,7 +47,7 @@ bool HalfToFloatOp<CUDAContext>::RunOnDevice() {
       context_.cuda_stream()>>>(
       X.size(),
       reinterpret_cast<const half*>(X.data<float16>()),
-      Y->mutable_data<float>());
+      Y->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/if_op.h b/caffe2/operators/if_op.h
index 355dc31d8e550..cff2a620ef469 100644
--- a/caffe2/operators/if_op.h
+++ b/caffe2/operators/if_op.h
@@ -32,7 +32,7 @@ class IfOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     CAFFE_ENFORCE(
-        this->template InputIsType<Tensor<Context>>(0),
+        this->template InputIsType<Tensor>(0, Context::GetDeviceType()),
         "Invalid condition in If operator: tensor expected");
 
     const auto& condition = Input(0);
diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc
index dd04c87b75872..b9a8b1b46e272 100644
--- a/caffe2/operators/index_ops.cc
+++ b/caffe2/operators/index_ops.cc
@@ -93,8 +93,7 @@ struct Index: IndexBase {
     return true;
   }
 
-  template<typename Ctx>
-  bool Store(Tensor<Ctx>* out) {
+  bool Store(Tensor* out) {
     std::lock_guard<std::mutex> lock(dictMutex_);
     out->Resize(nextId_ - 1);
     auto outData = out->template mutable_data<T>();
@@ -151,7 +150,10 @@ class IndexGetOp: public Operator<CPUContext> {
     const auto& keys = Input(1);
     auto* values = Output(0);
     values->ResizeLike(keys);
-    dict->Get(keys.data<T>(), values->mutable_data<TIndexValue>(), keys.size());
+    dict->Get(
+        keys.data<T>(),
+        values->template mutable_data<TIndexValue>(),
+        keys.size());
     return true;
   }
 };
@@ -226,7 +228,7 @@ class IndexSizeOp : public Operator<CPUContext> {
     auto& base = OperatorBase::Input<std::unique_ptr<IndexBase>>(0);
     auto* out = Output(0);
     out->Resize(std::vector<TIndex>{});
-    *out->mutable_data<TIndexValue>() = base->Size();
+    *out->template mutable_data<TIndexValue>() = base->Size();
     return true;
   }
 };
@@ -351,7 +353,7 @@ class IndexSerializer : public BlobSerializerBase {
       SerializationAcceptor acceptor) override {
     auto& base = blob.template Get<std::unique_ptr<IndexBase>>();
     Blob tensor_blob;
-    auto* tensor_out = tensor_blob.template GetMutable<Tensor<CPUContext>>();
+    auto* tensor_out = tensor_blob.GetMutableTensor(CPU);
 
     if (base->Type().Match<std::string>()) {
       doStore<std::string>(base, tensor_out);
@@ -367,7 +369,7 @@ class IndexSerializer : public BlobSerializerBase {
         tensor_out->size() <= std::numeric_limits<int32_t>::max(),
         "Index too large to be serialized.");
     BlobProto blob_proto;
-    TensorSerializer<CPUContext> ser;
+    TensorSerializer ser;
     ser.Serialize(
         *tensor_out, name, blob_proto.mutable_tensor(), 0, tensor_out->size());
     blob_proto.set_name(name);
@@ -382,9 +384,7 @@ class IndexSerializer : public BlobSerializerBase {
 
  private:
   template <typename T>
-  void doStore(
-      const std::unique_ptr<IndexBase>& base,
-      Tensor<CPUContext>* tensor_out) {
+  void doStore(const std::unique_ptr<IndexBase>& base, Tensor* tensor_out) {
     auto* dict = dynamic_cast_if_rtti<Index<T>*>(base.get());
     CAFFE_ENFORCE(dict, "Wrong dictionary type.");
     dict->Store(tensor_out);
@@ -394,7 +394,7 @@ class IndexSerializer : public BlobSerializerBase {
 class IndexDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override {
-    TensorDeserializer<CPUContext> deser;
+    TensorDeserializer deser;
     Blob tensor_blob;
     deser.Deserialize(proto, &tensor_blob);
 
@@ -403,7 +403,7 @@ class IndexDeserializer : public BlobDeserializerBase {
     bool isFrozen{false};
     is >> maxElements >> isFrozen;
 
-    auto& tensor_in = tensor_blob.template Get<Tensor<CPUContext>>();
+    auto& tensor_in = tensor_blob.template Get<Tensor>();
     auto* base = blob->template GetMutable<std::unique_ptr<IndexBase>>();
 
     if (tensor_in.IsType<std::string>()) {
@@ -426,7 +426,7 @@ class IndexDeserializer : public BlobDeserializerBase {
   void doLoad(
       std::unique_ptr<IndexBase>* base,
       int64_t maxElements,
-      const Tensor<CPUContext>& tensor_in) {
+      const Tensor& tensor_in) {
     base->reset(new Index<T>(maxElements));
     auto* dict = dynamic_cast_if_rtti<Index<T>*>(base->get());
     dict->Load(tensor_in.data<T>(), tensor_in.size());
diff --git a/caffe2/operators/instance_norm_op.cu b/caffe2/operators/instance_norm_op.cu
index 87532066278b2..8796684c6f237 100644
--- a/caffe2/operators/instance_norm_op.cu
+++ b/caffe2/operators/instance_norm_op.cu
@@ -206,9 +206,9 @@ bool InstanceNormOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const auto input_data = input.data<float>();
   const auto scale_data = scale.data<float>();
   const auto bias_data = bias.data<float>();
-  auto output_data = output->mutable_data<float>();
-  auto mean_data = mean->mutable_data<float>();
-  auto inv_stdev_data = inv_stdev->mutable_data<float>();
+  auto output_data = output->template mutable_data<float>();
+  auto mean_data = mean->template mutable_data<float>();
+  auto inv_stdev_data = inv_stdev->template mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
@@ -283,9 +283,9 @@ bool InstanceNormOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const auto input_data = input.data<float>();
   const auto scale_data = scale.data<float>();
   const auto bias_data = bias.data<float>();
-  auto output_data = output->mutable_data<float>();
-  auto mean_data = mean->mutable_data<float>();
-  auto inv_stdev_data = inv_stdev->mutable_data<float>();
+  auto output_data = output->template mutable_data<float>();
+  auto mean_data = mean->template mutable_data<float>();
+  auto inv_stdev_data = inv_stdev->template mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
@@ -370,9 +370,9 @@ bool InstanceNormGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const auto bias_data = bias.data<float>();
   const auto output_grad_data = output_grad.data<float>();
 
-  auto input_grad_data = input_grad->mutable_data<float>();
-  auto scale_grad_data = scale_grad->mutable_data<float>();
-  auto bias_grad_data = bias_grad->mutable_data<float>();
+  auto input_grad_data = input_grad->template mutable_data<float>();
+  auto scale_grad_data = scale_grad->template mutable_data<float>();
+  auto bias_grad_data = bias_grad->template mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
@@ -501,9 +501,9 @@ bool InstanceNormGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const auto bias_data = bias.data<float>();
   const auto output_grad_data = output_grad.data<float>();
 
-  auto input_grad_data = input_grad->mutable_data<float>();
-  auto scale_grad_data = scale_grad->mutable_data<float>();
-  auto bias_grad_data = bias_grad->mutable_data<float>();
+  auto input_grad_data = input_grad->template mutable_data<float>();
+  auto scale_grad_data = scale_grad->template mutable_data<float>();
+  auto bias_grad_data = bias_grad->template mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
diff --git a/caffe2/operators/instance_norm_op.h b/caffe2/operators/instance_norm_op.h
index 90f11b3d49c08..7435c7c8b43c0 100644
--- a/caffe2/operators/instance_norm_op.h
+++ b/caffe2/operators/instance_norm_op.h
@@ -40,8 +40,8 @@ class InstanceNormOp : public Operator<Context> {
   StorageOrder order_;
 
   // temp results that get passed to the gradient, but are otherwise stored here
-  Tensor<Context> mean_;
-  Tensor<Context> inv_stdev_;
+  Tensor mean_{Context::GetDeviceType()};
+  Tensor inv_stdev_{Context::GetDeviceType()};
 
   INPUT_TAGS(INPUT, SCALE, BIAS);
   OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
@@ -81,8 +81,8 @@ class InstanceNormGradientOp : public Operator<Context> {
 
   // temp results that could get passed through to this gradient, but if not,
   // are stored here
-  Tensor<Context> mean_;
-  Tensor<Context> inv_stdev_;
+  Tensor mean_{Context::GetDeviceType()};
+  Tensor inv_stdev_{Context::GetDeviceType()};
 
   INPUT_TAGS(INPUT, SCALE, BIAS, OUTPUT_GRAD, MEAN, INV_STDEV);
   OUTPUT_TAGS(INPUT_GRAD, SCALE_GRAD, BIAS_GRAD);
diff --git a/caffe2/operators/integral_image_op.cu b/caffe2/operators/integral_image_op.cu
index 872d29bd0dddb..d8fa0b8f4dcc6 100644
--- a/caffe2/operators/integral_image_op.cu
+++ b/caffe2/operators/integral_image_op.cu
@@ -144,7 +144,7 @@ bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
       cols_out,
       chans,
       X.data<float>(),
-      Y->mutable_data<float>());
+      Y->template mutable_data<float>());
   // Integral image over columns of the integral image over rows
   const int col_pass_size = X.dim32(0) * chans * cols_out;
   ColPassKernel<<<
@@ -152,7 +152,11 @@ bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      col_pass_size, rows_out, cols_out, chans, Y->mutable_data<float>());
+      col_pass_size,
+      rows_out,
+      cols_out,
+      chans,
+      Y->template mutable_data<float>());
   return true;
 }
 
@@ -161,8 +165,8 @@ bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
   auto& X = Input(0); // Original input to "forward" op
   auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
-  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
-                        // (aka "gradInput")
+  auto* dX = Output(0); // Gradient of net w.r.t. input to
+                        // "forward" op (aka "gradInput")
 
   dX->ResizeLike(X);
   // Row pass reduces shape of dY from (N, C, H + 1, W + 1)
@@ -199,7 +203,7 @@ bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
       cols_out,
       chans,
       row_pass_buffer_.data<float>(),
-      dX->mutable_data<float>());
+      dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/integral_image_op.h b/caffe2/operators/integral_image_op.h
index b8920d677de83..16bf99b33db04 100644
--- a/caffe2/operators/integral_image_op.h
+++ b/caffe2/operators/integral_image_op.h
@@ -28,7 +28,7 @@ class IntegralImageGradientOp final : public Operator<Context> {
   bool RunOnDevice() override;
 
  protected:
-  Tensor<Context> row_pass_buffer_;
+  Tensor row_pass_buffer_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/jsd_op.cc b/caffe2/operators/jsd_op.cc
index 44648838f4a2b..890fd96b6556d 100644
--- a/caffe2/operators/jsd_op.cc
+++ b/caffe2/operators/jsd_op.cc
@@ -35,7 +35,7 @@ bool BernoulliJSDOp<float, CPUContext>::RunOnDevice() {
   L->ResizeLike(X);
   auto* x_data = X.data<float>();
   auto* t_data = T.data<float>();
-  auto* l_data = L->mutable_data<float>();
+  auto* l_data = L->template mutable_data<float>();
   for (int i = 0; i < N; i++) {
     auto p_mdl = x_data[i];
     auto p_emp = t_data[i];
@@ -57,7 +57,7 @@ bool BernoulliJSDGradientOp<float, CPUContext>::RunOnDevice() {
   auto* go_data = go.data<float>();
   auto* x_data = X.data<float>();
   auto* t_data = T.data<float>();
-  auto* gi_data = gi->mutable_data<float>();
+  auto* gi_data = gi->template mutable_data<float>();
   for (int i = 0; i < N; i++) {
     auto p_mdl = x_data[i];
     auto p_emp = t_data[i];
diff --git a/caffe2/operators/last_n_window_collector.cc b/caffe2/operators/last_n_window_collector.cc
index c9d1a777538d1..25f06cf751dd4 100644
--- a/caffe2/operators/last_n_window_collector.cc
+++ b/caffe2/operators/last_n_window_collector.cc
@@ -94,7 +94,7 @@ class LastNWindowCollectorOp : public Operator<Context> {
 
     if (num_entries > numToCollect_) {
       // just copy the last N rows
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           input.meta(),
           num_to_copy * block_size,
           input_data + (num_entries - numToCollect_) * block_bytesize,
@@ -105,13 +105,13 @@ class LastNWindowCollectorOp : public Operator<Context> {
     auto start = *next_data;
     auto first_chunk_size =
         std::min<size_t>(num_to_copy + start, numToCollect_) - start;
-    context_.template CopyItems<Context, Context>(
+    context_.CopyItemsSameDevice(
         input.meta(),
         first_chunk_size * block_size,
         input_data,
         output_data + start * block_bytesize);
 
-    context_.template CopyItems<Context, Context>(
+    context_.CopyItemsSameDevice(
         input.meta(),
         (num_to_copy - first_chunk_size) * block_size,
         input_data + first_chunk_size * block_bytesize,
diff --git a/caffe2/operators/layer_norm_op.cu b/caffe2/operators/layer_norm_op.cu
index bcec393b2ad95..2a909739c6f4c 100644
--- a/caffe2/operators/layer_norm_op.cu
+++ b/caffe2/operators/layer_norm_op.cu
@@ -45,7 +45,7 @@ void allocScratchAndReduce(
     float* output,
     int num_segments,
     int* seg_indices,
-    Tensor<CUDAContext>* scratch,
+    Tensor* scratch,
     cudaStream_t stream) {
   size_t temp_storage_bytes;
   cub::DeviceSegmentedReduce::Sum(
@@ -63,8 +63,8 @@ void allocScratchAndReduce(
   scratch->Resize(vector<size_t>{temp_storage_floats});
 
   cub::DeviceSegmentedReduce::Sum(
-      scratch->mutable_data<float>(), // To retrieve required temporary storage
-                                      // size
+      scratch->template mutable_data<float>(), // To retrieve required temporary
+                                               // storage size
       temp_storage_bytes, // size_t &temp_storage_bytes
       input, // InputIteratorT d_i
       output, // OutputIteratorT d_out
@@ -72,7 +72,7 @@ void allocScratchAndReduce(
       seg_indices, // int *d_begin_offsets
       seg_indices + 1, // int *d_end_offsets
       stream // cudaStream_t stream=0
-      );
+  );
 }
 
 } //  namespace
@@ -107,7 +107,7 @@ bool LayerNormOp<CUDAContext>::DoRunWithType<float>() {
       std::bind1st(std::multiplies<int>(), right));
 
   seg_indices_.Resize(vector<size_t>{segs.size()});
-  context_.CopyBytes<CPUContext, CUDAContext>(
+  context_.CopyBytesFromCPU(
       sizeof(int) * segs.size(),
       static_cast<void*>(segs.data()),
       static_cast<void*>(seg_indices_.mutable_data<int>()));
@@ -237,7 +237,7 @@ __global__ void gradientMegaKernel(
   }
 }
 
-#define PRINT(X, N, D) printTensor<<<1, 1, 0, context_.cuda_stream()>>>(X, N, D)
+#define PRINT(X, N, D) printTensor >> (X, N, D)
 
 } // namespace
 
@@ -272,7 +272,7 @@ bool LayerNormGradientOp<CUDAContext>::DoRunWithType<float>() {
       std::bind1st(std::multiplies<int>(), right));
 
   seg_indices_.Resize(vector<size_t>{segs.size()});
-  context_.CopyBytes<CPUContext, CUDAContext>(
+  context_.CopyBytesFromCPU(
       sizeof(int) * segs.size(),
       static_cast<void*>(segs.data()),
       static_cast<void*>(seg_indices_.mutable_data<int>()));
diff --git a/caffe2/operators/layer_norm_op.h b/caffe2/operators/layer_norm_op.h
index da74f83398812..b6d032a8211aa 100644
--- a/caffe2/operators/layer_norm_op.h
+++ b/caffe2/operators/layer_norm_op.h
@@ -28,8 +28,8 @@ class LayerNormOp : public Operator<Context> {
   int axis_;
   float epsilon_;
 
-  Tensor<Context> scratch_;
-  Tensor<Context> seg_indices_;
+  Tensor scratch_{Context::GetDeviceType()};
+  Tensor seg_indices_{Context::GetDeviceType()};
 };
 
 template <class Context>
@@ -53,11 +53,11 @@ class LayerNormGradientOp : public Operator<Context> {
   int axis_;
   float epsilon_;
 
-  Tensor<Context> scratch_;
-  Tensor<Context> gscratch_;
-  Tensor<Context> seg_indices_;
-  Tensor<Context> dstdev_;
-  Tensor<Context> dmean_;
+  Tensor scratch_{Context::GetDeviceType()};
+  Tensor gscratch_{Context::GetDeviceType()};
+  Tensor seg_indices_{Context::GetDeviceType()};
+  Tensor dstdev_{Context::GetDeviceType()};
+  Tensor dmean_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/leaky_relu_op.cc b/caffe2/operators/leaky_relu_op.cc
index dcf62084a1207..280630e7fbae7 100644
--- a/caffe2/operators/leaky_relu_op.cc
+++ b/caffe2/operators/leaky_relu_op.cc
@@ -104,7 +104,7 @@ print("Y:\n", workspace.FetchBlob("Y"))
 )DOC")
     .Input(0, "X", "Input tensor of data to be operated on.")
     .Output(0, "Y", "Output tensor, calculated as described above.");
-    
+
 OPERATOR_SCHEMA(LeakyReluGradient)
     .NumInputs(2)
     .NumOutputs(1)
diff --git a/caffe2/operators/leaky_relu_op.cu b/caffe2/operators/leaky_relu_op.cu
index ece07b786a510..95429e6d63793 100644
--- a/caffe2/operators/leaky_relu_op.cu
+++ b/caffe2/operators/leaky_relu_op.cu
@@ -36,7 +36,7 @@ bool LeakyReluOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), alpha_, X.data<float>(), Y->mutable_data<float>());
+      X.size(), alpha_, X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
@@ -56,7 +56,7 @@ bool LeakyReluGradientOp<float, CUDAContext>::RunOnDevice() {
       alpha_,
       Y.data<float>(),
       dY.data<float>(),
-      dX->mutable_data<float>());
+      dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/lengths_pad_op.h b/caffe2/operators/lengths_pad_op.h
index e89f4fbbfc5f7..9f65c39a262a7 100644
--- a/caffe2/operators/lengths_pad_op.h
+++ b/caffe2/operators/lengths_pad_op.h
@@ -33,13 +33,14 @@ class LengthsPadOp : public Operator<Context> {
 
     // Context::CopyFrom and math::Sum need the same context to avoid race
     // conditions
-    CPUContext cpuContext;
-    lengths_host_.CopyFrom(lengths, &cpuContext);
+    // why? CPUContext is not used in Sum
+    lengths_host_.CopyFrom(lengths);
 
     auto lengths_size = lengths_host_.size();
-    auto* lengths_data = lengths_host_.data<int32_t>();
+    auto* lengths_data = lengths_host_.template data<int32_t>();
 
     int32_t total_length = 0;
+    CPUContext cpuContext;
     math::Sum<int32_t, CPUContext>(
         lengths_size, lengths_data, &total_length, &cpuContext);
 
@@ -65,7 +66,7 @@ class LengthsPadOp : public Operator<Context> {
           i,
           " is larger than target length");
 
-      context_.template Copy<T, Context, Context>(
+      context_.template CopySameDevice<T>(
           block_size * length, src_data, out_data);
 
       out_data += block_size * target_length_;
@@ -79,7 +80,7 @@ class LengthsPadOp : public Operator<Context> {
  private:
   double padding_value_;
   int target_length_;
-  TensorCPU lengths_host_;
+  Tensor lengths_host_{CPU};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/lengths_tile_op.h b/caffe2/operators/lengths_tile_op.h
index fb525bd9b972f..9d2b7a6f07122 100644
--- a/caffe2/operators/lengths_tile_op.h
+++ b/caffe2/operators/lengths_tile_op.h
@@ -23,12 +23,13 @@ class LengthsTileOp : public Operator<Context> {
 
     // Context::CopyFrom and math::Sum need the same context to avoid race
     // conditions
-    CPUContext cpuContext;
-    lengths_host_.CopyFrom(lengths, &cpuContext);
+    // why? CPUContext is not used in Sum
+    lengths_host_.CopyFrom(lengths);
     auto lengths_size = lengths_host_.size();
     auto* lengths_data = lengths_host_.data<int32_t>();
 
     int32_t total_length = 0;
+    CPUContext cpuContext;
     math::Sum<int32_t, CPUContext>(
         lengths_size, lengths_data, &total_length, &cpuContext);
 
@@ -44,7 +45,7 @@ class LengthsTileOp : public Operator<Context> {
       auto length = lengths_data[i];
       CAFFE_ENFORCE_GE(length, 0);
       for (int32_t j = 0; j < length; ++j) {
-        context_.template CopyBytes<Context, Context>(block_bytesize, src, out);
+        context_.CopyBytesSameDevice(block_bytesize, src, out);
         out += block_bytesize;
       }
       src += block_bytesize;
@@ -55,7 +56,7 @@ class LengthsTileOp : public Operator<Context> {
   INPUT_TAGS(DATA, LENGTHS);
 
  private:
-  TensorCPU lengths_host_;
+  Tensor lengths_host_{CPU};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/listwise_l2r_op.cc b/caffe2/operators/listwise_l2r_op.cc
index 3940dfb2b1670..d52d3bcce63b7 100644
--- a/caffe2/operators/listwise_l2r_op.cc
+++ b/caffe2/operators/listwise_l2r_op.cc
@@ -77,9 +77,9 @@ template <>
 float LambdaRankNdcgOp<float, CPUContext>::LambdaRankNdcgSession(
     int start_index,
     int end_index,
-    const Tensor<CPUContext>& y,
-    const Tensor<CPUContext>& r,
-    Tensor<CPUContext>** dy) {
+    const Tensor& y,
+    const Tensor& r,
+    Tensor** dy) {
   CAFFE_ENFORCE(start_index >= 0);
   CAFFE_ENFORCE(start_index < y.size());
   const auto* y_data = y.template data<float>();
diff --git a/caffe2/operators/listwise_l2r_op.h b/caffe2/operators/listwise_l2r_op.h
index ae1aca2c16436..9564222d473cc 100644
--- a/caffe2/operators/listwise_l2r_op.h
+++ b/caffe2/operators/listwise_l2r_op.h
@@ -29,16 +29,16 @@ class LambdaRankNdcgOp final : public Operator<Context> {
   float LambdaRankNdcgSession(
       int start_index,
       int end_index,
-      const Tensor<CPUContext>& y,
-      const Tensor<CPUContext>& r,
-      Tensor<CPUContext>** dy);
+      const Tensor& y,
+      const Tensor& r,
+      Tensor** dy);
   bool use_ndcg_as_loss_;
-  Tensor<Context> gain_;
-  Tensor<Context> discount_;
-  Tensor<Context> rank_idx_;
-  Tensor<Context> ideal_idx_;
-  Tensor<Context> lambda_;
-  Tensor<Context> inv_log_i_;
+  Tensor gain_{Context::GetDeviceType()};
+  Tensor discount_{Context::GetDeviceType()};
+  Tensor rank_idx_{Context::GetDeviceType()};
+  Tensor ideal_idx_{Context::GetDeviceType()};
+  Tensor lambda_{Context::GetDeviceType()};
+  Tensor inv_log_i_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h
index 4b21fb2660d18..de8380e45e98c 100644
--- a/caffe2/operators/load_save_op.h
+++ b/caffe2/operators/load_save_op.h
@@ -536,7 +536,7 @@ class CheckpointOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     int64_t iter =
-        OperatorBase::Input<TensorCPU>(0).template data<int64_t>()[0];
+        OperatorBase::Input<Tensor>(0, CPU).template data<int64_t>()[0];
     if (iter % every_ == 0) {
       GetMutableArgument("db", true, &save_op_def_)
           ->set_s(FormatString(db_pattern_, iter));
diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc
index 334570306c4f9..1cba60e86d978 100644
--- a/caffe2/operators/local_response_normalization_op.cc
+++ b/caffe2/operators/local_response_normalization_op.cc
@@ -15,7 +15,7 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
   const int image_size = C * H * W;
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
 
   if (OutputSize() > 1) {
     scale_ = Output(1);
@@ -25,11 +25,10 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
   math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
-  TensorCPU padded_square(
-      vector<TIndex>{C + size_ - 1, H, W});
-  float* padded_square_data = padded_square.mutable_data<float>();
+  Tensor padded_square(vector<TIndex>{C + size_ - 1, H, W}, CPU);
+  float* padded_square_data = padded_square.template mutable_data<float>();
   math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                                &context_);
   const float alpha_over_size = alpha_ / size_;
@@ -48,7 +47,7 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     for (int c = 1; c < C; ++c) {
       float* this_scale_slice = scale_data + n * image_size + c * H * W;
       // copy previous scale
-      context_.Copy<float, CPUContext, CPUContext>(
+      context_.CopyFromCPU<float>(
           H * W, this_scale_slice - H * W, this_scale_slice);
       // add head
       math::Axpy<float, CPUContext>(
@@ -80,7 +79,7 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   const int num_rows = N * H * W;
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
 
   if (OutputSize() > 1) {
     scale_ = Output(1);
@@ -90,10 +89,10 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
 
-  TensorCPU padded_square(vector<TIndex>(1, C + size_ - 1));
-  float* padded_square_data = padded_square.mutable_data<float>();
+  Tensor padded_square(vector<TIndex>(1, C + size_ - 1), CPU);
+  float* padded_square_data = padded_square.template mutable_data<float>();
   math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                                &context_);
   const float alpha_over_size = alpha_ / size_;
@@ -143,13 +142,12 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     scale_ = &local_scale_tensor_;
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
 
-  TensorCPU padded_ratio(
-      vector<TIndex>{C + size_ - 1, H, W});
-  float* padded_ratio_data = padded_ratio.mutable_data<float>();
+  Tensor padded_ratio(vector<TIndex>{C + size_ - 1, H, W}, CPU);
+  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
   // Compute scale(copied from LRNOp) - reusing padded_ratio
   math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
   math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
@@ -170,7 +168,7 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     for (int c = 1; c < C; ++c) {
       float* this_scale_slice = scale_data + n * image_size + c * H * W;
       // copy previous scale
-      context_.Copy<float, CPUContext, CPUContext>(
+      context_.CopyFromCPU<float>(
           H * W, this_scale_slice - H * W, this_scale_slice);
       // add head
       math::Axpy<float, CPUContext>(
@@ -185,9 +183,8 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
 
   math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                                &context_);
-  TensorCPU accum_ratio(vector<TIndex>{H, W});
-  float* accum_ratio_data = accum_ratio.mutable_data<float>();
-
+  Tensor accum_ratio(vector<TIndex>{H, W}, CPU);
+  float* accum_ratio_data = accum_ratio.template mutable_data<float>();
 
   const float cache_ratio = 2. * alpha_ * beta_ / size_;
   const int inverse_pre_pad = size_ - (size_ + 1) / 2;
@@ -246,9 +243,9 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
     scale_ = &local_scale_tensor_;
   }
   scale_->ResizeLike(X);
-  TensorCPU padded_ratio(vector<TIndex>(1, C + size_ - 1));
-  float* padded_ratio_data = padded_ratio.mutable_data<float>();
-  float* scale_data = scale_->mutable_data<float>();
+  Tensor padded_ratio(vector<TIndex>(1, C + size_ - 1), CPU);
+  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
   // Compute scale(copied from LRNOp) - reusing padded_ratio
   math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
   math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
@@ -278,7 +275,7 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   const float* Ydata = Y.data<float>();
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   for (int n = 0; n < num_rows; ++n) {
     const int offset = n * C;
     for (int c = 0; c < C; ++c) {
diff --git a/caffe2/operators/local_response_normalization_op.cu b/caffe2/operators/local_response_normalization_op.cu
index a6a8f5011e33e..edcd8e878e774 100644
--- a/caffe2/operators/local_response_normalization_op.cu
+++ b/caffe2/operators/local_response_normalization_op.cu
@@ -186,7 +186,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const int W = X.dim32(3);
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   if (OutputSize() > 1) {
     scale_ = Output(1);
   } else {
@@ -195,7 +195,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
 
   int n_threads = N * H * W;
   LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
@@ -219,7 +219,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const int C = X.dim32(3);
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   if (OutputSize() > 1) {
     scale_ = Output(1);
   } else {
@@ -228,7 +228,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
 
   int n_threads = X.size();
   LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
@@ -263,13 +263,13 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
     scale_ = &local_scale_tensor_;
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
   int n_threads = N * H * W;
   LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                         0, context_.cuda_stream()>>>(
       n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
 
   LRNComputeDiffNCHW<float><<<CAFFE_GET_BLOCKS(n_threads),
                               CAFFE_CUDA_NUM_THREADS,
@@ -301,19 +301,30 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   }
   scale_->ResizeLike(X);
 
-  float* scale_data = scale_->mutable_data<float>();
+  float* scale_data = scale_->template mutable_data<float>();
   int n_threads = X.size();
   LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                         0, context_.cuda_stream()>>>(
       n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
 
-  LRNComputeDiffNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
-                              CAFFE_CUDA_NUM_THREADS, 0,
-                              context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y.data<float>(), scale_data,
-      dY.data<float>(),
-      X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3), size_, -beta_,
-      2.f * alpha_ * beta_ / size_, dX->mutable_data<float>());
+  LRNComputeDiffNHWC<float>
+      <<<CAFFE_GET_BLOCKS(X.size()),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          X.size(),
+          X.data<float>(),
+          Y.data<float>(),
+          scale_data,
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          size_,
+          -beta_,
+          2.f * alpha_ * beta_ / size_,
+          dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/local_response_normalization_op.h b/caffe2/operators/local_response_normalization_op.h
index 79f388caea23f..9b7f8da8095ef 100644
--- a/caffe2/operators/local_response_normalization_op.h
+++ b/caffe2/operators/local_response_normalization_op.h
@@ -66,8 +66,8 @@ class LRNOp final : public LRNOpBase<T, Context> {
  protected:
   // Input: X; Output: Y, scale.
   OUTPUT_TAGS(OUTPUT, SCALE);
-  Tensor<Context>* scale_ = nullptr;
-  Tensor<Context> local_scale_tensor_;
+  Tensor* scale_ = nullptr;
+  Tensor local_scale_tensor_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -83,8 +83,8 @@ class LRNGradientOp final : public LRNOpBase<T, Context> {
  protected:
   // Input: X, Y, scale, dY; Output: dX
   INPUT_TAGS(INPUT, OUTPUT, SCALE, OUTPUT_GRAD);
-  Tensor<Context>* scale_ = nullptr;
-  Tensor<Context> local_scale_tensor_;
+  Tensor* scale_ = nullptr;
+  Tensor local_scale_tensor_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/locally_connected_op.h b/caffe2/operators/locally_connected_op.h
index 7b4bbd4de87db..cf5bf63e6893e 100644
--- a/caffe2/operators/locally_connected_op.h
+++ b/caffe2/operators/locally_connected_op.h
@@ -37,9 +37,9 @@ class LocallyConnectedOp final : public ConvPoolOpBase<Context> {
       const T* filter_data,
       const T* bias_data,
       T* Y_data,
-      Tensor<Context>* column_buffer,
-      Tensor<Context>* column_transposed_buffer,
-      Tensor<Context>* output_buffer);
+      Tensor* column_buffer,
+      Tensor* column_transposed_buffer,
+      Tensor* output_buffer);
 
   void RunOnDeviceWithOrderNHWCImpl(
       const lc_op_util::ShapeParams& shape,
@@ -47,16 +47,16 @@ class LocallyConnectedOp final : public ConvPoolOpBase<Context> {
       const T* filter_data,
       const T* bias_data,
       T* Y_data,
-      Tensor<Context>* column_buffer,
-      Tensor<Context>* column_transposed_buffer,
-      Tensor<Context>* Y_transposed_buffer);
+      Tensor* column_buffer,
+      Tensor* column_transposed_buffer,
+      Tensor* Y_transposed_buffer);
 
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
 
   // Buffer.
-  Tensor<Context> column_buffer_;
-  Tensor<Context> column_transposed_buffer_;
-  Tensor<Context> Y_transposed_buffer_;
+  Tensor column_buffer_{Context::GetDeviceType()};
+  Tensor column_transposed_buffer_{Context::GetDeviceType()};
+  Tensor Y_transposed_buffer_{Context::GetDeviceType()};
 
   // Input: X, W, b
   // Output: Y
@@ -93,9 +93,9 @@ class LocallyConnectedGradientOp final : public ConvPoolOpBase<Context> {
       T* dfilter_data,
       T* dX_data,
       T* dbias_data,
-      Tensor<Context>* column_buffer,
-      Tensor<Context>* column_transposed_buffer,
-      Tensor<Context>* dY_transposed_buffer);
+      Tensor* column_buffer,
+      Tensor* column_transposed_buffer,
+      Tensor* dY_transposed_buffer);
 
   void RunOnDeviceWithOrderNHWCImpl(
       const lc_op_util::ShapeParams& shape,
@@ -105,18 +105,18 @@ class LocallyConnectedGradientOp final : public ConvPoolOpBase<Context> {
       T* dfilter_data,
       T* dX_data,
       T* dbias_data,
-      Tensor<Context>* column_buffer,
-      Tensor<Context>* column_transposed_buffer,
-      Tensor<Context>* dY_transposed_buffer);
+      Tensor* column_buffer,
+      Tensor* column_transposed_buffer,
+      Tensor* dY_transposed_buffer);
 
   const bool no_bias_;
 
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
 
   // Buffer.
-  Tensor<Context> column_buffer_;
-  Tensor<Context> column_transposed_buffer_;
-  Tensor<Context> dY_transposed_buffer_;
+  Tensor column_buffer_{Context::GetDeviceType()};
+  Tensor column_transposed_buffer_{Context::GetDeviceType()};
+  Tensor dY_transposed_buffer_{Context::GetDeviceType()};
 
   // input: X, W, dY
   // output: dW, db, and optionally dX
diff --git a/caffe2/operators/locally_connected_op_impl.h b/caffe2/operators/locally_connected_op_impl.h
index 4d7762fccbbb3..76d7228c342ac 100644
--- a/caffe2/operators/locally_connected_op_impl.h
+++ b/caffe2/operators/locally_connected_op_impl.h
@@ -189,9 +189,9 @@ void LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNCHWImpl(
     const T* filter_data,
     const T* bias_data,
     T* Y_data,
-    Tensor<Context>* column_buffer,
-    Tensor<Context>* column_transposed_buffer,
-    Tensor<Context>* Y_transposed_buffer) {
+    Tensor* column_buffer,
+    Tensor* column_transposed_buffer,
+    Tensor* Y_transposed_buffer) {
   const int input_stride = shape.C / group_ * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
@@ -292,9 +292,9 @@ void LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNHWCImpl(
     const T* filter_data,
     const T* bias_data,
     T* Y_data,
-    Tensor<Context>* column_buffer,
-    Tensor<Context>* column_transposed_buffer,
-    Tensor<Context>* Y_transposed_buffer) {
+    Tensor* column_buffer,
+    Tensor* column_transposed_buffer,
+    Tensor* Y_transposed_buffer) {
   const int input_stride = shape.C * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
@@ -550,9 +550,9 @@ void LocallyConnectedGradientOp<T, Context>::RunOnDeviceWithOrderNCHWImpl(
     T* dfilter_data,
     T* dX_data,
     T* dbias_data,
-    Tensor<Context>* column_buffer,
-    Tensor<Context>* column_transposed_buffer,
-    Tensor<Context>* dY_transposed_buffer) {
+    Tensor* column_buffer,
+    Tensor* column_transposed_buffer,
+    Tensor* dY_transposed_buffer) {
   const int input_stride = shape.C * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
@@ -726,9 +726,9 @@ void LocallyConnectedGradientOp<T, Context>::RunOnDeviceWithOrderNHWCImpl(
     T* dfilter_data,
     T* dX_data,
     T* dbias_data,
-    Tensor<Context>* column_buffer,
-    Tensor<Context>* column_transposed_buffer,
-    Tensor<Context>* dY_transposed_buffer) {
+    Tensor* column_buffer,
+    Tensor* column_transposed_buffer,
+    Tensor* dY_transposed_buffer) {
   const int input_stride = shape.C * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
diff --git a/caffe2/operators/logit_op.cu b/caffe2/operators/logit_op.cu
index d2e8351fcac95..c431e5b519eec 100644
--- a/caffe2/operators/logit_op.cu
+++ b/caffe2/operators/logit_op.cu
@@ -54,7 +54,11 @@ bool LogitGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      n, X.data<float>(), dY.data<float>(), eps_, dX->mutable_data<float>());
+      n,
+      X.data<float>(),
+      dY.data<float>(),
+      eps_,
+      dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc
index c3795a0bb216e..f877786648350 100644
--- a/caffe2/operators/lp_pool_op.cc
+++ b/caffe2/operators/lp_pool_op.cc
@@ -17,7 +17,7 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
   const auto inv_p = 1.0 / p;
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
   // The main loop
   int channels = X.dim32(1);
@@ -67,7 +67,7 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
   const auto inv_p = 1.0 / p;
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
   // The main loop
   int pooled_height = Y->dim32(1);
@@ -115,11 +115,11 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
   // TODO(Yangqing): Add shape checks.
   dX->ResizeLike(X);
   math::Set<float, CPUContext>(
-      X.size(), 0, dX->mutable_data<float>(), &context_);
+      X.size(), 0, dX->template mutable_data<float>(), &context_);
   const float* dYdata = dY.data<float>();
   const float* Xdata = X.data<float>();
   const float* Ydata = Y.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
 
   int channels = X.dim32(1);
   CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
@@ -171,9 +171,9 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
   // TODO(Yangqing): Add shape checks.
   dX->ResizeLike(X);
   math::Set<float, CPUContext>(
-      X.size(), 0, dX->mutable_data<float>(), &context_);
+      X.size(), 0, dX->template mutable_data<float>(), &context_);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   const float* Xdata = X.data<float>();
   const float* Ydata = Y.data<float>();
   // The main loop
diff --git a/caffe2/operators/lp_pool_op.cu b/caffe2/operators/lp_pool_op.cu
index 53f6110294426..1e5b66c5db2bb 100644
--- a/caffe2/operators/lp_pool_op.cu
+++ b/caffe2/operators/lp_pool_op.cu
@@ -242,27 +242,27 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNCHW() {
   auto* Y = Output(0);
   ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
   int output_size = Y->size();
-  LpPoolForwardNCHW<float><<<
-      CAFFE_GET_BLOCKS(output_size),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      output_size,
-      X.data<float>(),
-      X.dim32(0),
-      X.dim32(1),
-      X.dim32(2),
-      X.dim32(3),
-      Y->dim32(2),
-      Y->dim32(3),
-      kernel_h(),
-      kernel_w(),
-      stride_h(),
-      stride_w(),
-      pad_t(),
-      pad_l(),
-      Y->mutable_data<float>(),
-      OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolForwardNCHW<float>
+      <<<CAFFE_GET_BLOCKS(output_size),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          Y->dim32(2),
+          Y->dim32(3),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          Y->template mutable_data<float>(),
+          OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
@@ -272,27 +272,27 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNHWC() {
   auto* Y = Output(0);
   ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
   int output_size = Y->size();
-  LpPoolForwardNHWC<float><<<
-      CAFFE_GET_BLOCKS(output_size),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      output_size,
-      X.data<float>(),
-      X.dim32(0),
-      X.dim32(1),
-      X.dim32(2),
-      X.dim32(3),
-      Y->dim32(1),
-      Y->dim32(2),
-      kernel_h(),
-      kernel_w(),
-      stride_h(),
-      stride_w(),
-      pad_t(),
-      pad_l(),
-      Y->mutable_data<float>(),
-      OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolForwardNHWC<float>
+      <<<CAFFE_GET_BLOCKS(output_size),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          Y->dim32(1),
+          Y->dim32(2),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          Y->template mutable_data<float>(),
+          OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
@@ -306,29 +306,29 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
   auto* dX = Output(0);
   dX->ResizeLike(X);
   ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
-  LpPoolBackwardNCHW<float><<<
-      CAFFE_GET_BLOCKS(X.size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      X.size(),
-      dY.data<float>(),
-      Y.data<float>(),
-      X.data<float>(),
-      X.dim32(0),
-      X.dim32(1),
-      X.dim32(2),
-      X.dim32(3),
-      dY.dim32(2),
-      dY.dim32(3),
-      kernel_h(),
-      kernel_w(),
-      stride_h(),
-      stride_w(),
-      pad_t(),
-      pad_l(),
-      dX->mutable_data<float>(),
-      OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolBackwardNCHW<float>
+      <<<CAFFE_GET_BLOCKS(X.size()),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          Y.data<float>(),
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          dY.dim32(2),
+          dY.dim32(3),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          dX->template mutable_data<float>(),
+          OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
@@ -342,29 +342,29 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
   auto* dX = Output(0);
   dX->ResizeLike(X);
   ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
-  LpPoolBackwardNHWC<float><<<
-      CAFFE_GET_BLOCKS(X.size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      X.size(),
-      dY.data<float>(),
-      Y.data<float>(),
-      X.data<float>(),
-      X.dim32(0),
-      X.dim32(1),
-      X.dim32(2),
-      X.dim32(3),
-      dY.dim32(1),
-      dY.dim32(2),
-      kernel_h(),
-      kernel_w(),
-      stride_h(),
-      stride_w(),
-      pad_t(),
-      pad_l(),
-      dX->mutable_data<float>(),
-      OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolBackwardNHWC<float>
+      <<<CAFFE_GET_BLOCKS(X.size()),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          Y.data<float>(),
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          dY.dim32(1),
+          dY.dim32(2),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          dX->template mutable_data<float>(),
+          OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc
index f79d51ad51c44..6af404d115358 100644
--- a/caffe2/operators/lpnorm_op.cc
+++ b/caffe2/operators/lpnorm_op.cc
@@ -15,12 +15,12 @@ bool LpNormOp<float, CPUContext>::RunOnDevice() {
   const float size = average_ ? (float)X.size() : 1.0f;
   CAFFE_ENFORCE_GT(size, 0);
   if (p_ == 1) {
-    *(norm->mutable_data<float>()) =
+    *(norm->template mutable_data<float>()) =
         (ConstEigenVectorMap<float>(X_data, X.size()).array()).abs().sum() /
         size;
     // L1(x) = sum(|x|), L1_average(x) = sum(\x\) / x.size()
   } else if (p_ == 2) {
-    *(norm->mutable_data<float>()) =
+    *(norm->template mutable_data<float>()) =
         (ConstEigenVectorMap<float>(X_data, X.size()).array()).square().sum() /
         size;
     // L2(x) = (sum(|x|^2)), L2_average(x) = sum(|x|^2) / x.size()
@@ -43,15 +43,17 @@ bool LpNormGradientOp<float, CPUContext>::RunOnDevice() {
     for (int i = 0; i < X.size(); ++i) {
       float temp = (X.data<float>())[i];
       if (temp < -kEps) {
-        dX->mutable_data<float>()[i] = -(dnorm.data<float>())[0] / size;
+        dX->template mutable_data<float>()[i] =
+            -(dnorm.data<float>())[0] / size;
       } else if (temp > kEps) {
-        dX->mutable_data<float>()[i] = (dnorm.data<float>())[0] / size;
+        dX->template mutable_data<float>()[i] = (dnorm.data<float>())[0] / size;
       } else {
-        dX->mutable_data<float>()[i] = 0;
+        dX->template mutable_data<float>()[i] = 0;
       }
     }
   } else if (p_ == 2) {
-    EigenVectorMap<float>(dX->mutable_data<float>(), X.size()).array() =
+    EigenVectorMap<float>(dX->template mutable_data<float>(), X.size())
+        .array() =
         ConstEigenVectorMap<float>(X.data<float>(), X.size()).array() * 2.0f *
         ((dnorm.data<float>())[0] / size);
   }
diff --git a/caffe2/operators/lstm_unit_op.h b/caffe2/operators/lstm_unit_op.h
index 73afcbe08684a..31ab070f8d26a 100644
--- a/caffe2/operators/lstm_unit_op.h
+++ b/caffe2/operators/lstm_unit_op.h
@@ -176,7 +176,7 @@ class LSTMUnitOp : public Operator<Context> {
     }
 
     const auto t = static_cast<OperatorBase*>(this)
-                       ->Input<Tensor<CPUContext>>(TIMESTEP)
+                       ->Input<Tensor>(TIMESTEP, CPU)
                        .template data<int32_t>()[0];
     Output(CELL_T)->ResizeLike(Input(CELL_T_M_1));
     auto* C = Output(CELL_T)->template mutable_data<T>();
@@ -253,7 +253,7 @@ class LSTMUnitGradientOp : public Operator<Context> {
     const auto* C_prev = Input(CELL_T_M_1).template data<T>();
     const auto* X = Input(GATES).template data<T>();
     const auto t = static_cast<OperatorBase*>(this)
-                       ->Input<Tensor<CPUContext>>(TIMESTEP)
+                       ->Input<Tensor>(TIMESTEP, CPU)
                        .template data<int32_t>()[0];
     const auto* C = Input(CELL_T).template data<T>();
     const auto* H = Input(HIDDEN_T).template data<T>();
diff --git a/caffe2/operators/map_ops.h b/caffe2/operators/map_ops.h
index 8d1a18f8fc135..5a436d7f2502a 100644
--- a/caffe2/operators/map_ops.h
+++ b/caffe2/operators/map_ops.h
@@ -201,9 +201,9 @@ class MapSerializer : public BlobSerializerBase {
     CAFFE_ENFORCE(blob.IsType<MapType>());
     const MapType& map_data = blob.template Get<MapType>();
     TIndex sz = map_data.size();
-    Tensor<CPUContext> key_tensor;
+    Tensor key_tensor(CPU);
     key_tensor.Resize(sz);
-    Tensor<CPUContext> value_tensor;
+    Tensor value_tensor(CPU);
     value_tensor.Resize(sz);
     auto* key_data = key_tensor.mutable_data<KEY_T>();
     auto* value_data = value_tensor.mutable_data<VALUE_T>();
@@ -215,7 +215,7 @@ class MapSerializer : public BlobSerializerBase {
     }
 
     TensorProtos tensor_protos;
-    TensorSerializer<CPUContext> ser;
+    TensorSerializer ser;
     ser.Serialize(
         key_tensor, name, tensor_protos.add_protos(), 0, key_tensor.size());
     ser.Serialize(
@@ -239,8 +239,8 @@ class MapDeserializer : public BlobDeserializerBase {
     CAFFE_ENFORCE(
         tensor_protos.ParseFromString(proto.content()),
         "Fail to parse TensorProtos");
-    TensorDeserializer<CPUContext> deser;
-    Tensor<CPUContext> key_tensor, value_tensor;
+    TensorDeserializer deser;
+    Tensor key_tensor(CPU), value_tensor(CPU);
     deser.Deserialize(tensor_protos.protos(0), &key_tensor);
     deser.Deserialize(tensor_protos.protos(1), &value_tensor);
     auto* key_data = key_tensor.data<KEY_T>();
diff --git a/caffe2/operators/margin_ranking_criterion_op.cc b/caffe2/operators/margin_ranking_criterion_op.cc
index b699c4bb7b756..30b4f2731af5f 100644
--- a/caffe2/operators/margin_ranking_criterion_op.cc
+++ b/caffe2/operators/margin_ranking_criterion_op.cc
@@ -23,7 +23,7 @@ bool MarginRankingCriterionOp<CPUContext>::RunOnDevice() {
   const float* X1data = X1.data<float>();
   const float* X2data = X2.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output = loss->mutable_data<float>();
+  float* output = loss->template mutable_data<float>();
   for (int i = 0; i < X1.size(); ++i) {
     output[i] = std::max(-Ydata[i] * (X1data[i] - X2data[i]) + margin_, 0.f);
   }
@@ -47,8 +47,8 @@ bool MarginRankingCriterionGradientOp<CPUContext>::RunOnDevice() {
   const int* Ydata = Y.data<int>();
   const float* dLoss_data = dLoss.data<float>();
 
-  float* dX1_data = dX1->mutable_data<float>();
-  float* dX2_data = dX2->mutable_data<float>();
+  float* dX1_data = dX1->template mutable_data<float>();
+  float* dX2_data = dX2->template mutable_data<float>();
   for (int i = 0; i < X1.size(); ++i) {
     auto dist = -Ydata[i] * (X1data[i] - X2data[i]) + margin_;
     if (dist < 0.f) {
@@ -72,9 +72,9 @@ OPERATOR_SCHEMA(MarginRankingCriterion)
     .NumInputs(3)
     .NumOutputs(1)
     .SetDoc(R"DOC(
-MarginRankingCriterion takes two input data X1 (Tensor<float>),
-X2 (Tensor<float>), and label Y (Tensor<int>) to produce the
-loss (Tensor<float>) where the loss function,
+MarginRankingCriterion takes two input data X1 (Tensor),
+X2 (Tensor), and label Y (Tensor) to produce the
+loss (Tensor) where the loss function,
 loss(X1, X2, Y) = max(0, -Y * (X1 - X2) + margin), is applied to
 the tensor elementwise.
 
diff --git a/caffe2/operators/margin_ranking_criterion_op.cu b/caffe2/operators/margin_ranking_criterion_op.cu
index b01513a40b4f7..5593a1db8cd49 100644
--- a/caffe2/operators/margin_ranking_criterion_op.cu
+++ b/caffe2/operators/margin_ranking_criterion_op.cu
@@ -45,7 +45,7 @@ bool MarginRankingCriterionOp<CUDAContext>::RunOnDevice() {
   const float* X1data = X1.data<float>();
   const float* X2data = X2.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output_data = loss->mutable_data<float>();
+  float* output_data = loss->template mutable_data<float>();
 
   MRCKernel<<<CAFFE_GET_BLOCKS(X1.size()), CAFFE_CUDA_NUM_THREADS,
               0, context_.cuda_stream()>>>(
@@ -70,8 +70,8 @@ bool MarginRankingCriterionGradientOp<CUDAContext>::RunOnDevice() {
   const int* Ydata = Y.data<int>();
   const float* dOutput_data = dOutput.data<float>();
 
-  float* dX1_data = dX1->mutable_data<float>();
-  float* dX2_data = dX2->mutable_data<float>();
+  float* dX1_data = dX1->template mutable_data<float>();
+  float* dX2_data = dX2->template mutable_data<float>();
   MRCGradientKernel<<<CAFFE_GET_BLOCKS(X1.size()), CAFFE_CUDA_NUM_THREADS,
                       0, context_.cuda_stream()>>>(
       X1.size(), Ydata, X1data, X2data,
diff --git a/caffe2/operators/max_pool_with_index.cu b/caffe2/operators/max_pool_with_index.cu
index 5ac3c58bb5f89..eee10c488f5da 100644
--- a/caffe2/operators/max_pool_with_index.cu
+++ b/caffe2/operators/max_pool_with_index.cu
@@ -115,27 +115,27 @@ bool MaxPoolWithIndexOp::DoRunWithType() {
   int output_size = Y->size();
   mask->Resize(output_size);
 
-  MaxPoolForward<T><<<
-      CAFFE_GET_BLOCKS(output_size),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      output_size,
-      X.data<T>(),
-      X.dim32(0),
-      X.dim32(1),
-      X.dim32(2),
-      X.dim32(3),
-      Y->dim32(2),
-      Y->dim32(3),
-      kernel_h(),
-      kernel_w(),
-      stride_h(),
-      stride_w(),
-      pad_t(),
-      pad_l(),
-      Y->mutable_data<T>(),
-      mask->mutable_data<int>());
+  MaxPoolForward<T>
+      <<<CAFFE_GET_BLOCKS(output_size),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          output_size,
+          X.data<T>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          Y->dim32(2),
+          Y->dim32(3),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          Y->template mutable_data<T>(),
+          mask->template mutable_data<int>());
   return true;
 }
 
diff --git a/caffe2/operators/mem_query_op.cu b/caffe2/operators/mem_query_op.cu
index 767be8af2c385..e8351376f23e2 100644
--- a/caffe2/operators/mem_query_op.cu
+++ b/caffe2/operators/mem_query_op.cu
@@ -19,12 +19,14 @@ class GetGPUMemoryUsageOp final : public Operator<CUDAContext> {
 
     auto* stats = Output(0);
     stats->Resize(2, total_by_gpu.size());
-    context_.Copy<long, CPUContext, CUDAContext>(
-        total_by_gpu.size(), total_by_gpu.data(), stats->mutable_data<long>());
-    context_.Copy<long, CPUContext, CUDAContext>(
+    context_.CopyFromCPU<long>(
+        total_by_gpu.size(),
+        total_by_gpu.data(),
+        stats->template mutable_data<long>());
+    context_.CopyFromCPU<long>(
         max_by_gpu.size(),
         max_by_gpu.data(),
-        stats->mutable_data<long>() + total_by_gpu.size());
+        stats->template mutable_data<long>() + total_by_gpu.size());
     return true;
   }
 };
diff --git a/caffe2/operators/multi_class_accuracy_op.cc b/caffe2/operators/multi_class_accuracy_op.cc
index 9eda6fbe2c1b0..581c034c2b8bd 100644
--- a/caffe2/operators/multi_class_accuracy_op.cc
+++ b/caffe2/operators/multi_class_accuracy_op.cc
@@ -20,8 +20,8 @@ bool MultiClassAccuracyOp<float, CPUContext>::RunOnDevice() {
 
   const auto* Xdata = X.data<float>();
   const auto* labeldata = label.data<int>();
-  auto* accuracies = Y0->mutable_data<float>();
-  auto* amounts = Y1->mutable_data<int>();
+  auto* accuracies = Y0->template mutable_data<float>();
+  auto* amounts = Y1->template mutable_data<int>();
   std::fill(accuracies, accuracies + D, 0);
   std::fill(amounts, amounts + D, 0);
 
diff --git a/caffe2/operators/multi_class_accuracy_op.cu b/caffe2/operators/multi_class_accuracy_op.cu
index f5672146fd418..275005be50889 100644
--- a/caffe2/operators/multi_class_accuracy_op.cu
+++ b/caffe2/operators/multi_class_accuracy_op.cu
@@ -51,8 +51,8 @@ bool MultiClassAccuracyOp<float, CUDAContext>::RunOnDevice() {
 
   const float* Xdata = X.data<float>();
   const int* labeldata = label.data<int>();
-  float* accuracies = Y0->mutable_data<float>();
-  int* amounts = Y1->mutable_data<int>();
+  float* accuracies = Y0->template mutable_data<float>();
+  int* amounts = Y1->template mutable_data<int>();
   math::Set<float, CUDAContext>(D, 0.0, accuracies, &context_);
   math::Set<int, CUDAContext>(D, 0, amounts, &context_);
 
diff --git a/caffe2/operators/norm_planar_yuv_op.cc b/caffe2/operators/norm_planar_yuv_op.cc
index ea3ccc222dc96..c0f997484ca2a 100644
--- a/caffe2/operators/norm_planar_yuv_op.cc
+++ b/caffe2/operators/norm_planar_yuv_op.cc
@@ -28,7 +28,7 @@ class NormalizePlanarYUVOp : public Operator<CPUContext> {
     CAFFE_ENFORCE(C == M.dim(1));
     CAFFE_ENFORCE(C == S.dim(1));
     const auto* Xdata = X.data<float>();
-    auto* Zdata = Z->mutable_data<float>();
+    auto* Zdata = Z->template mutable_data<float>();
 
     int offset = H * W;
     for (auto n = 0; n < N; n++) { // realistically N will always be 1
diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu
index dcffe02f650ab..8a8adb6dbe313 100644
--- a/caffe2/operators/normalize_ops.cu
+++ b/caffe2/operators/normalize_ops.cu
@@ -112,7 +112,12 @@ bool NormalizeGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      M, N, SF, X.data<float>(), dY.data<float>(), dX->mutable_data<float>());
+      M,
+      N,
+      SF,
+      X.data<float>(),
+      dY.data<float>(),
+      dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/numpy_tile_op.h b/caffe2/operators/numpy_tile_op.h
index 66a5cc42393c3..2413652e32771 100644
--- a/caffe2/operators/numpy_tile_op.h
+++ b/caffe2/operators/numpy_tile_op.h
@@ -37,7 +37,7 @@ class NumpyTileOp : public Operator<Context> {
     // Alternate inputs and outputs between two buffers. Repeatedly apply the
     // Tile kernel along each axis. Then copy out the resulting data into the
     // output tensor.
-    Tensor<Context> *src = &buffer, *dst = output;
+    Tensor *src = &buffer, *dst = output;
     src->CopyFrom(input);
     vector<TIndex> output_dims(input.dims());
     for (size_t i = 0; i < repeats.size(); ++i) {
@@ -98,15 +98,14 @@ class NumpyTileOp : public Operator<Context> {
       char* output_data) {
     for (auto i = 0; i < outer_dim; ++i) {
       for (auto t = 0; t < num_tiles; ++t) {
-        context_.template CopyItems<Context, Context>(
-            meta, inner_dim, input_data, output_data);
+        context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data);
         output_data += inner_dim * item_size;
       }
       input_data += inner_dim * item_size;
     }
   }
 
-  Tensor<Context> buffer;
+  Tensor buffer{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index bb8a1dbc77441..dda273dd47cc3 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -92,8 +92,8 @@ template <>
 void OneHotOp<CPUContext>::DoOneHotOp(
     TIndex batch_size,
     TIndex index_size,
-    const Tensor<CPUContext>& indices,
-    Tensor<CPUContext>* one_hots) {
+    const Tensor& indices,
+    Tensor* one_hots) {
   const TIndex* indices_ptr = indices.template data<TIndex>();
   float* one_hots_ptr = one_hots->template mutable_data<float>();
   memset(one_hots_ptr, 0, one_hots->nbytes());
@@ -187,7 +187,7 @@ class SegmentOneHotOp : public Operator<CPUContext> {
     auto* indices_ptr = indices.data<int64_t>();
     auto* one_hots = Output(0);
     one_hots->Resize(batch_size, index_size);
-    auto* one_hots_ptr = one_hots->mutable_data<float>();
+    auto* one_hots_ptr = one_hots->template mutable_data<float>();
     if (one_hots->size() == 0) {
       return true;
     }
diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu
index 9cca0a5b2ffeb..e1b6e18daf870 100644
--- a/caffe2/operators/one_hot_ops.cu
+++ b/caffe2/operators/one_hot_ops.cu
@@ -19,9 +19,9 @@ template <>
 void OneHotOp<CUDAContext>::DoOneHotOp(
     TIndex batch_size,
     TIndex index_size,
-    const Tensor<CUDAContext>& indices,
-    Tensor<CUDAContext>* output) {
-  float* output_ptr = output->mutable_data<float>();
+    const Tensor& indices,
+    Tensor* output) {
+  float* output_ptr = output->template mutable_data<float>();
   math::Set<float, CUDAContext>(output->size(), 0., output_ptr, &context_);
   OneHotOpKernel<<<
       CAFFE_GET_BLOCKS(batch_size),
diff --git a/caffe2/operators/one_hot_ops.h b/caffe2/operators/one_hot_ops.h
index 1b48b69326f3e..f8d8c3262be11 100644
--- a/caffe2/operators/one_hot_ops.h
+++ b/caffe2/operators/one_hot_ops.h
@@ -24,7 +24,7 @@ class OneHotOp final : public Operator<Context> {
         "indices input must be 1D tensor of data type TIndex");
 
     // Index size input must be in CPU context
-    auto& index_size_tensor = OperatorBase::Input<Tensor<CPUContext>>(1);
+    auto& index_size_tensor = OperatorBase::Input<Tensor>(1, CPU);
     CAFFE_ENFORCE_EQ(
         index_size_tensor.size(),
         1,
@@ -47,8 +47,8 @@ class OneHotOp final : public Operator<Context> {
   void DoOneHotOp(
       TIndex batch_size,
       TIndex index_size,
-      const Tensor<Context>& indices,
-      Tensor<Context>* output);
+      const Tensor& indices,
+      Tensor* output);
 };
 
 template <class Context>
diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h
index d6a72bf9b1e5f..874bf075458ee 100644
--- a/caffe2/operators/onnx_while_op.h
+++ b/caffe2/operators/onnx_while_op.h
@@ -140,16 +140,15 @@ class ONNXWhileOp final : public Operator<Context> {
         for (int i = 0; i < num_loop_carried_deps; ++i) {
           Blob* b = cur_ws->GetBlob(
               scope_->net()->external_output()[i + 1]);
-          const Tensor<Context>& t = b->template Get<Tensor<Context>>();
+          const Tensor& t = b->template Get<Tensor>();
           scope_->lcd_tensor(i)->CopyFrom(t);
         }
         // Copy out scan_outputs
         for (int i = 0; i < num_scan_outputs; ++i) {
           int net_output_idx = i + 1 + num_loop_carried_deps;
-          const Tensor<Context>& scan_output =
-              cur_ws->GetBlob(
-                  scope_->net()->external_output()[net_output_idx])
-                  ->template Get<Tensor<Context>>();
+          const Tensor& scan_output =
+              cur_ws->GetBlob(scope_->net()->external_output()[net_output_idx])
+                  ->template Get<Tensor>();
           auto* scan_output_target = Output(i + num_loop_carried_deps);
           if (itr == 0) {
             auto dims = scan_output.dims();
@@ -214,22 +213,23 @@ class ONNXWhileOp final : public Operator<Context> {
       lcd_tensors_.clear();
       for (int i = 2; i < body_net_def.external_input_size(); ++i) {
         Blob* b = loop_ws_->CreateBlob(body_net_def.external_input(i));
-        Tensor<Context>* t = b->template GetMutable<Tensor<Context>>();
+        Tensor* t = b->GetMutableTensor(Context::GetDeviceType());
         lcd_tensors_.push_back(t);
       }
       // First output is the iteration variable
       auto* iteration_var_blob = loop_ws_->CreateBlob(
           body_net_def.external_input(0));
       iteration_var_ =
-          iteration_var_blob->template GetMutable<Tensor<Context>>();
+          iteration_var_blob->GetMutableTensor(Context::GetDeviceType());
 
-      input_condition_var_ = loop_ws_->CreateBlob(
-          body_net_def.external_input(1))
-          ->template GetMutable<Tensor<Context>>();
+      input_condition_var_ =
+          loop_ws_->CreateBlob(body_net_def.external_input(1))
+              ->GetMutableTensor(Context::GetDeviceType());
 
       auto* condition_var_blob =
           loop_ws_->CreateBlob(body_net_def.external_output(0));
-      condition_var_ = condition_var_blob->template GetMutable<Tensor<Context>>();
+      condition_var_ =
+          condition_var_blob->GetMutableTensor(Context::GetDeviceType());
       condition_var_->Resize(1);
       condition_var_->template mutable_data<bool>();
 
@@ -254,7 +254,7 @@ class ONNXWhileOp final : public Operator<Context> {
       return *iteration_var_ptr;
     }
 
-    Tensor<Context>* lcd_tensor(int idx) {
+    Tensor* lcd_tensor(int idx) {
       return lcd_tensors_[idx];
     }
 
@@ -284,11 +284,11 @@ class ONNXWhileOp final : public Operator<Context> {
     Workspace *loop_ws_;
 
     NetBase* body_net_; // owned by a workspace
-    Tensor<Context>* iteration_var_;
-    Tensor<Context>* input_condition_var_;
-    Tensor<Context>* condition_var_;
+    Tensor* iteration_var_;
+    Tensor* input_condition_var_;
+    Tensor* condition_var_;
 
-    std::vector<Tensor<Context>*> lcd_tensors_;
+    std::vector<Tensor*> lcd_tensors_;
   };
 
   NetDef body_net_def_;
diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc
index 3030f45babde5..3cdf252c8c0ed 100644
--- a/caffe2/operators/onnxifi_op.cc
+++ b/caffe2/operators/onnxifi_op.cc
@@ -15,7 +15,7 @@ void BlobToTensorDescriptor(
   // Memory type
   // We only allow weights to be CPU tensor for now
   CAFFE_ENFORCE(
-      blob->template IsType<TensorCPU>(),
+      blob->template IsType<Tensor>(CPU),
       "Initialization blob ",
       name,
       " needs to be TensorCPU");
diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h
index 62d9cdbdaef07..e66ba7ea7383c 100644
--- a/caffe2/operators/operator_fallback_gpu.h
+++ b/caffe2/operators/operator_fallback_gpu.h
@@ -62,8 +62,8 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
   bool RunOnDevice() override {
     bool need_sync = false;
     for (int i = 0; i < InputSize(); ++i) {
-      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
-        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
+      if (OperatorBase::InputIsType<Tensor>(i, CUDA)) {
+        local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom(
             Input(i), &context_);
         need_sync = true;
       } else {
@@ -93,11 +93,10 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
         continue;
       }
       CAFFE_ENFORCE(
-          local_output_blobs_[i]->template IsType<TensorCPU>(),
+          local_output_blobs_[i]->template IsType<Tensor>(CPU),
           "GPU fallback op currently does not support non-TensorCPU "
           "output type who needs copying.");
-      Output(i)->CopyFrom(
-          local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
+      Output(i)->CopyFrom(local_output_blobs_[i]->template Get<TensorCPU>());
     }
     return true;
   }
diff --git a/caffe2/operators/operator_fallback_gpu_test.cc b/caffe2/operators/operator_fallback_gpu_test.cc
index eb6c225478cb1..e562858c073ec 100644
--- a/caffe2/operators/operator_fallback_gpu_test.cc
+++ b/caffe2/operators/operator_fallback_gpu_test.cc
@@ -37,11 +37,11 @@ TEST(OperatorFallbackTest, IncrementByOneOp) {
       "IncrementByOne", "", vector<string>{"X"},
       vector<string>{"X"});
   Workspace ws;
-  TensorCPU source_tensor(vector<TIndex>{2, 3});
+  Tensor source_tensor(vector<TIndex>{2, 3}, CPU);
   for (int i = 0; i < 6; ++i) {
     source_tensor.mutable_data<float>()[i] = i;
   }
-  ws.CreateBlob("X")->GetMutable<TensorCPU>()->CopyFrom(source_tensor);
+  ws.CreateBlob("X")->GetMutableTensor(CPU)->CopyFrom(source_tensor);
   unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
   EXPECT_TRUE(op.get() != nullptr);
   EXPECT_TRUE(op->Run());
@@ -61,16 +61,16 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) {
       vector<string>{"X"});
   op_def.mutable_device_option()->set_device_type(CUDA);
   Workspace ws;
-  TensorCPU source_tensor(vector<TIndex>{2, 3});
+  Tensor source_tensor(vector<TIndex>{2, 3}, CPU);
   for (int i = 0; i < 6; ++i) {
     source_tensor.mutable_data<float>()[i] = i;
   }
-  ws.CreateBlob("X")->GetMutable<TensorCUDA>()->CopyFrom(source_tensor);
+  ws.CreateBlob("X")->GetMutableTensor(CUDA)->CopyFrom(source_tensor);
   unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
   EXPECT_TRUE(op.get() != nullptr);
   EXPECT_TRUE(op->Run());
   const TensorCUDA& output = ws.GetBlob("X")->Get<TensorCUDA>();
-  TensorCPU output_cpu(output);
+  Tensor output_cpu(output, CPU);
   EXPECT_EQ(output.ndim(), 2);
   EXPECT_EQ(output.dim(0), 2);
   EXPECT_EQ(output.dim(1), 3);
diff --git a/caffe2/operators/order_switch_ops.cc b/caffe2/operators/order_switch_ops.cc
index 11cc6dedc24f9..7e862eb39ff11 100644
--- a/caffe2/operators/order_switch_ops.cc
+++ b/caffe2/operators/order_switch_ops.cc
@@ -10,7 +10,7 @@ bool NHWC2NCHWOp<float, CPUContext>::RunOnDevice() {
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
   Y->Resize(N, C, H, W);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   for (int n = 0; n < N; ++n) {
     for (int h = 0; h < H; ++h) {
       for (int w = 0; w < W; ++w) {
@@ -31,7 +31,7 @@ bool NCHW2NHWCOp<float, CPUContext>::RunOnDevice() {
   const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   Y->Resize(N, H, W, C);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   for (int n = 0; n < N; ++n) {
     for (int c = 0; c < C; ++c) {
       for (int h = 0; h < H; ++h) {
@@ -66,20 +66,18 @@ OPERATOR_SCHEMA(NHWC2NCHW)
 The operator switches the order of data in a tensor from NHWC- sample index N,
 height H, width H and channels C, to the NCHW order.
 )DOC")
-    .Input(0, "data", "The input data (Tensor<float>) in the NHWC order.")
-    .Output(
-        0,
-        "output",
-        "The output tensor (Tensor<float>) in the NCHW order.");
+    .Input(0, "data", "The input data (Tensor) in the NHWC order.")
+    .Output(0, "output", "The output tensor (Tensor) in the NCHW order.");
 
-OPERATOR_SCHEMA(NCHW2NHWC).NumInputs(1).NumOutputs(1)
-  .SetDoc(R"DOC(
+OPERATOR_SCHEMA(NCHW2NHWC)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
 The operator switches the order of data in a tensor from NCHW- sample index N,
 channels C, height H and width W, to the NHWC order.
 )DOC")
-  .Input(0, "data", "The input data (Tensor<float>) in the NCHW order.")
-  .Output(0, "output", "The output tensor (Tensor<float>) in the NHWC order.");
-
+    .Input(0, "data", "The input data (Tensor) in the NCHW order.")
+    .Output(0, "output", "The output tensor (Tensor) in the NHWC order.");
 
 class GetNHWC2NCHWGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu
index 2d77b5da85a24..f7fa090248c43 100644
--- a/caffe2/operators/order_switch_ops.cu
+++ b/caffe2/operators/order_switch_ops.cu
@@ -30,9 +30,12 @@ bool NHWC2NCHWOp<float, CUDAContext>::RunOnDevice() {
   DCHECK_EQ(X.ndim(), 4);
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
   Y->Resize(N, C, H, W);
-  NHWC2NCHWKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
-                    0, context_.cuda_stream()>>>(
-      N, H * W, C, X.data<float>(), Y->mutable_data<float>());
+  NHWC2NCHWKernel<<<
+      CAFFE_GET_BLOCKS(X.size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N, H * W, C, X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
@@ -43,9 +46,12 @@ bool NCHW2NHWCOp<float, CUDAContext>::RunOnDevice() {
   DCHECK_EQ(X.ndim(), 4);
   const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   Y->Resize(N, H, W, C);
-  NCHW2NHWCKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
-                    0, context_.cuda_stream()>>>(
-      N, C, H * W, X.data<float>(), Y->mutable_data<float>());
+  NCHW2NHWCKernel<<<
+      CAFFE_GET_BLOCKS(X.size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N, C, H * W, X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/pack_rnn_sequence_op.h b/caffe2/operators/pack_rnn_sequence_op.h
index c2fcb7d6beb00..74d40f6bfd47d 100644
--- a/caffe2/operators/pack_rnn_sequence_op.h
+++ b/caffe2/operators/pack_rnn_sequence_op.h
@@ -73,7 +73,7 @@ class PackRNNSequenceOpBase : public Operator<Context> {
       for (int r = 0; r < lengths_vec[c]; r++) {
         auto input_offset = Forward ? (offset + r) : (r * cols + c);
         auto output_offset = Forward ? (r * cols + c) : (offset + r);
-        context_.template CopyItems<Context, Context>(
+        context_.CopyItemsSameDevice(
             values.meta(),
             block_size,
             values_vec + input_offset * block_size,
diff --git a/caffe2/operators/pack_segments.cc b/caffe2/operators/pack_segments.cc
index 2c2f3fdc4fafc..ab831445e56e3 100644
--- a/caffe2/operators/pack_segments.cc
+++ b/caffe2/operators/pack_segments.cc
@@ -16,7 +16,7 @@ bool PackSegmentsOp<CPUContext>::DoRunWithType2() {
   const auto& data = Input(DATA);
   const auto& lengths = Input(LENGTHS);
   auto* output = Output(0);
-  Tensor<CPUContext>* presence_mask = nullptr;
+  Tensor* presence_mask = nullptr;
   if (return_presence_mask_) {
     presence_mask = Output(1);
   }
@@ -88,7 +88,7 @@ bool PackSegmentsOp<CPUContext>::DoRunWithType2() {
   const auto* d = static_cast<const char*>(data.raw_data());
   TIndex start = 0;
   for (TIndex i = 0; i < lengths.dim(0); ++i) {
-    context_.template CopyItems<CPUContext, CPUContext>(
+    context_.CopyItemsSameDevice(
         data.meta(),
         l[i] * block_size,
         d + block_bytesize * start,
@@ -145,7 +145,7 @@ bool UnpackSegmentsOp<CPUContext>::DoRunWithType2() {
   const auto* d = static_cast<const char*>(data.raw_data());
   TIndex start = 0;
   for (TIndex i = 0; i < lengths.dim(0); ++i) {
-    context_.template CopyItems<CPUContext, CPUContext>(
+    context_.CopyItemsSameDevice(
         data.meta(),
         l[i] * block_size,
         d + block_bytesize * data.dim(1) * i,
diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu
index 8e4bdc49339c1..ae573adde3db4 100644
--- a/caffe2/operators/pack_segments.cu
+++ b/caffe2/operators/pack_segments.cu
@@ -53,9 +53,9 @@ template <typename T>
 int64_t int_array_sum(
     const T* dev_array,
     int64_t num_items,
-    Tensor<CUDAContext>& dev_buffer,
-    Tensor<CUDAContext>& dev_sum,
-    Tensor<CPUContext>& host_sum,
+    Tensor& dev_buffer,
+    Tensor& dev_sum,
+    Tensor& host_sum,
     CUDAContext& context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -82,7 +82,7 @@ int64_t int_array_sum(
       context.cuda_stream());
 
   // Copy to host
-  host_sum.CopyFrom<CUDAContext>(dev_sum);
+  host_sum.CopyFrom(dev_sum);
   context.FinishDeviceComputation();
   return *host_sum.data<int64_t>();
 }
@@ -91,9 +91,9 @@ template <typename T>
 T array_max(
     const T* dev_array,
     int64_t num_items,
-    Tensor<CUDAContext>& dev_max_buffer,
-    Tensor<CUDAContext>& dev_max,
-    Tensor<CPUContext>& host_max,
+    Tensor& dev_max_buffer,
+    Tensor& dev_max,
+    Tensor& host_max,
     CUDAContext& context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -120,7 +120,7 @@ T array_max(
       context.cuda_stream());
 
   // Copy to host
-  host_max.CopyFrom<CUDAContext>(dev_max);
+  host_max.CopyFrom(dev_max);
   context.FinishDeviceComputation();
   return *host_max.data<T>();
 }
@@ -129,8 +129,8 @@ template <typename T>
 void array_prefix_sum_exclusive(
     const T* dev_array,
     const int32_t num_items,
-    Tensor<CUDAContext>& prefix_buffer,
-    Tensor<CUDAContext>& prefix_sum,
+    Tensor& prefix_buffer,
+    Tensor& prefix_sum,
     CUDAContext& context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
diff --git a/caffe2/operators/pack_segments.h b/caffe2/operators/pack_segments.h
index c35299d66d1a2..c6eb759f36dba 100644
--- a/caffe2/operators/pack_segments.h
+++ b/caffe2/operators/pack_segments.h
@@ -51,10 +51,10 @@ class PackSegmentsOp final : public Operator<Context> {
   bool return_presence_mask_;
 
   // Scratch space required by the CUDA version
-  Tensor<Context> dev_buffer_;
-  Tensor<Context> dev_lengths_prefix_sum_;
-  Tensor<Context> dev_max_length_;
-  Tensor<CPUContext> host_max_length_;
+  Tensor dev_buffer_{Context::GetDeviceType()};
+  Tensor dev_lengths_prefix_sum_{Context::GetDeviceType()};
+  Tensor dev_max_length_{Context::GetDeviceType()};
+  Tensor host_max_length_{CPU};
 };
 
 template <class Context>
@@ -81,12 +81,12 @@ class UnpackSegmentsOp final : public Operator<Context> {
 
  private:
   TIndex max_length_;
-  Tensor<Context> dev_buffer_;
-  Tensor<Context> dev_lengths_prefix_sum_;
-  Tensor<Context> dev_max_length_;
-  Tensor<Context> dev_num_cell_;
-  Tensor<CPUContext> host_max_length_;
-  Tensor<CPUContext> host_num_cell_;
+  Tensor dev_buffer_{Context::GetDeviceType()};
+  Tensor dev_lengths_prefix_sum_{Context::GetDeviceType()};
+  Tensor dev_max_length_{Context::GetDeviceType()};
+  Tensor dev_num_cell_{Context::GetDeviceType()};
+  Tensor host_max_length_{CPU};
+  Tensor host_num_cell_{CPU};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/pad_op.cc b/caffe2/operators/pad_op.cc
index 74de23e2a9763..d7ac46ff8136b 100644
--- a/caffe2/operators/pad_op.cc
+++ b/caffe2/operators/pad_op.cc
@@ -29,7 +29,7 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
   ConvPoolOpBase::SetOutputSize(X, Y, channels);
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   // The main loop
   int padded_height = Y->dim32(2);
   int padded_width = Y->dim32(3);
@@ -166,7 +166,7 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   int channels = X.dim32(3);
   ConvPoolOpBase::SetOutputSize(X, Y, channels);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
 
   // The main loop
   int padded_height = Y->dim32(1);
@@ -259,7 +259,7 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
   int width = dX->dim32(3);
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   math::Set<float, CPUContext>(dX->size(), 0, dXdata, &context_);
   // The main loop
   switch (mode_) {
@@ -339,7 +339,7 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   int width = dX->dim32(2);
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   math::Set<float, CPUContext>(dX->size(), 0, dXdata, &context_);
 
   switch (mode_) {
diff --git a/caffe2/operators/pad_op_gpu.cu b/caffe2/operators/pad_op_gpu.cu
index bfb4542ca81d1..fa812d2a11f8b 100644
--- a/caffe2/operators/pad_op_gpu.cu
+++ b/caffe2/operators/pad_op_gpu.cu
@@ -261,7 +261,7 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const int padded_height = Y->dim32(2);
   const int padded_width = Y->dim32(3);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
 
   switch (mode_) {
     case PadMode::CONSTANT:
@@ -337,7 +337,7 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const int padded_height = Y->dim32(1);
   const int padded_width = Y->dim32(2);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
 
   switch (mode_) {
     case PadMode::CONSTANT:
@@ -418,7 +418,7 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const int height = dX->dim32(2);
   const int width = dX->dim32(3);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   math::Set<float, CUDAContext>(output_size, 0, dXdata, &context_);
 
   switch (mode_) {
@@ -499,7 +499,7 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const int width = dX->dim32(2);
   const int channels = dX->dim32(3);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   math::Set<float, CUDAContext>(output_size, 0, dXdata, &context_);
 
   switch (mode_) {
diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h
index 563a02638edd4..003653cbc8976 100644
--- a/caffe2/operators/partition_ops.h
+++ b/caffe2/operators/partition_ops.h
@@ -81,7 +81,7 @@ class GatherByKeyOp : public Operator<CPUContext> {
         if (currentShard != -1) {
           auto inStartOffset = inStartOffsets_[currentShard];
           auto numItems = i - outStartOffset;
-          context_.template CopyItems<CPUContext, CPUContext>(
+          context_.CopyItemsSameDevice(
               meta,
               numItems * blockSize,
               inputDatas_[currentShard] +
@@ -183,7 +183,7 @@ class PartitionOpBase : public Operator<CPUContext> {
         auto bs = block_sizes_[i];
         auto meta = metas_[i];
         // special case for small bs?
-        context_.template CopyItems<CPUContext, CPUContext>(
+        context_.CopyItemsSameDevice(
             meta,
             bs,
             static_cast<const char*>(raw_datas_[i]) + p * bs * meta.itemsize(),
diff --git a/caffe2/operators/percentile_op.h b/caffe2/operators/percentile_op.h
index 2cc96e78c0c8a..895281c6a88b8 100644
--- a/caffe2/operators/percentile_op.h
+++ b/caffe2/operators/percentile_op.h
@@ -25,8 +25,8 @@ class PercentileOp final : public Operator<Context> {
  protected:
   INPUT_TAGS(X, VAL_PCT_PAIRS, LENS);
   OUTPUT_TAGS(PCT);
-  Tensor<Context> values_tensor;
-  Tensor<Context> percentiles_tensor;
+  Tensor values_tensor{Context::GetDeviceType()};
+  Tensor percentiles_tensor{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/perplexity_op.cc b/caffe2/operators/perplexity_op.cc
index a7c4d52285e3d..028a6077cc860 100644
--- a/caffe2/operators/perplexity_op.cc
+++ b/caffe2/operators/perplexity_op.cc
@@ -17,7 +17,7 @@ bool PerplexityOp<float, CPUContext>::RunOnDevice() {
   for (int i = 0; i < N; ++i) {
     perplexity *= pow(Xdata[i], -1.0/N);
   }
-  *(Y->mutable_data<float>()) = perplexity;
+  *(Y->template mutable_data<float>()) = perplexity;
   return true;
 }
 
diff --git a/caffe2/operators/perplexity_op.cu b/caffe2/operators/perplexity_op.cu
index afb4d3dc27261..230bdb1601cb4 100644
--- a/caffe2/operators/perplexity_op.cu
+++ b/caffe2/operators/perplexity_op.cu
@@ -26,7 +26,7 @@ bool PerplexityOp<float, CUDAContext>::RunOnDevice() {
   int N = X.dim32(0);
 
   Y->Resize(vector<TIndex>());
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   const float* Xdata = X.data<float>();
 
   float perplexity = thrust::transform_reduce(
diff --git a/caffe2/operators/piecewise_linear_transform_op.cc b/caffe2/operators/piecewise_linear_transform_op.cc
index 528b33619b222..a572d60651f16 100644
--- a/caffe2/operators/piecewise_linear_transform_op.cc
+++ b/caffe2/operators/piecewise_linear_transform_op.cc
@@ -11,7 +11,7 @@ OPERATOR_SCHEMA(PiecewiseLinearTransform)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 PiecewiseLinearTransform takes inputs -- predictions, a 2-D or 1-D tensor
-(Tensor<float>) of size (batch_size x prediction_dimensions). The piecewise
+(Tensor) of size (batch_size x prediction_dimensions). The piecewise
 linear functions are stored in bounds, slopes and intercepts. The output tensor
 has the same shape of input `predictions` and contains the predictions
 transformed by the piecewise linear functions. Each column of predictions has
@@ -57,7 +57,7 @@ bound.
     .Input(
         0,
         "predictions",
-        "2-D tensor (Tensor<float>) of size "
+        "2-D tensor (Tensor) of size "
         "(num_batches x num_classes) containing scores")
     .Input(
         1,
@@ -77,7 +77,7 @@ bound.
     .Output(
         0,
         "transforms",
-        "2-D tensor (Tensor<float>) of size (num_batches x num_classes) "
+        "2-D tensor (Tensor) of size (num_batches x num_classes) "
         "containing transformed predictions");
 
 SHOULD_NOT_DO_GRADIENT(PiecewiseLinearTransform);
diff --git a/caffe2/operators/piecewise_linear_transform_op.cu b/caffe2/operators/piecewise_linear_transform_op.cu
index ecc9f0f249397..8dc2d4e022850 100644
--- a/caffe2/operators/piecewise_linear_transform_op.cu
+++ b/caffe2/operators/piecewise_linear_transform_op.cu
@@ -137,27 +137,27 @@ void PiecewiseLinearTransformOp<float, CUDAContext>::setUpTensors(
       }
 
       int length = num_group * num_func_per_group;
-      TensorCPU bounds_host;
+      Tensor bounds_host{CPU};
       bounds_host.Resize(length + num_group);
       memcpy(
           bounds_host.mutable_data<float>(),
           bounds,
           (length + num_group) * sizeof(float));
 
-      TensorCPU intercepts_host;
+      Tensor intercepts_host{CPU};
       intercepts_host.Resize(length);
       memcpy(
           intercepts_host.mutable_data<float>(),
           intercepts,
           (length) * sizeof(float));
-      TensorCPU slopes_host;
+      Tensor slopes_host{CPU};
       slopes_host.Resize(length);
       memcpy(
           slopes_host.mutable_data<float>(), slopes, (length) * sizeof(float));
 
-      bounds_device_.CopyFrom<CPUContext>(bounds_host);
-      intercepts_device_.CopyFrom<CPUContext>(intercepts_host);
-      slopes_device_.CopyFrom<CPUContext>(slopes_host);
+      bounds_device_.CopyFrom(bounds_host);
+      intercepts_device_.CopyFrom(intercepts_host);
+      slopes_device_.CopyFrom(slopes_host);
 
       gpu_copied_ = true;
     }
@@ -185,9 +185,9 @@ void PiecewiseLinearTransformOp<float, CUDAContext>::setUpTensors(
       CAFFE_ENFORCE_EQ(num_group, M);
     }
 
-    bounds_device_.CopyFrom<CUDAContext>(bounds_input);
-    slopes_device_.CopyFrom<CUDAContext>(slopes_input);
-    intercepts_device_.CopyFrom<CUDAContext>(intercepts_input);
+    bounds_device_.CopyFrom(bounds_input);
+    slopes_device_.CopyFrom(slopes_input);
+    intercepts_device_.CopyFrom(intercepts_input);
   }
 }
 
@@ -218,7 +218,7 @@ bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformGeneral() {
       slopes_device_.data<float>(),
       intercepts_device_.data<float>(),
       X.data<float>(),
-      Y->mutable_data<float>());
+      Y->template mutable_data<float>());
 
   return true;
 }
@@ -254,7 +254,7 @@ bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformBinary() {
         slopes_device_.data<float>(),
         intercepts_device_.data<float>(),
         X.data<float>(),
-        Y->mutable_data<float>());
+        Y->template mutable_data<float>());
   } else {
     // don't want N*M threads, only N*M/2
     PieceWiseLinearTransformBinaryKernel2<<<
@@ -270,7 +270,7 @@ bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformBinary() {
         slopes_device_.data<float>(),
         intercepts_device_.data<float>(),
         X.data<float>(),
-        Y->mutable_data<float>());
+        Y->template mutable_data<float>());
   }
 
   return true;
diff --git a/caffe2/operators/piecewise_linear_transform_op.h b/caffe2/operators/piecewise_linear_transform_op.h
index 701acb87f9ad8..7428b6cc24e4b 100644
--- a/caffe2/operators/piecewise_linear_transform_op.h
+++ b/caffe2/operators/piecewise_linear_transform_op.h
@@ -233,9 +233,9 @@ class PiecewiseLinearTransformOp final : public Operator<Context> {
   vector<T> slopes_from_arg_;
   vector<T> intercepts_from_arg_;
 
-  Tensor<Context> bounds_device_;
-  Tensor<Context> intercepts_device_;
-  Tensor<Context> slopes_device_;
+  Tensor bounds_device_{Context::GetDeviceType()};
+  Tensor intercepts_device_{Context::GetDeviceType()};
+  Tensor slopes_device_{Context::GetDeviceType()};
   bool gpu_copied_ = false;
 
   // If true, the piecewise linear functions are passed through args,
diff --git a/caffe2/operators/pool_op.cu b/caffe2/operators/pool_op.cu
index 48b30afaa4d00..4af3be93a6b8f 100644
--- a/caffe2/operators/pool_op.cu
+++ b/caffe2/operators/pool_op.cu
@@ -564,70 +564,70 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      Average1DPoolForwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          Y->dim32(2),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          Y->mutable_data<float>());
+      Average1DPoolForwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              Y->dim32(2),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              Y->template mutable_data<float>());
       break;
     case 2:
-      Average2DPoolForwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          Y->dim32(2),
-          Y->dim32(3),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          Y->mutable_data<float>());
+      Average2DPoolForwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              Y->dim32(2),
+              Y->dim32(3),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              Y->template mutable_data<float>());
       break;
     case 3:
-      Average3DPoolForwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          Y->dim32(2),
-          Y->dim32(3),
-          Y->dim32(4),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          Y->mutable_data<float>());
+      Average3DPoolForwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              Y->dim32(2),
+              Y->dim32(3),
+              Y->dim32(4),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              Y->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -643,70 +643,70 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNHWC() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      Average1DPoolForwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          Y->dim32(1),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          Y->mutable_data<float>());
+      Average1DPoolForwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              Y->dim32(1),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              Y->template mutable_data<float>());
       break;
     case 2:
-      Average2DPoolForwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          Y->dim32(1),
-          Y->dim32(2),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          Y->mutable_data<float>());
+      Average2DPoolForwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              Y->dim32(1),
+              Y->dim32(2),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              Y->template mutable_data<float>());
       break;
     case 3:
-      Average3DPoolForwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          Y->dim32(1),
-          Y->dim32(2),
-          Y->dim32(3),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          Y->mutable_data<float>());
+      Average3DPoolForwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              Y->dim32(1),
+              Y->dim32(2),
+              Y->dim32(3),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              Y->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -726,70 +726,70 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      Ave1DPoolBackwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          dY.dim32(2),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          dX->mutable_data<float>());
+      Ave1DPoolBackwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              dY.dim32(2),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              dX->template mutable_data<float>());
       break;
     case 2:
-      Ave2DPoolBackwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          dY.dim32(2),
-          dY.dim32(3),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          dX->mutable_data<float>());
+      Ave2DPoolBackwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              dY.dim32(2),
+              dY.dim32(3),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              dX->template mutable_data<float>());
       break;
     case 3:
-      Ave3DPoolBackwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          dY.dim32(2),
-          dY.dim32(3),
-          dY.dim32(4),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          dX->mutable_data<float>());
+      Ave3DPoolBackwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              dY.dim32(2),
+              dY.dim32(3),
+              dY.dim32(4),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              dX->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -810,70 +810,70 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      Ave1DPoolBackwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          dY.dim32(1),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          dX->mutable_data<float>());
+      Ave1DPoolBackwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              dY.dim32(1),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              dX->template mutable_data<float>());
       break;
     case 2:
-      Ave2DPoolBackwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          dY.dim32(1),
-          dY.dim32(2),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          dX->mutable_data<float>());
+      Ave2DPoolBackwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              dY.dim32(1),
+              dY.dim32(2),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              dX->template mutable_data<float>());
       break;
     case 3:
-      Ave3DPoolBackwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          dY.dim32(1),
-          dY.dim32(2),
-          dY.dim32(3),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          dX->mutable_data<float>());
+      Ave3DPoolBackwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              dY.dim32(1),
+              dY.dim32(2),
+              dY.dim32(3),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              dX->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1405,67 +1405,67 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DForwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(1),
-          X.dim32(2),
-          Y->dim32(2),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          Y->mutable_data<float>());
+      MaxPool1DForwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(1),
+              X.dim32(2),
+              Y->dim32(2),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              Y->template mutable_data<float>());
       break;
     case 2:
-      MaxPool2DForwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          Y->dim32(2),
-          Y->dim32(3),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          Y->mutable_data<float>());
+      MaxPool2DForwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              Y->dim32(2),
+              Y->dim32(3),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              Y->template mutable_data<float>());
       break;
     case 3:
-      MaxPool3DForwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          Y->dim32(2),
-          Y->dim32(3),
-          Y->dim32(4),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          Y->mutable_data<float>());
+      MaxPool3DForwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              Y->dim32(2),
+              Y->dim32(3),
+              Y->dim32(4),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              Y->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1481,67 +1481,67 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DForwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(1),
-          X.dim32(2),
-          Y->dim32(1),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          Y->mutable_data<float>());
+      MaxPool1DForwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(1),
+              X.dim32(2),
+              Y->dim32(1),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              Y->template mutable_data<float>());
       break;
     case 2:
-      MaxPool2DForwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          Y->dim32(1),
-          Y->dim32(2),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          Y->mutable_data<float>());
+      MaxPool2DForwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              Y->dim32(1),
+              Y->dim32(2),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              Y->template mutable_data<float>());
       break;
     case 3:
-      MaxPool3DForwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(output_size),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          Y->dim32(1),
-          Y->dim32(2),
-          Y->dim32(3),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          Y->mutable_data<float>());
+      MaxPool3DForwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(output_size),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              output_size,
+              X.data<float>(),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              Y->dim32(1),
+              Y->dim32(2),
+              Y->dim32(3),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              Y->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1561,76 +1561,76 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DBackwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          X.data<float>(),
-          Y.data<float>(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          dY.dim32(2),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          dX->mutable_data<float>());
+      MaxPool1DBackwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              X.data<float>(),
+              Y.data<float>(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              dY.dim32(2),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              dX->template mutable_data<float>());
       break;
     case 2:
-      MaxPool2DBackwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          X.data<float>(),
-          Y.data<float>(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          dY.dim32(2),
-          dY.dim32(3),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          dX->mutable_data<float>());
+      MaxPool2DBackwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              X.data<float>(),
+              Y.data<float>(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              dY.dim32(2),
+              dY.dim32(3),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              dX->template mutable_data<float>());
       break;
     case 3:
-      MaxPool3DBackwardNCHW<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          X.data<float>(),
-          Y.data<float>(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          dY.dim32(2),
-          dY.dim32(3),
-          dY.dim32(4),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          dX->mutable_data<float>());
+      MaxPool3DBackwardNCHW<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              X.data<float>(),
+              Y.data<float>(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              dY.dim32(2),
+              dY.dim32(3),
+              dY.dim32(4),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              dX->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1650,75 +1650,75 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DBackwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          X.data<float>(),
-          Y.data<float>(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          dY.dim32(1),
-          kernel_h(),
-          stride_h(),
-          pad_t(),
-          dX->mutable_data<float>());
+      MaxPool1DBackwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              X.data<float>(),
+              Y.data<float>(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              dY.dim32(1),
+              kernel_h(),
+              stride_h(),
+              pad_t(),
+              dX->template mutable_data<float>());
     case 2:
-      MaxPool2DBackwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          X.data<float>(),
-          Y.data<float>(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          dY.dim32(1),
-          dY.dim32(2),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          dX->mutable_data<float>());
+      MaxPool2DBackwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              X.data<float>(),
+              Y.data<float>(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              dY.dim32(1),
+              dY.dim32(2),
+              kernel_h(),
+              kernel_w(),
+              stride_h(),
+              stride_w(),
+              pad_t(),
+              pad_l(),
+              dX->template mutable_data<float>());
       break;
     case 3:
-      MaxPool3DBackwardNHWC<float><<<
-          CAFFE_GET_BLOCKS(X.size()),
-          CAFFE_CUDA_NUM_THREADS,
-          0,
-          context_.cuda_stream()>>>(
-          X.size(),
-          X.data<float>(),
-          Y.data<float>(),
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          X.dim32(4),
-          dY.dim32(1),
-          dY.dim32(2),
-          dY.dim32(3),
-          kernel_h(),
-          kernel_w(),
-          kernel_[2],
-          stride_h(),
-          stride_w(),
-          stride_[2],
-          pad_t(),
-          pad_l(),
-          pads_[2],
-          dX->mutable_data<float>());
+      MaxPool3DBackwardNHWC<float>
+          <<<CAFFE_GET_BLOCKS(X.size()),
+             CAFFE_CUDA_NUM_THREADS,
+             0,
+             context_.cuda_stream()>>>(
+              X.size(),
+              X.data<float>(),
+              Y.data<float>(),
+              dY.data<float>(),
+              X.dim32(0),
+              X.dim32(1),
+              X.dim32(2),
+              X.dim32(3),
+              X.dim32(4),
+              dY.dim32(1),
+              dY.dim32(2),
+              dY.dim32(3),
+              kernel_h(),
+              kernel_w(),
+              kernel_[2],
+              stride_h(),
+              stride_w(),
+              stride_[2],
+              pad_t(),
+              pad_l(),
+              pads_[2],
+              dX->template mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
diff --git a/caffe2/operators/pool_op_cudnn.cu b/caffe2/operators/pool_op_cudnn.cu
index 00f719b819975..809828a99defa 100644
--- a/caffe2/operators/pool_op_cudnn.cu
+++ b/caffe2/operators/pool_op_cudnn.cu
@@ -196,7 +196,10 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
                  CAFFE_CUDA_NUM_THREADS,
                  0,
                  context_.cuda_stream()>>>(
-                  N * C, H * W * D, X.data<float>(), Y->mutable_data<float>());
+                  N * C,
+                  H * W * D,
+                  X.data<float>(),
+                  Y->template mutable_data<float>());
           return true;
         }
         if (mode_ == CUDNN_POOLING_MAX) {
@@ -205,7 +208,10 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
                  CAFFE_CUDA_NUM_THREADS,
                  0,
                  context_.cuda_stream()>>>(
-                  N * C, H * W * D, X.data<float>(), Y->mutable_data<float>());
+                  N * C,
+                  H * W * D,
+                  X.data<float>(),
+                  Y->template mutable_data<float>());
           return true;
         }
       }
@@ -379,7 +385,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
                   N * C,
                   H * W * D,
                   dY.data<float>(),
-                  dX->mutable_data<float>());
+                  dX->template mutable_data<float>());
           return true;
         }
 #if CUDNN_VERSION_MIN(6, 0, 0)
@@ -396,7 +402,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
                   N * C,
                   H * W * D,
                   dY.data<float>(),
-                  dX->mutable_data<float>(),
+                  dX->template mutable_data<float>(),
                   Y.data<float>(),
                   X.data<float>());
           return true;
diff --git a/caffe2/operators/prelu_op.cc b/caffe2/operators/prelu_op.cc
index 8bacf1e29153c..2edebecf82f2f 100644
--- a/caffe2/operators/prelu_op.cc
+++ b/caffe2/operators/prelu_op.cc
@@ -188,8 +188,8 @@ bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
   const float* dYdata = dY.data<float>();
   const float* Xdata = X.data<float>();
   const float* Wdata = W.data<float>();
-  float* dXdata = dX->mutable_data<float>();
-  float* dWdata = dW->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
+  float* dWdata = dW->template mutable_data<float>();
 
   // non-shared case.
   switch (order_) {
diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu
index b14393d81b50d..9cf5d5893a96f 100644
--- a/caffe2/operators/prelu_op.cu
+++ b/caffe2/operators/prelu_op.cu
@@ -154,7 +154,7 @@ bool PReluOp<float, CUDAContext>::RunOnDevice() {
   Y->ResizeLike(X);
   const auto* Xdata = X.data<float>();
   const auto* Wdata = W.data<float>();
-  auto* Ydata = Y->mutable_data<float>();
+  auto* Ydata = Y->template mutable_data<float>();
 
   const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
   const auto C_shared = (W.size() == 1);
@@ -221,8 +221,8 @@ bool PReluGradientOp<float, CUDAContext>::RunOnDevice() {
   const float* dYdata = dY.data<float>();
   const float* Xdata = X.data<float>();
   const float* Wdata = W.data<float>();
-  float* dXdata = dX->mutable_data<float>();
-  float* dWdata = dW->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
+  float* dWdata = dW->template mutable_data<float>();
   int N = Y.dim(0);
 
   if (C_shared) {
diff --git a/caffe2/operators/prepend_dim_op.h b/caffe2/operators/prepend_dim_op.h
index 16cbb11eec74c..ab40085bf3c89 100644
--- a/caffe2/operators/prepend_dim_op.h
+++ b/caffe2/operators/prepend_dim_op.h
@@ -40,7 +40,7 @@ class PrependDimOp : public Operator<Context> {
 
     if (output != &input) {
       // If we are not doing in-place computation, a copy is needed.
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           input.meta(),
           input.size(),
           input.raw_data(),
@@ -75,7 +75,7 @@ class MergeDimOp : public Operator<Context> {
 
     if (output != &input) {
       // If we are not doing in-place computation, a copy is needed.
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           input.meta(),
           input.size(),
           input.raw_data(),
diff --git a/caffe2/operators/quant_decode_op.h b/caffe2/operators/quant_decode_op.h
index 768d879fdf0b7..8068b2e00510e 100644
--- a/caffe2/operators/quant_decode_op.h
+++ b/caffe2/operators/quant_decode_op.h
@@ -12,10 +12,10 @@ namespace {
 
 template <class CodebookT, class CodeT>
 void Decode(
-    const TensorCPU& codebook,
-    const TensorCPU& codes,
-    /* optional */ const TensorCPU* const decoded_grad,
-    TensorCPU* const output,
+    const Tensor& codebook,
+    const Tensor& codes,
+    /* optional */ const Tensor* const decoded_grad,
+    Tensor* const output,
     bool resizeOnly) {
   CAFFE_ENFORCE(codebook.IsType<CodebookT>());
 
@@ -28,7 +28,7 @@ void Decode(
   if (decoded_grad == nullptr) {
     // Forward pass: decode and store codebook values in output.
     output->ResizeLike(codes);
-    auto* out_ptr = output->mutable_data<CodebookT>();
+    auto* out_ptr = output->template mutable_data<CodebookT>();
     if (resizeOnly) {
       return;
     }
@@ -45,7 +45,7 @@ void Decode(
     auto* const gradient_end = gradient_ptr + decoded_grad->size();
 
     CAFFE_ENFORCE_EQ(cb_size, output->size());
-    auto* out_ptr = output->mutable_data<CodebookT>();
+    auto* out_ptr = output->template mutable_data<CodebookT>();
     while (gradient_ptr < gradient_end) {
       DCHECK_LE(*code_ptr, cb_size);
       out_ptr[*code_ptr++] += *gradient_ptr++;
@@ -56,10 +56,10 @@ void Decode(
 #define REGISTER_DECODER(codebookType, codesType)                      \
   {                                                                    \
     {TypeMeta::Id<codebookType>(), TypeMeta::Id<codesType>()},         \
-        [](const TensorCPU& codebook_,                                 \
-           const TensorCPU& codes_,                                    \
-           const TensorCPU* gradient_,                                 \
-           TensorCPU* outDecoded_,                                     \
+        [](const Tensor& codebook_,                                    \
+           const Tensor& codes_,                                       \
+           const Tensor* gradient_,                                    \
+           Tensor* outDecoded_,                                        \
            bool resizeOnly_) {                                         \
           Decode<codebookType, codesType>(                             \
               codebook_, codes_, gradient_, outDecoded_, resizeOnly_); \
@@ -67,18 +67,18 @@ void Decode(
   }
 
 inline void DecodeGeneral(
-    const TensorCPU& codebook,
-    const TensorCPU& codes,
-    const TensorCPU* gradient,
-    TensorCPU* outDecoded,
+    const Tensor& codebook,
+    const Tensor& codes,
+    const Tensor* gradient,
+    Tensor* outDecoded,
     bool resizeOnly) {
   const static std::map<
       std::pair<CaffeTypeId, CaffeTypeId>,
       std::function<void(
-          const TensorCPU& codebook,
-          const TensorCPU& codes,
-          const TensorCPU* gradient,
-          TensorCPU* outDecoded,
+          const Tensor& codebook,
+          const Tensor& codes,
+          const Tensor* gradient,
+          Tensor* outDecoded,
           bool resizeOnly)>>
       gDecoderMapper = {REGISTER_DECODER(float, uint8_t),
                         REGISTER_DECODER(float, uint16_t),
@@ -153,7 +153,7 @@ class QuantDecodeGradientOp final : public Operator<CPUContext> {
 
     auto* gradient = Output(0);
     gradient->ResizeLike(codebook);
-    auto* gradient_ptr = gradient->mutable_data<float>();
+    auto* gradient_ptr = gradient->template mutable_data<float>();
     std::fill(gradient_ptr, gradient_ptr + gradient->size(), 0);
 
     for (int i = 0; i < num_code_tensors; i++) {
diff --git a/caffe2/operators/reducer_functors.h b/caffe2/operators/reducer_functors.h
index f3dd35b956078..6d357e1b9f996 100644
--- a/caffe2/operators/reducer_functors.h
+++ b/caffe2/operators/reducer_functors.h
@@ -51,7 +51,7 @@ class SumRangeReducerGradient {
       Context* context) {
     // do we have some op that does it smartly with minimum number of memcpy?
     for (TIndex i = 0; i < blocks; ++i) {
-      context->template Copy<T, Context, Context>(
+      context->template CopySameDevice<T>(
           block_size, segment_grad, data_grad + block_size * i);
     }
   }
@@ -342,8 +342,7 @@ class BaseReducer {
                              : size_from_dim_(dims.size() - skip_dims, dims);
     }
 
-    void
-    observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
+    void observeInput(int input, const Tensor& value, int skip_dims) {
       DCHECK_EQ(0, input);
       auto& dims = value.dims();
       computeMeta(dims, skip_dims);
@@ -394,10 +393,7 @@ class BaseReducerGradient {
     vector<TIndex> block_shape;
     bool first_dim;
 
-    Meta(
-        const Tensor<CPUContext>& out_grad,
-        int skip_dims,
-        bool first_dim = true)
+    Meta(const Tensor& out_grad, int skip_dims, bool first_dim = true)
         : first_dim(first_dim) {
       auto& dims = out_grad.dims();
       first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
@@ -409,8 +405,8 @@ class BaseReducerGradient {
 
     void observeOriginalInput(
         int /*original_input*/,
-        const Tensor<CPUContext>& /*value*/,
-        Tensor<CPUContext>* /*input_grad*/, // optional grad to populate
+        const Tensor& /*value*/,
+        Tensor* /*input_grad*/, // optional grad to populate
         int /*skip_dims*/) {}
 
     void appendGradShape(vector<TIndex>* output_shape) {
@@ -479,8 +475,7 @@ class SumReducerGradient : public BaseReducerGradient {
     if (FixedSize == 1) { // static if
       *data_grad = *s_grad_;
     } else if (meta.first_dim) {
-      context->template Copy<T, Context, Context>(
-          meta.block_size, s_grad_, data_grad);
+      context->template CopySameDevice<T>(meta.block_size, s_grad_, data_grad);
     } else {
       math::Set<T, Context>(length, s_grad_[offset], data_grad, context);
     }
@@ -522,8 +517,7 @@ class WeightedSumReducer<T, CPUContext> : public BaseReducer {
 
     explicit Meta(bool first = true) : first_dim(first) {}
 
-    void
-    observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
+    void observeInput(int input, const Tensor& value, int skip_dims) {
       if (input == 1) {
         CAFFE_ENFORCE_EQ(
             skip_dims, value.ndim(), "SCALARS mustn't have extra dimensions");
@@ -580,14 +574,14 @@ class WeightedSumReducerGradient : public BaseReducerGradient {
 
     void observeOriginalInput(
         int original_input,
-        const Tensor<CPUContext>& value,
-        Tensor<CPUContext>* input_grad, // optional grad to populate
+        const Tensor& value,
+        Tensor* input_grad, // optional grad to populate
         int /*skip_dims*/) {
       CAFFE_ENFORCE_EQ(1, original_input);
       scalars = value.data<T>();
       if (input_grad) {
         input_grad->ResizeLike(value);
-        scalars_grad = input_grad->mutable_data<T>();
+        scalars_grad = input_grad->template mutable_data<T>();
       }
     }
   };
diff --git a/caffe2/operators/reduction_front_back_ops.h b/caffe2/operators/reduction_front_back_ops.h
index 03633ce7c4a37..85de482e12273 100644
--- a/caffe2/operators/reduction_front_back_ops.h
+++ b/caffe2/operators/reduction_front_back_ops.h
@@ -157,7 +157,7 @@ class SumReduceDimsGradientOp final : public Operator<Context> {
       T* dXdata);
   int num_reduce_dims_;
   // scratch space used for former version of this reducer
-  Tensor<CPUContext> shape_;
+  Tensor shape_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context, bool FIRSTDIMS>
diff --git a/caffe2/operators/reduction_ops.cc b/caffe2/operators/reduction_ops.cc
index 6f043eb4c5678..0d01d50ca000e 100644
--- a/caffe2/operators/reduction_ops.cc
+++ b/caffe2/operators/reduction_ops.cc
@@ -296,9 +296,7 @@ bool SumElementsGradientOp<T, Context>::RunOnDevice()
 #endif
 {
   auto& X = Input(0);
-  // Copy Input(1) from Context to CPUContext
-  CPUContext context;
-  TensorCPU sum_grad(Input(1), &context);
+  Tensor sum_grad(Input(1), CPU);
   auto* dX = Output(0);
   dX->ResizeLike(X);
   DCHECK_EQ(sum_grad.size(), 1);
diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu
index 3f9728c860975..f5937cd926cf7 100644
--- a/caffe2/operators/reduction_ops.cu
+++ b/caffe2/operators/reduction_ops.cu
@@ -86,12 +86,15 @@ bool SumElementsGradientOp<float, CUDAContext>::RunOnDevice() {
   DCHECK_EQ(dY.size(), 1);
   auto* dX = Output(0);
   dX->ResizeLike(X);
-  SumElementsGradientKernel<float><<<
-      CAFFE_GET_BLOCKS(X.size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      average_, X.size(), dY.data<float>(), dX->mutable_data<float>());
+  SumElementsGradientKernel<float>
+      <<<CAFFE_GET_BLOCKS(X.size()),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          average_,
+          X.size(),
+          dY.data<float>(),
+          dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/reduction_ops.h b/caffe2/operators/reduction_ops.h
index ba502489b148a..11cf06a2f15a1 100644
--- a/caffe2/operators/reduction_ops.h
+++ b/caffe2/operators/reduction_ops.h
@@ -43,7 +43,7 @@ class SumElementsOp : public Operator<Context> {
 
  private:
   bool average_;
-  Tensor<Context> scratch_;
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -66,7 +66,7 @@ class SumElementsIntOp : public Operator<Context> {
   }
 
  private:
-  Tensor<Context> scratch_;
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -124,7 +124,7 @@ class SumSqrElementsOp : public Operator<Context> {
   }
 
  private:
-  Tensor<Context> scratch_;
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context, bool ROWWISE>
diff --git a/caffe2/operators/relu_n_op.cc b/caffe2/operators/relu_n_op.cc
index f04769a98df60..abaa8a14aefc0 100644
--- a/caffe2/operators/relu_n_op.cc
+++ b/caffe2/operators/relu_n_op.cc
@@ -69,8 +69,8 @@ OPERATOR_SCHEMA(ReluN)
     .CostInferenceFunction(CostInferenceForReluN)
     .IdenticalTypeAndShape()
     .SetDoc(R"DOC(
-Relu takes one input data (Tensor<T>) and produces one output data
-(Tensor<T>) where the rectified linear function, y = min(max(0, x), n),
+Relu takes one input data (Tensor) and produces one output data
+(Tensor) where the rectified linear function, y = min(max(0, x), n),
 is applied to the tensor elementwise.
 )DOC")
     .Input(0, "X", "1D input tensor")
diff --git a/caffe2/operators/remove_data_blocks_op.h b/caffe2/operators/remove_data_blocks_op.h
index 9530242df9de2..243f27e1c1797 100644
--- a/caffe2/operators/remove_data_blocks_op.h
+++ b/caffe2/operators/remove_data_blocks_op.h
@@ -65,7 +65,7 @@ class RemoveDataBlocksOp final : public Operator<Context> {
       int64_t interval_end =
           (i == ind_vec_size - 1) ? outer_size : ind_vec[i + 1];
       auto num_items = interval_end - interval_start;
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           data.meta(),
           num_items * block_size,
           data_ptr + block_size_bytes * interval_start,
diff --git a/caffe2/operators/reservoir_sampling.cc b/caffe2/operators/reservoir_sampling.cc
index 79198d7c6b510..5d6b94248b54c 100644
--- a/caffe2/operators/reservoir_sampling.cc
+++ b/caffe2/operators/reservoir_sampling.cc
@@ -153,7 +153,7 @@ class ReservoirSamplingOp final : public Operator<Context> {
         CAFFE_ENFORCE_GE(*num_visited, numToCollect_);
       } else {
         // replace
-        context_.template CopyItems<Context, Context>(
+        context_.CopyItemsSameDevice(
             input.meta(),
             block_size,
             input_data + i * block_bytesize,
diff --git a/caffe2/operators/reshape_op.h b/caffe2/operators/reshape_op.h
index f59da8ab779ab..f332192b55e0e 100644
--- a/caffe2/operators/reshape_op.h
+++ b/caffe2/operators/reshape_op.h
@@ -35,9 +35,7 @@ class ReshapeOp : public Operator<Context> {
 
  protected:
   template <typename T>
-  void DoRunWithTypeImpl(
-      const Tensor<Context>& input,
-      Tensor<Context>* output) {
+  void DoRunWithTypeImpl(const Tensor& input, Tensor* output) {
     vector<int64_t> actual_new_shape = new_shape_;
     if (InputSize() == 2) {
       CAFFE_ENFORCE(
@@ -52,8 +50,7 @@ class ReshapeOp : public Operator<Context> {
 
       // Bit awkward, but needed so works on both CPU and CUDA contexts
       std::vector<T> tmpv(shape.size());
-      context_.template CopyBytes<Context, CPUContext>(
-          shape.size() * sizeof(T), shape_data, &tmpv[0]);
+      context_.CopyBytesToCPU(shape.size() * sizeof(T), shape_data, &tmpv[0]);
       actual_new_shape.assign(tmpv.begin(), tmpv.begin() + shape.size());
     }
 
@@ -124,7 +121,7 @@ class ReshapeOp : public Operator<Context> {
     output->Resize(actual_new_shape);
     if (output != &input) {
       // If we are not doing in-place computation, a copy is needed.
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           input.meta(),
           input.size(),
           input.raw_data(),
diff --git a/caffe2/operators/reshape_op_gpu_test.cc b/caffe2/operators/reshape_op_gpu_test.cc
index 300cf87f59d18..86faf736ca391 100644
--- a/caffe2/operators/reshape_op_gpu_test.cc
+++ b/caffe2/operators/reshape_op_gpu_test.cc
@@ -20,10 +20,10 @@ static void AddConstInput(
   option.set_device_type(CUDA);
   CUDAContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
+  auto* tensor = blob->GetMutableTensor(CUDA);
   tensor->Resize(shape);
   math::Set<float, CUDAContext>(
-      tensor->size(), value, tensor->mutable_data<float>(), &context);
+      tensor->size(), value, tensor->template mutable_data<float>(), &context);
   return;
 }
 
@@ -44,7 +44,7 @@ TEST(ReshapeOpGPUTest, testReshapeWithScalar) {
   unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
   EXPECT_TRUE(op->Run());
   Blob* XNew = ws.GetBlob("XNew");
-  const Tensor<CUDAContext>& XNewTensor = XNew->Get<Tensor<CUDAContext>>();
+  const Tensor& XNewTensor = XNew->Get<Tensor>();
   EXPECT_EQ(1, XNewTensor.ndim());
   EXPECT_EQ(1, XNewTensor.size());
 }
diff --git a/caffe2/operators/resize_op.cc b/caffe2/operators/resize_op.cc
index 8a272a3d40f96..d79b90b0d3d40 100644
--- a/caffe2/operators/resize_op.cc
+++ b/caffe2/operators/resize_op.cc
@@ -67,7 +67,7 @@ bool ResizeNearestOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(batch_size, num_channels, output_height, output_width);
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
 
   // Specialized implementation for fast 2x upsampling
   if (width_scale_ == 2.0 && height_scale_ == 2.0) {
@@ -108,13 +108,11 @@ bool ResizeNearestGradientOp<float, CPUContext>::RunOnDevice() {
   const int output_height = X.dim32(2);
   const int output_width = X.dim32(3);
   dX->Resize(batch_size, num_channels, output_height, output_width);
-  math::Set<float, CPUContext>(dX->size(),
-                               0.0f,
-                               dX->mutable_data<float>(),
-                               &context_);
+  math::Set<float, CPUContext>(
+      dX->size(), 0.0f, dX->template mutable_data<float>(), &context_);
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
 
   for (int n = 0; n < batch_size; ++n) {
     for (int c = 0; c < num_channels; ++c) {
diff --git a/caffe2/operators/resize_op.cu b/caffe2/operators/resize_op.cu
index 0e1d55e5a4f30..6c433c3b00e0c 100644
--- a/caffe2/operators/resize_op.cu
+++ b/caffe2/operators/resize_op.cu
@@ -98,7 +98,7 @@ bool ResizeNearestOp<float, CUDAContext>::RunOnDevice() {
       height_scale_,
       width_scale_,
       X.data<float>(),
-      Y->mutable_data<float>());
+      Y->template mutable_data<float>());
 
   return true;
 }
@@ -117,7 +117,7 @@ bool ResizeNearestGradientOp<float, CUDAContext>::RunOnDevice() {
   int output_width = X.dim32(3);
   dX->Resize(batch_size, num_channels, output_height, output_width);
   math::Set<float, CUDAContext>(
-      dX->size(), 0.0f, dX->mutable_data<float>(), &context_);
+      dX->size(), 0.0f, dX->template mutable_data<float>(), &context_);
 
   const auto size = dY.size();
   NearestNeighborGradientKernel<<<
@@ -134,7 +134,7 @@ bool ResizeNearestGradientOp<float, CUDAContext>::RunOnDevice() {
       height_scale_,
       width_scale_,
       dY.data<float>(),
-      dX->mutable_data<float>());
+      dX->template mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/reverse_packed_segs_op.h b/caffe2/operators/reverse_packed_segs_op.h
index f2f1122ceabe3..f0bdbcf482116 100644
--- a/caffe2/operators/reverse_packed_segs_op.h
+++ b/caffe2/operators/reverse_packed_segs_op.h
@@ -58,7 +58,7 @@ class ReversePackedSegsOp final : public Operator<Context> {
     const LengthType* lengths_ptr = lengths.template data<LengthType>();
 
     vector<LengthType> lengths_host(batch_size);
-    context_.template Copy<LengthType, Context, CPUContext>(
+    context_.template CopyToCPU<LengthType>(
         batch_size, lengths_ptr, &lengths_host[0]);
     context_.FinishDeviceComputation();
 
@@ -71,14 +71,14 @@ class ReversePackedSegsOp final : public Operator<Context> {
         const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
         T* rev_data_block_ptr =
             rev_data_ptr + ((seg_length - 1 - j) * batch_size + i) * block_size;
-        context_.template Copy<T, Context, Context>(
+        context_.template CopySameDevice<T>(
             block_size, data_block_ptr, rev_data_block_ptr);
       }
       for (; j < max_length; j++) {
         const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
         T* rev_data_block_ptr =
             rev_data_ptr + (j * batch_size + i) * block_size;
-        context_.template Copy<T, Context, Context>(
+        context_.template CopySameDevice<T>(
             block_size, data_block_ptr, rev_data_block_ptr);
       }
     }
diff --git a/caffe2/operators/rmac_regions_op.cc b/caffe2/operators/rmac_regions_op.cc
index ba6ab10973504..da0df05d63fd7 100644
--- a/caffe2/operators/rmac_regions_op.cc
+++ b/caffe2/operators/rmac_regions_op.cc
@@ -56,7 +56,7 @@ bool RMACRegionsOp<CPUContext>::RunOnDevice() {
 
     int cur_rows = output->dim32(0);
     output->Extend((l + Wd) * (l + Hd), 50, &context_);
-    auto* outputData = output->mutable_data<float>() + cur_rows * 5;
+    auto* outputData = output->template mutable_data<float>() + cur_rows * 5;
 
     for (int i = 0; i < l + Wd; ++i) {
       for (int j = 0; j < l + Hd; ++j) {
@@ -85,7 +85,7 @@ bool RMACRegionsOp<CPUContext>::RunOnDevice() {
   // Replicate regions for all items in batch
   int num_rois = output->dim32(0);
   output->Extend((batch_size - 1) * num_rois, 50, &context_);
-  auto* outputData = output->mutable_data<float>();
+  auto* outputData = output->template mutable_data<float>();
   for (int b = 1; b < batch_size; ++b) {
     // Copy all rois
     std::copy_n(outputData, num_rois * 5, outputData + b * num_rois * 5);
diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu
index 49faf7a8403cb..ee0fb38459c18 100644
--- a/caffe2/operators/rmac_regions_op.cu
+++ b/caffe2/operators/rmac_regions_op.cu
@@ -192,8 +192,7 @@ bool RMACRegionsOp<CUDAContext>::RunOnDevice() {
   // NumRMACRegionsKernel (number of RoIs), so need to copy that to CPU
   // to Resize() output appropriately.
   int num_rois = 0;
-  context_.CopyBytes<CUDAContext, CPUContext>(
-      sizeof(int), num_rois_.data<int>(), &num_rois);
+  context_.CopyBytesToCPU(sizeof(int), num_rois_.data<int>(), &num_rois);
   int N = batch_size * num_rois;
   output->Resize(N, 5); // [batch_id x1 y1 x2 y2]
 
@@ -203,7 +202,7 @@ bool RMACRegionsOp<CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      W, H, N, num_rois_.data<int>(), output->mutable_data<float>());
+      W, H, N, num_rois_.data<int>(), output->template mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/rmac_regions_op.h b/caffe2/operators/rmac_regions_op.h
index ec5e86f841419..93af252e9af34 100644
--- a/caffe2/operators/rmac_regions_op.h
+++ b/caffe2/operators/rmac_regions_op.h
@@ -21,7 +21,7 @@ class RMACRegionsOp final : public Operator<Context> {
  protected:
   int scales_;
   float overlap_;
-  Tensor<Context> num_rois_;
+  Tensor num_rois_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
index 1f2e62fdc8f28..501b7d8dc2b10 100644
--- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
+++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
@@ -37,19 +37,18 @@ class RecurrentNetworkBlobFetcherOp final : public Operator<Context> {
 
       for (auto& blob_name : blob_names) {
         const Blob* currentBlob = currentStepWorkspace->GetBlob(blob_name);
-        const auto& currentTensor = currentBlob->Get<Tensor<Context>>();
+        const auto& currentTensor = currentBlob->Get<Tensor>();
 
         std::string newBlobName =
             prefix_ + std::string("_") + blob_name + caffe2::to_string(i);
         blob_names_vector.push_back(newBlobName);
 
         ws_->CreateBlob(newBlobName)
-            ->template GetMutable<TensorCPU>()
+            ->GetMutableTensor(CPU)
             ->ResizeLike(currentTensor);
-
-        auto* newTensor =
-            ws_->GetBlob(newBlobName)->template GetMutable<Tensor<Context>>();
-        newTensor->template CopyFrom<Context>(currentTensor);
+        auto type = Context::GetDeviceType();
+        auto* newTensor = ws_->GetBlob(newBlobName)->GetMutableTensor(type);
+        newTensor->CopyFrom(currentTensor);
       }
     }
 
diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h
index c241931978407..3afaedf577c60 100644
--- a/caffe2/operators/rnn/recurrent_network_executor.h
+++ b/caffe2/operators/rnn/recurrent_network_executor.h
@@ -111,10 +111,10 @@ class RecurrentNetworkExecutorBase {
       // the forward-only mode.
       std::string this_timestep_blob =
           timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t);
-      ws->CreateBlob(this_timestep_blob)->GetMutable<TensorCPU>()->Resize(1);
+      ws->CreateBlob(this_timestep_blob)->GetMutableTensor(CPU)->Resize(1);
       auto b = ws->GetBlob(this_timestep_blob);
       CAFFE_ENFORCE(b);
-      b->GetMutable<TensorCPU>()->mutable_data<int32_t>()[0] = t;
+      b->GetMutableTensor(CPU)->template mutable_data<int32_t>()[0] = t;
 
       // Copy the operators from template
       for (auto& template_rnn_op : timestep_ops_template_) {
diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h
index c50d18e9223d1..a92c7690c7d91 100644
--- a/caffe2/operators/rnn/recurrent_network_op.h
+++ b/caffe2/operators/rnn/recurrent_network_op.h
@@ -52,10 +52,10 @@ struct ScratchWorkspaces {
 };
 
 inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) {
-  ws->CreateBlob(blob_name)->GetMutable<TensorCPU>()->Resize(1);
+  ws->CreateBlob(blob_name)->GetMutableTensor(CPU)->Resize(1);
   auto timestepBlob = ws->GetBlob(blob_name);
   CAFFE_ENFORCE(timestepBlob);
-  timestepBlob->GetMutable<TensorCPU>()->mutable_data<int32_t>()[0] = t;
+  timestepBlob->GetMutableTensor(CPU)->template mutable_data<int32_t>()[0] = t;
 }
 
 std::map<string, string> GetRecurrentMapping(
@@ -70,8 +70,8 @@ void applyOffsetAlias(
           << " at offset: " << oc.offset;
   auto srcBlob = ws->GetBlob(oc.src);
   CAFFE_ENFORCE(srcBlob);
-  auto* src = srcBlob->template GetMutable<Tensor<Context>>();
-  auto* dst = ws->GetBlob(oc.dst)->template GetMutable<Tensor<Context>>();
+  auto* src = srcBlob->GetMutableTensor(Context::GetDeviceType());
+  auto* dst = ws->GetBlob(oc.dst)->GetMutableTensor(Context::GetDeviceType());
   auto timestep = src->size() / src->dim(0);
   auto dims = src->dims();
   const int32_t startDstTimestep =
@@ -95,7 +95,7 @@ void repeatCopy(
     T* dst,
     Context* context) {
   for (int i = 0; i < repeat_n; ++i) {
-    context->template Copy<T, Context, Context>(n, src, dst + i * n);
+    context->template CopySameDevice<T>(n, src, dst + i * n);
   }
 }
 
@@ -112,11 +112,11 @@ void initializeRecurrentInput(
     Context* context) {
   auto stateBlob = ws->GetBlob(rc.state);
   CAFFE_ENFORCE(stateBlob);
-  auto* state = stateBlob->template GetMutable<Tensor<Context>>();
+  auto* state = stateBlob->GetMutableTensor(Context::GetDeviceType());
 
   auto inputBlob = ws->GetBlob(rc.input);
   CAFFE_ENFORCE(inputBlob);
-  const auto& input = inputBlob->template Get<Tensor<Context>>();
+  const auto& input = inputBlob->template Get<Tensor>();
   CAFFE_ENFORCE_GE(input.ndim(), 1, rc.input);
   CAFFE_ENFORCE_LE(input.ndim(), 3, rc.input);
 
@@ -134,7 +134,7 @@ void initializeRecurrentInput(
 
   if (input.ndim() >= 2) {
     CAFFE_ENFORCE_EQ(input.dim(input.ndim() - 2), batchSize, rc.input);
-    context->template Copy<T, Context, Context>(
+    context->template CopySameDevice<T>(
         batchSize * stateSize * initialStateLength,
         input.template data<T>(),
         state->template mutable_data<T>());
@@ -654,11 +654,11 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
     for (auto& param : params_) {
       auto pBlob = sharedWs_->GetBlob(param.param);
       CAFFE_ENFORCE(pBlob);
-      const auto& p = pBlob->template Get<Tensor<Context>>();
+      const auto& p = pBlob->template Get<Tensor>();
 
       auto gBlob = sharedWs_->GetBlob(param.grad);
       CAFFE_ENFORCE(gBlob);
-      auto* g = gBlob->template GetMutable<Tensor<Context>>();
+      auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
       g->ResizeLike(p);
       math::Set<T, Context>(
           g->size(),
@@ -670,11 +670,11 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
     for (auto& rg : recurrentGradients_) {
       auto pBlob = sharedWs_->GetBlob(rg.param);
       CAFFE_ENFORCE(pBlob);
-      const auto& p = pBlob->template Get<Tensor<Context>>();
+      const auto& p = pBlob->template Get<Tensor>();
 
       auto gBlob = sharedWs_->CreateBlob(rg.grad);
       CAFFE_ENFORCE(gBlob);
-      auto* g = gBlob->template GetMutable<Tensor<Context>>();
+      auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
       g->ResizeLike(p);
       CAFFE_ENFORCE_EQ(g->ndim(), 3);
       const auto timestep = g->size() / g->dim(0);
@@ -701,7 +701,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
               << ". Size: " << Input(gradientInputIndex).size();
       auto pGradientBlob = sharedWs_->GetBlob(gradientName);
       CAFFE_ENFORCE(pGradientBlob);
-      auto* g = pGradientBlob->template GetMutable<Tensor<Context>>();
+      auto* g = pGradientBlob->GetMutableTensor(Context::GetDeviceType());
       g->ResizeLike(Input(gradientInputIndex));
       g->template mutable_data<T>();
     }
@@ -715,11 +715,11 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
                 << rg.lastExternalGrad << " for final time step (sep. blob)";
         auto gBlob = sharedWs_->GetBlob(rg.grad);
         CAFFE_ENFORCE(gBlob);
-        auto* g = gBlob->template GetMutable<Tensor<Context>>();
+        auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
 
         auto oglastBlob = sharedWs_->GetBlob(rg.lastExternalGrad);
         CAFFE_ENFORCE(oglastBlob);
-        const auto& oglast = oglastBlob->template Get<Tensor<Context>>();
+        const auto& oglast = oglastBlob->template Get<Tensor>();
         CAFFE_ENFORCE_EQ(g->dim(1), oglast.dim(1));
         CAFFE_ENFORCE_EQ(g->dim(2), oglast.dim(2));
 
@@ -777,7 +777,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
       T* output_data = Output(outputIdx)->template mutable_data<T>();
       auto pBlob = sharedWs_->GetBlob(recurrentGradients_[i].grad);
       CAFFE_ENFORCE(pBlob);
-      auto* p = pBlob->template GetMutable<Tensor<Context>>();
+      auto* p = pBlob->GetMutableTensor(Context::GetDeviceType());
 
       if (Input(inputId).ndim() >= 2) {
         // Gradient states blob should live. And if it gets changed by the
@@ -841,7 +841,7 @@ class AccumulateInputGradientOp : public Operator<Context> {
 
   template<typename T>
   bool DoRunWithType() {
-    const auto& t0 = OperatorBase::Input<Tensor<CPUContext>>(0);
+    const auto& t0 = OperatorBase::Input<Tensor>(0, CPU);
     const auto t = t0.template data<int32_t>()[0];
     auto& og = Input(1);
     auto* g = Output(0);
@@ -890,7 +890,7 @@ class RNNApplyLinkOp : public Operator<Context> {
   bool DoRunWithType() {
     // Both internal and external appear as both input and output to enforce
     // correct dependency computation.
-    const auto& t0 = OperatorBase::Input<Tensor<CPUContext>>(0);
+    const auto& t0 = OperatorBase::Input<Tensor>(0, CPU);
     const auto t = t0.template data<int32_t>()[0];
     auto& external = Input(1);
 
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index fe556147ad67a..4b3496558436f 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -60,11 +60,11 @@ RecurrentBaseOp<T>::~RecurrentBaseOp() {
 
 template <typename T>
 void RecurrentBaseOp<T>::initialize(
-    const Tensor<CUDAContext>& input,
-    Tensor<CUDAContext>* dropoutStates,
-    Tensor<CUDAContext>* output,
-    Tensor<CUDAContext>* hiddenOutput,
-    Tensor<CUDAContext>* cellOutput) {
+    const Tensor& input,
+    Tensor* dropoutStates,
+    Tensor* output,
+    Tensor* hiddenOutput,
+    Tensor* cellOutput) {
   static_assert(sizeof(T) == 4, ""); // workaround clang bug
   CAFFE_ENFORCE_GE(input.ndim(), 3);
   const int seqLength = input.dim(0);
@@ -458,13 +458,13 @@ bool RecurrentParamAccessOp<T, mode>::RunOnDevice() {
     if (mode == SET_PARAM) {
       CAFFE_ENFORCE_EQ(
           biasDims[0] * biasDims[1] * biasDims[2], Input(2).size());
-      context_.template Copy<T, CUDAContext, CUDAContext>(
+      context_.template CopySameDevice<T>(
           biasDims[0] * biasDims[1] * biasDims[2],
           Input(2).template data<T>(),
           static_cast<T*>(bias));
     } else {
       Output(0)->Resize(biasDims);
-      context_.template Copy<T, CUDAContext, CUDAContext>(
+      context_.template CopySameDevice<T>(
           biasDims[0] * biasDims[1] * biasDims[2],
           static_cast<T*>(bias),
           Output(0)->template mutable_data<T>());
@@ -495,13 +495,13 @@ bool RecurrentParamAccessOp<T, mode>::RunOnDevice() {
     CAFFE_ENFORCE_EQ(numDims, 3);
     if (mode == SET_PARAM) {
       CAFFE_ENFORCE_EQ(matDims[0] * matDims[1] * matDims[2], Input(2).size());
-      context_.template Copy<T, CUDAContext, CUDAContext>(
+      context_.template CopySameDevice<T>(
           matDims[0] * matDims[1] * matDims[2],
           Input(2).template data<T>(),
           static_cast<T*>(pmatrix));
     } else {
       Output(0)->Resize(matDims);
-      context_.template Copy<T, CUDAContext, CUDAContext>(
+      context_.template CopySameDevice<T>(
           matDims[0] * matDims[1] * matDims[2],
           static_cast<T*>(pmatrix),
           Output(0)->template mutable_data<T>());
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.h b/caffe2/operators/rnn/recurrent_op_cudnn.h
index 25bcc204a1e63..5c70b52620299 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.h
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.h
@@ -37,12 +37,12 @@ class RecurrentBaseOp : public Operator<CUDAContext> {
 
  protected:
   void initialize(
-      const Tensor<CUDAContext>& input,
-      Tensor<CUDAContext>* dropoutStates = nullptr,
+      const Tensor& input,
+      Tensor* dropoutStates = nullptr,
       // If passed, reshapes to the appropriate size
-      Tensor<CUDAContext>* output = nullptr,
-      Tensor<CUDAContext>* hiddenOutput = nullptr,
-      Tensor<CUDAContext>* cellOutput = nullptr);
+      Tensor* output = nullptr,
+      Tensor* hiddenOutput = nullptr,
+      Tensor* cellOutput = nullptr);
 
   CuDNNWrapper cudnn_wrapper_;
   cudnnDropoutDescriptor_t dropoutDesc_;
diff --git a/caffe2/operators/roi_align_gradient_op.cc b/caffe2/operators/roi_align_gradient_op.cc
index 1cc4103a535b5..269b57f94068e 100644
--- a/caffe2/operators/roi_align_gradient_op.cc
+++ b/caffe2/operators/roi_align_gradient_op.cc
@@ -202,7 +202,7 @@ bool RoIAlignGradientOp<float, CPUContext>::RunOnDevice() {
   // Must zero-out dX before accumulating gradients
   // (TODO): Kaiming - is this safe?
   math::Set<float, CPUContext>(
-      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
+      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
 
   if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
     ROIAlignBackwardFeature<float>(
@@ -216,7 +216,7 @@ bool RoIAlignGradientOp<float, CPUContext>::RunOnDevice() {
         pooled_height_,
         pooled_width_,
         sampling_ratio_,
-        dX->mutable_data<float>(),
+        dX->template mutable_data<float>(),
         R.data<float>(),
         R.dim32(1));
   }
diff --git a/caffe2/operators/roi_align_gradient_op.cu b/caffe2/operators/roi_align_gradient_op.cu
index 534d55ddd9a46..0ba327663ec59 100644
--- a/caffe2/operators/roi_align_gradient_op.cu
+++ b/caffe2/operators/roi_align_gradient_op.cu
@@ -193,15 +193,15 @@ bool RoIAlignGradientOp<float, CUDAContext>::RunOnDevice() {
   auto& R = Input(1); // RoIs
   auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
-  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
-                        // (aka "gradInput")
+  auto* dX = Output(0); // Gradient of net w.r.t. input to
+                        // "forward" op (aka "gradInput")
 
   dX->ResizeLike(X);
 
   // Must zero-out dX before accumulating gradients
   // (TODO): Kaiming - is this safe?
   math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
+      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
 
   if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
     RoIAlignBackwardFeature<float>
@@ -219,7 +219,7 @@ bool RoIAlignGradientOp<float, CUDAContext>::RunOnDevice() {
             pooled_height_,
             pooled_width_,
             sampling_ratio_,
-            dX->mutable_data<float>(),
+            dX->template mutable_data<float>(),
             R.data<float>());
   }
   return true;
diff --git a/caffe2/operators/roi_align_op.cc b/caffe2/operators/roi_align_op.cc
index 0d62dcfd71bee..f0c6a10f7cdb9 100644
--- a/caffe2/operators/roi_align_op.cc
+++ b/caffe2/operators/roi_align_op.cc
@@ -283,7 +283,7 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
       Y->Resize(0, pooled_height_, pooled_width_, X.dim32(3));
     }
     // The following mutable_data calls are needed to allocate the tensors
-    Y->mutable_data<float>();
+    Y->template mutable_data<float>();
     return true;
   }
 
@@ -308,7 +308,7 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
         sampling_ratio_,
         R.data<float>(),
         R.dim32(1),
-        Y->mutable_data<float>(),
+        Y->template mutable_data<float>(),
         order_);
   } else if (order_ == StorageOrder::NHWC) {
     Y->Resize(R.dim32(0), pooled_height_, pooled_width_, X.dim32(3));
@@ -325,7 +325,7 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
         sampling_ratio_,
         R.data<float>(),
         R.dim32(1),
-        Y->mutable_data<float>(),
+        Y->template mutable_data<float>(),
         order_);
   }
 
diff --git a/caffe2/operators/roi_align_op.cu b/caffe2/operators/roi_align_op.cu
index e512f3d974139..bfd108ff24c85 100644
--- a/caffe2/operators/roi_align_op.cu
+++ b/caffe2/operators/roi_align_op.cu
@@ -156,7 +156,7 @@ bool RoIAlignOp<float, CUDAContext>::RunOnDevice() {
     // Handle empty rois
     Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
     // The following mutable_data calls are needed to allocate the tensors
-    Y->mutable_data<float>();
+    Y->template mutable_data<float>();
     return true;
   }
 
diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc
index ed4ef33a1d688..92eafefcb65eb 100644
--- a/caffe2/operators/roi_align_op_gpu_test.cc
+++ b/caffe2/operators/roi_align_op_gpu_test.cc
@@ -18,7 +18,7 @@ void AddConstInput(
     Context* context,
     Workspace* ws) {
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<Tensor<Context>>();
+  auto* tensor = blob->GetMutableTensor(Context::GetDeviceType());
   tensor->Resize(shape);
   math::Set<float, Context>(
       tensor->size(), value, tensor->template mutable_data<float>(), context);
@@ -39,10 +39,10 @@ void AddInput<CPUContext>(
     const string& name,
     Workspace* ws) {
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
   EigenVectorMap<float> tensor_vec(
-      tensor->mutable_data<float>(), tensor->size());
+      tensor->template mutable_data<float>(), tensor->size());
   tensor_vec.array() = utils::AsEArrXt(values);
 }
 
@@ -52,12 +52,12 @@ void AddInput<CUDAContext>(
     const vector<float>& values,
     const string& name,
     Workspace* ws) {
-  TensorCPU tmp(shape);
+  Tensor tmp(shape, CPU);
   EigenVectorMap<float> tmp_vec(tmp.mutable_data<float>(), tmp.size());
   tmp_vec.array() = utils::AsEArrXt(values);
 
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->template GetMutable<Tensor<CUDAContext>>();
+  auto* tensor = blob->GetMutableTensor(CUDA);
   tensor->CopyFrom(tmp);
 }
 
@@ -186,7 +186,7 @@ void CreateAndRun(
   Blob* Y_blob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, Y_blob);
 
-  auto& Y = Y_blob->Get<Tensor<Context>>();
+  auto& Y = Y_blob->Get<Tensor>();
   outResult->CopyFrom(Y, &context);
 }
 
@@ -196,9 +196,9 @@ TEST(RoiAlignTest, CheckCPUGPUEqual) {
   if (!caffe2::HasCudaGPU())
     return;
 
-  TensorCPU y_cpu;
-  TensorCPU y_gpu;
-  TensorCPU y_cpu_nhwc;
+  Tensor y_cpu(CPU);
+  Tensor y_gpu(CPU);
+  Tensor y_cpu_nhwc(CPU);
 
   // tests using FAIR example
   {
diff --git a/caffe2/operators/roi_pool_op.cc b/caffe2/operators/roi_pool_op.cc
index d369aecd638e6..00e3ccde3e4ef 100644
--- a/caffe2/operators/roi_pool_op.cc
+++ b/caffe2/operators/roi_pool_op.cc
@@ -31,8 +31,8 @@ bool RoIPoolOp<float, CPUContext>::RunOnDevice() {
 
   const float* Xdata = X.data<float>();
   const float* rois = R.data<float>();
-  float* Ydata = Y->mutable_data<float>();
-  int* argmax_data = is_test_ ? nullptr : A->mutable_data<int>();
+  float* Ydata = Y->template mutable_data<float>();
+  int* argmax_data = is_test_ ? nullptr : A->template mutable_data<int>();
 
   // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
   for (int n = 0; n < num_rois; ++n) {
diff --git a/caffe2/operators/roi_pool_op.cu b/caffe2/operators/roi_pool_op.cu
index 45839117b2eda..db18b3f551a74 100644
--- a/caffe2/operators/roi_pool_op.cu
+++ b/caffe2/operators/roi_pool_op.cu
@@ -133,10 +133,10 @@ bool RoIPoolOp<float, CUDAContext>::RunOnDevice() {
   if (R.size() == 0) {
     Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
     // mutable_data calls are needed to allocate the tensors
-    Y->mutable_data<float>();
+    Y->template mutable_data<float>();
     if (!is_test_) {
       A->Resize(Y->dims());
-      A->mutable_data<int>();
+      A->template mutable_data<int>();
     }
     return true;
   }
@@ -146,23 +146,23 @@ bool RoIPoolOp<float, CUDAContext>::RunOnDevice() {
     A->Resize(Y->dims());
   }
   int output_size = Y->size();
-  int* argmax_data = is_test_ ? nullptr : A->mutable_data<int>();
-  ROIPoolForward<float><<<
-      CAFFE_GET_BLOCKS(output_size),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      output_size,
-      X.data<float>(),
-      spatial_scale_,
-      X.dim32(1),
-      X.dim32(2),
-      X.dim32(3),
-      pooled_height_,
-      pooled_width_,
-      R.data<float>(),
-      Y->mutable_data<float>(),
-      argmax_data);
+  int* argmax_data = is_test_ ? nullptr : A->template mutable_data<int>();
+  ROIPoolForward<float>
+      <<<CAFFE_GET_BLOCKS(output_size),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          spatial_scale_,
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          pooled_height_,
+          pooled_width_,
+          R.data<float>(),
+          Y->template mutable_data<float>(),
+          argmax_data);
   return true;
 }
 
@@ -179,25 +179,25 @@ bool RoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
   dX->ResizeLike(X);
   // Must zero-out dX before accumulating gradients
   math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
+      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
   if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
-    ROIPoolBackward<float><<<
-        CAFFE_GET_BLOCKS(dY.size()),
-        CAFFE_CUDA_NUM_THREADS,
-        0,
-        context_.cuda_stream()>>>(
-        dY.size(),
-        dY.data<float>(),
-        A.data<int>(),
-        R.dim32(0),
-        spatial_scale_,
-        X.dim32(1),
-        X.dim32(2),
-        X.dim32(3),
-        pooled_height_,
-        pooled_width_,
-        dX->mutable_data<float>(),
-        R.data<float>());
+    ROIPoolBackward<float>
+        <<<CAFFE_GET_BLOCKS(dY.size()),
+           CAFFE_CUDA_NUM_THREADS,
+           0,
+           context_.cuda_stream()>>>(
+            dY.size(),
+            dY.data<float>(),
+            A.data<int>(),
+            R.dim32(0),
+            spatial_scale_,
+            X.dim32(1),
+            X.dim32(2),
+            X.dim32(3),
+            pooled_height_,
+            pooled_width_,
+            dX->template mutable_data<float>(),
+            R.data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/scale_op.cc b/caffe2/operators/scale_op.cc
index f246db4495231..db0e3e1c42dac 100644
--- a/caffe2/operators/scale_op.cc
+++ b/caffe2/operators/scale_op.cc
@@ -4,15 +4,15 @@ namespace caffe2 {
 
 REGISTER_CPU_OPERATOR(Scale, ScaleOp<CPUContext>);
 OPERATOR_SCHEMA(Scale)
-  .NumInputs(1)
-  .NumOutputs(1)
-  .AllowInplace({{0, 0}})
-  .IdenticalTypeAndShape()
-  .SetDoc(R"DOC(
-Scale takes one input data (Tensor<float>) and produces one output data
-(Tensor<float>) whose value is the input data tensor scaled element-wise.
+    .NumInputs(1)
+    .NumOutputs(1)
+    .AllowInplace({{0, 0}})
+    .IdenticalTypeAndShape()
+    .SetDoc(R"DOC(
+Scale takes one input data (Tensor) and produces one output data
+(Tensor) whose value is the input data tensor scaled element-wise.
 )DOC")
-  .Arg("scale", "(float, default 1.0) the scale to apply.");
+    .Arg("scale", "(float, default 1.0) the scale to apply.");
 
 class GetScaleGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h
index 1d51692ac71f1..50b344611ee82 100644
--- a/caffe2/operators/segment_reduction_op.h
+++ b/caffe2/operators/segment_reduction_op.h
@@ -13,7 +13,7 @@ class BaseInputAccessor {
  public:
   BaseInputAccessor() {}
 
-  bool observeInput(const Tensor<CPUContext>& dataInput) {
+  bool observeInput(const Tensor& dataInput) {
     data_ = dataInput.raw_data();
     return dataInput.template IsType<TData>();
   }
@@ -373,7 +373,7 @@ class AbstractReduceFrontOrBackGradientOp : public Operator<Context> {
   template <int FixedSize>
   bool DoRunWithValue() {
     auto& reduction_grad = Input(REDUCTION_GRAD);
-    auto& source_shape = OperatorBase::Input<TensorCPU>(SOURCE_SHAPE);
+    auto& source_shape = OperatorBase::Input<Tensor>(SOURCE_SHAPE, CPU);
 
     auto* data_grads = Output(0);
 
diff --git a/caffe2/operators/segment_reduction_op_gpu.cu b/caffe2/operators/segment_reduction_op_gpu.cu
index 9a2d3a8f78ee0..6eec2deba9ce2 100644
--- a/caffe2/operators/segment_reduction_op_gpu.cu
+++ b/caffe2/operators/segment_reduction_op_gpu.cu
@@ -13,8 +13,8 @@ namespace {
 void inclusive_scan_wrapper(
     const int* length_data,
     int len_length,
-    Tensor<CUDAContext>* temp_buffer,
-    Tensor<CUDAContext>* prefix_sum_out,
+    Tensor* temp_buffer,
+    Tensor* prefix_sum_out,
     CUDAContext* context_) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -22,19 +22,20 @@ void inclusive_scan_wrapper(
       NULL,
       temp_storage_bytes,
       length_data,
-      prefix_sum_out->mutable_data<int>(),
+      prefix_sum_out->template mutable_data<int>(),
       len_length,
       context_->cuda_stream());
   // Allocate temporary storage
   auto buffer_size = (temp_storage_bytes + sizeof(int)) / sizeof(int);
   temp_buffer->Resize(buffer_size);
-  void* d_temp_storage = static_cast<void*>(temp_buffer->mutable_data<int>());
+  void* d_temp_storage =
+      static_cast<void*>(temp_buffer->template mutable_data<int>());
   // Run inclusive prefix sum
   cub::DeviceScan::InclusiveSum(
       d_temp_storage,
       temp_storage_bytes,
       length_data,
-      prefix_sum_out->mutable_data<int>(),
+      prefix_sum_out->template mutable_data<int>(),
       len_length,
       context_->cuda_stream());
 }
@@ -523,8 +524,8 @@ class CUDASparseLengthsSumOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename T, class Context = CUDAContext, bool SparseFused = true>
@@ -645,8 +646,8 @@ class CUDASparseLengthsMeanOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename T, class Context = CUDAContext, bool SparseFused = true>
@@ -779,8 +780,8 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename T, class Context = CUDAContext, bool SparseFused = true>
@@ -879,8 +880,8 @@ class CUDASparseLengthsWeightedSumOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename SIndex>
@@ -988,7 +989,7 @@ class CUDAUnsortedSegmentSumOp : public Operator<CUDAContext> {
     }
 
     SIndex K = 0;
-    context_.CopyBytes<CUDAContext, CPUContext>(
+    context_.CopyBytesToCPU(
         sizeof(SIndex), K_tensor_.template data<SIndex>(), &K);
     context_.FinishDeviceComputation();
 
@@ -1046,9 +1047,9 @@ class CUDAUnsortedSegmentSumOp : public Operator<CUDAContext> {
   }
 
  private:
-  Tensor<CUDAContext> buffer_tensor_;
-  Tensor<CUDAContext> K_tensor_;
-  Tensor<CUDAContext> scaling_factors_; // for mean
+  Tensor buffer_tensor_{CUDA};
+  Tensor K_tensor_{CUDA};
+  Tensor scaling_factors_{CUDA}; // for mean
 };
 
 template <typename SIndex>
@@ -1097,7 +1098,7 @@ class SortedSegmentRangeMeanOp : public Operator<Context> {
     auto* output = Output(0);
     auto dims = input.dims();
     SIndex K = 0;
-    context_.template CopyBytes<Context, CPUContext>(
+    context_.CopyBytesToCPU(
         sizeof(SIndex),
         indices.template data<SIndex>() + indices.size() - 1,
         &K);
@@ -1156,9 +1157,9 @@ class SortedSegmentRangeMeanOp : public Operator<Context> {
   }
 
  private:
-  Tensor<CUDAContext> segment_len_; // for mean
-  Tensor<CUDAContext> segment_len_prefix_sum_;
-  Tensor<CUDAContext> prefix_buffer_;
+  Tensor segment_len_{CUDA}; // for mean
+  Tensor segment_len_prefix_sum_{CUDA};
+  Tensor prefix_buffer_{CUDA};
 };
 
 template <typename T, typename SIndex, bool LOGEXP = false>
@@ -1201,7 +1202,7 @@ class SortedSegmentRangeMeanGradientOp : public Operator<Context> {
     const int N = X.size_from_dim(1);
 
     SIndex K = 0;
-    context_.template CopyBytes<Context, CPUContext>(
+    context_.CopyBytesToCPU(
         sizeof(SIndex), I.template data<SIndex>() + I.size() - 1, &K);
 
     K += 1;
@@ -1241,7 +1242,7 @@ class SortedSegmentRangeMeanGradientOp : public Operator<Context> {
   }
 
  private:
-  Tensor<CUDAContext> segment_len_; // for mean
+  Tensor segment_len_{CUDA}; // for mean
 };
 
 REGISTER_CUDA_OPERATOR_STR(
@@ -1358,8 +1359,8 @@ class CUDASparseLengthsSumGradientWithIndicesOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename T, class Context = CUDAContext>
@@ -1437,8 +1438,8 @@ class CUDASparseLengthsMeanGradientWithIndicesOp
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename T, class Context = CUDAContext>
@@ -1526,8 +1527,8 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename T, bool ExactBlock = false>
@@ -1664,8 +1665,8 @@ class CUDALengthsMaxWithMainInputAndForwardOutputGradientOp
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 template <typename T, class Context = CUDAContext>
@@ -1792,8 +1793,8 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
 
  private:
   // menber field to manage memory
-  Tensor<Context> inclusive_scan_buffer_;
-  Tensor<Context> inclusive_scan_length_buffer_;
+  Tensor inclusive_scan_buffer_{CUDA};
+  Tensor inclusive_scan_length_buffer_{CUDA};
 };
 
 // Needed because name is auto-generated in segment_reduction_op.cc:224
diff --git a/caffe2/operators/selu_op.cc b/caffe2/operators/selu_op.cc
index 50d823d8bedf1..45467ef2c8183 100644
--- a/caffe2/operators/selu_op.cc
+++ b/caffe2/operators/selu_op.cc
@@ -12,7 +12,7 @@ bool SeluOp<float, CPUContext>::RunOnDevice() {
   Y->ResizeLike(X);
 
   ConstEigenVectorArrayMap<float> Xvec(X.data<float>(), X.size());
-  EigenVectorArrayMap<float> Yvec(Y->mutable_data<float>(), Y->size());
+  EigenVectorArrayMap<float> Yvec(Y->template mutable_data<float>(), Y->size());
   Yvec = lambda_ * (Xvec > 0).select(Xvec, (alpha_ * Xvec.exp() - alpha_));
   return true;
 }
@@ -27,7 +27,8 @@ bool SeluGradientOp<float, CPUContext>::RunOnDevice() {
 
   ConstEigenVectorArrayMap<float> Yvec(Y.data<float>(), Y.size());
   ConstEigenVectorArrayMap<float> dYvec(dY.data<float>(), dY.size());
-  EigenVectorArrayMap<float> dXvec(dX->mutable_data<float>(), dX->size());
+  EigenVectorArrayMap<float> dXvec(
+      dX->template mutable_data<float>(), dX->size());
 
   const float la = lambda_ * alpha_;
   dXvec = (Yvec > 0).select(lambda_ * dYvec, dYvec * (Yvec + la));
diff --git a/caffe2/operators/selu_op.cu b/caffe2/operators/selu_op.cu
index 95eb2c54ee96a..f2339acb20a20 100644
--- a/caffe2/operators/selu_op.cu
+++ b/caffe2/operators/selu_op.cu
@@ -38,7 +38,11 @@ bool SeluOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->mutable_data<float>(), alpha_, lambda_);
+      X.size(),
+      X.data<float>(),
+      Y->template mutable_data<float>(),
+      alpha_,
+      lambda_);
   return true;
 }
 
@@ -58,7 +62,7 @@ bool SeluGradientOp<float, CUDAContext>::RunOnDevice() {
       Y.size(),
       Y.data<float>(),
       dY.data<float>(),
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
       alpha_,
       lambda_);
   return true;
diff --git a/caffe2/operators/sequence_ops.cc b/caffe2/operators/sequence_ops.cc
index 4dd8e65aa3842..2b7b820956867 100644
--- a/caffe2/operators/sequence_ops.cc
+++ b/caffe2/operators/sequence_ops.cc
@@ -95,7 +95,7 @@ bool RemovePaddingOp<CPUContext>::DoRunWithType() {
   std::transform(
       lengths_ptr,
       lengths_ptr + lengths_size,
-      lengths_out->mutable_data<int32_t>(),
+      lengths_out->template mutable_data<int32_t>(),
       [pad_width](int32_t x) { return x - pad_width; });
   return true;
 }
@@ -156,7 +156,7 @@ bool AddPaddingOp<CPUContext>::MakePadding(
   std::transform(
       lengths_ptr,
       lengths_ptr + lengths_size,
-      lengths_out->mutable_data<int32_t>(),
+      lengths_out->template mutable_data<int32_t>(),
       [pad_width](int32_t x) { return x + pad_width; });
   return true;
 }
@@ -203,7 +203,7 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
         static_cast<char*>(out_features->raw_mutable_data(features.meta()));
     auto src_base = static_cast<const char*>(features.raw_data());
     // copy data and add padding index as zero
-    Tensor<CPUContext> zero;
+    Tensor zero{CPU};
     zero.Resize(block_size);
     auto zeroPtr =
         static_cast<const char*>(zero.raw_mutable_data(features.meta()));
@@ -211,7 +211,7 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
     int start_src = 0;
     for (int i = 0; i < lengths.size(); ++i) {
       if (lengthsPtr[i] == 0) {
-        context_.template CopyItems<CPUContext, CPUContext>(
+        context_.CopyItemsSameDevice(
             features.meta(),
             block_size,
             zeroPtr,
@@ -219,7 +219,7 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
         start_dest += block_size;
       } else {
         auto src = src_base + start_src * features.meta().itemsize();
-        context_.template CopyItems<CPUContext, CPUContext>(
+        context_.CopyItemsSameDevice(
             features.meta(),
             lengthsPtr[i] * block_size,
             src,
diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu
index 549c288c0368b..95ad9ece32d41 100644
--- a/caffe2/operators/sequence_ops.cu
+++ b/caffe2/operators/sequence_ops.cu
@@ -126,8 +126,8 @@ template <bool Inclusive = true>
 void lengths_prefix_sum(
     const int32_t* lengths,
     int32_t num_items,
-    Tensor<CUDAContext>* prefix_buffer,
-    Tensor<CUDAContext>* prefix_sum,
+    Tensor* prefix_buffer,
+    Tensor* prefix_sum,
     CUDAContext* context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -137,7 +137,7 @@ void lengths_prefix_sum(
         NULL,
         temp_storage_bytes,
         lengths,
-        prefix_sum->mutable_data<int32_t>(),
+        prefix_sum->template mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   } else {
@@ -145,7 +145,7 @@ void lengths_prefix_sum(
         NULL,
         temp_storage_bytes,
         lengths,
-        prefix_sum->mutable_data<int32_t>(),
+        prefix_sum->template mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   }
@@ -154,14 +154,14 @@ void lengths_prefix_sum(
   auto buffer_size = (temp_storage_bytes + sizeof(int32_t)) / sizeof(int32_t);
   prefix_buffer->Resize(buffer_size);
   void* d_temp_storage =
-      static_cast<void*>(prefix_buffer->mutable_data<int32_t>());
+      static_cast<void*>(prefix_buffer->template mutable_data<int32_t>());
 
   if (Inclusive) {
     cub::DeviceScan::InclusiveSum(
         d_temp_storage,
         temp_storage_bytes,
         lengths,
-        prefix_sum->mutable_data<int32_t>(),
+        prefix_sum->template mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   } else {
@@ -169,7 +169,7 @@ void lengths_prefix_sum(
         d_temp_storage,
         temp_storage_bytes,
         lengths,
-        prefix_sum->mutable_data<int32_t>(),
+        prefix_sum->template mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   }
@@ -204,7 +204,7 @@ bool AddPaddingOp<CUDAContext>::MakePadding(
   if (OutputSize() > 1) {
     auto* lengths_out = Output(1);
     lengths_out->Resize(lengths_size);
-    lengths_out_ptr = lengths_out->mutable_data<int32_t>();
+    lengths_out_ptr = lengths_out->template mutable_data<int32_t>();
   }
 
   if (lengths_size == 0) {
@@ -274,7 +274,7 @@ bool RemovePaddingOp<CUDAContext>::DoRunWithType() {
   if (OutputSize() > 1) {
     auto* lengths_out = Output(1);
     lengths_out->Resize(lengths_size);
-    lengths_out_ptr = lengths_out->mutable_data<int32_t>();
+    lengths_out_ptr = lengths_out->template mutable_data<int32_t>();
   }
 
   if (lengths_size == 0) {
diff --git a/caffe2/operators/sequence_ops.h b/caffe2/operators/sequence_ops.h
index c29ff7bfda64a..d91f3f701c199 100644
--- a/caffe2/operators/sequence_ops.h
+++ b/caffe2/operators/sequence_ops.h
@@ -93,8 +93,8 @@ class GatherPaddingOp final : public Operator<Context> {
   int startPaddingWidth_;
   int endPaddingWidth_;
   // Scratch space required by the CUDA version
-  Tensor<Context> lengths_prefix_sum_buffer_;
-  Tensor<Context> lengths_prefix_sum_;
+  Tensor lengths_prefix_sum_buffer_{Context::GetDeviceType()};
+  Tensor lengths_prefix_sum_{Context::GetDeviceType()};
 };
 
 template <class Context>
@@ -133,8 +133,8 @@ class RemovePaddingOp final : public Operator<Context> {
   int endPaddingWidth_;
 
   // Scratch space required by the CUDA version
-  Tensor<Context> lengths_prefix_sum_buffer_;
-  Tensor<Context> lengths_prefix_sum_;
+  Tensor lengths_prefix_sum_buffer_{Context::GetDeviceType()};
+  Tensor lengths_prefix_sum_{Context::GetDeviceType()};
 };
 
 template <class Context>
@@ -236,8 +236,8 @@ class AddPaddingOp final : public Operator<Context> {
   int endPaddingWidth_;
 
   // Scratch space required by the CUDA version
-  Tensor<Context> lengths_prefix_sum_buffer_;
-  Tensor<Context> lengths_prefix_sum_;
+  Tensor lengths_prefix_sum_buffer_{Context::GetDeviceType()};
+  Tensor lengths_prefix_sum_{Context::GetDeviceType()};
 };
 
 template <class Context>
diff --git a/caffe2/operators/shape_op.h b/caffe2/operators/shape_op.h
index 128a00a3d1561..05ea7a2f7c5fe 100644
--- a/caffe2/operators/shape_op.h
+++ b/caffe2/operators/shape_op.h
@@ -19,13 +19,13 @@ class ShapeOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& data = Input(DATA);
-    auto* output = OperatorBase::Output<Tensor<Context>>(0);
+    auto* output = Output(0);
     int numDims = data.ndim();
     int numAxes = axes_.size();
     if (numAxes == 0) {
       output->Resize(numDims);
       TIndex* output_data = output->template mutable_data<TIndex>();
-      context_.template CopyBytes<Context, Context>(
+      context_.CopyBytesSameDevice(
           numDims * sizeof(TIndex), data.dims().data(), output_data);
       return true;
     }
@@ -37,7 +37,7 @@ class ShapeOp : public Operator<Context> {
       auto axis = axes_[i];
       CAFFE_ENFORCE_LT(axis, numDims, "Axis out of range");
       CAFFE_ENFORCE_GE(axis, 0, "Each axis should be non-negative");
-      context_.template CopyBytes<Context, Context>(
+      context_.CopyBytesSameDevice(
           sizeof(TIndex), src + axis * sizeof(TIndex), out);
       out += sizeof(TIndex);
     }
diff --git a/caffe2/operators/sinusoid_position_encoding_op.h b/caffe2/operators/sinusoid_position_encoding_op.h
index 5591b9749a704..101fd56d12603 100644
--- a/caffe2/operators/sinusoid_position_encoding_op.h
+++ b/caffe2/operators/sinusoid_position_encoding_op.h
@@ -28,7 +28,7 @@ class SinusoidPositionEncodingOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(0));
+        this, OperatorBase::Input<Tensor>(0, CPU));
   }
 
   template <typename Index>
diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu
index a9ac0db0d08fb..e2523ad7cbf3f 100644
--- a/caffe2/operators/slice_op.cu
+++ b/caffe2/operators/slice_op.cu
@@ -49,13 +49,13 @@ __global__ void SliceCopyKernel(
 
 template <class SIndex, class Context>
 bool SliceImplGpu(
-    Tensor<Context>* output,
-    const Tensor<Context>& data,
+    Tensor* output,
+    const Tensor& data,
     const TensorCPU& starts,
     const TensorCPU& ends,
     Context* context,
-    Tensor<Context>* gdata = nullptr,
-    const Tensor<Context>* go = nullptr) {
+    Tensor* gdata = nullptr,
+    const Tensor* go = nullptr) {
   bool backward = output == nullptr;
 
   auto* starts_data = starts.template data<SIndex>();
@@ -237,8 +237,8 @@ bool SliceOp<int, CUDAContext>::RunOnDevice() {
   auto& data = Input(0);
 
   if (InputSize() > 1) {
-    starts_host_.CopyFrom<CUDAContext>(Input(1));
-    ends_host_.CopyFrom<CUDAContext>(Input(2));
+    starts_host_.CopyFrom(Input(1));
+    ends_host_.CopyFrom(Input(2));
   } else {
     if (!statically_inited_) {
       CAFFE_ENFORCE(HasArgument("starts"));
@@ -272,8 +272,8 @@ bool SliceGradientOp<int, CUDAContext>::RunOnDevice() {
   auto& data = Input(0);
 
   if (InputSize() == 4) {
-    starts_host_.CopyFrom<CUDAContext>(Input(1));
-    ends_host_.CopyFrom<CUDAContext>(Input(2));
+    starts_host_.CopyFrom(Input(1));
+    ends_host_.CopyFrom(Input(2));
 
     auto& go = Input(3);
 
diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h
index 12734a8e33df7..6c8872db7fa6d 100644
--- a/caffe2/operators/slice_op.h
+++ b/caffe2/operators/slice_op.h
@@ -11,13 +11,13 @@ namespace {
 
 template <class SIndex, class Context>
 bool SliceImpl(
-    Tensor<Context>* output,
-    const Tensor<Context>& data,
-    const Tensor<Context>& starts,
-    const Tensor<Context>& ends,
+    Tensor* output,
+    const Tensor& data,
+    const Tensor& starts,
+    const Tensor& ends,
     Context* context,
-    Tensor<Context>* gdata = nullptr,
-    const Tensor<Context>* go = nullptr) {
+    Tensor* gdata = nullptr,
+    const Tensor* go = nullptr) {
   bool backward = output == nullptr;
 
   auto* starts_data = starts.template data<SIndex>();
@@ -140,7 +140,7 @@ bool SliceImpl(
       DCHECK_LE(
           static_cast<void*>(local_dst_offset_bytes + dst_block_size_bytes),
           static_cast<void*>(dst_bytes + dst_nbytes));
-      context->template CopyItems<Context, Context>(
+      context->CopyItemsSameDevice(
           data.meta(),
           dst_block_size,
           (void*)local_src_offset_bytes,
@@ -186,7 +186,7 @@ bool SliceImpl(
       DCHECK_LE(
           local_dst_offset_bytes + src_block_size_bytes,
           dst_bytes + dst_nbytes);
-      context->template CopyItems<Context, Context>(
+      context->CopyItemsSameDevice(
           go->meta(),
           src_block_size,
           (void*)local_src_offset_bytes,
@@ -213,10 +213,10 @@ class SliceOp : public Operator<Context> {
   }
 
  protected:
-  bool RunOnDeviceImpl(const Tensor<Context>& data, Tensor<Context>* output) {
+  bool RunOnDeviceImpl(const Tensor& data, Tensor* output) {
     if (InputSize() > 1) {
-      starts_host_.template CopyFrom<Context>(Input(1));
-      ends_host_.template CopyFrom<Context>(Input(2));
+      starts_host_.CopyFrom(Input(1));
+      ends_host_.CopyFrom(Input(2));
     } else {
       if (!statically_inited_) {
         CAFFE_ENFORCE(HasArgument("starts"));
@@ -248,8 +248,8 @@ class SliceOp : public Operator<Context> {
   std::vector<SIndex> starts_;
   std::vector<SIndex> ends_;
   bool statically_inited_;
-  TensorCPU starts_host_;
-  TensorCPU ends_host_;
+  Tensor starts_host_{CPU};
+  Tensor ends_host_{CPU};
 };
 
 template <class SIndex, class Context>
@@ -267,8 +267,8 @@ class SliceGradientOp : public Operator<Context> {
     auto& data = Input(0);
 
     if (InputSize() == 4) {
-      starts_host_.template CopyFrom<Context>(Input(1));
-      ends_host_.template CopyFrom<Context>(Input(2));
+      starts_host_.CopyFrom(Input(1));
+      ends_host_.CopyFrom(Input(2));
 
       auto& go = Input(3);
 
@@ -307,7 +307,7 @@ class SliceGradientOp : public Operator<Context> {
   std::vector<SIndex> starts_;
   std::vector<SIndex> ends_;
   bool statically_inited_;
-  TensorCPU starts_host_;
-  TensorCPU ends_host_;
+  Tensor starts_host_{CPU};
+  Tensor ends_host_{CPU};
 };
 } // namespace caffe2
diff --git a/caffe2/operators/softmax_op.cc b/caffe2/operators/softmax_op.cc
index 881b939dd0bb8..3f338492ce3e1 100644
--- a/caffe2/operators/softmax_op.cc
+++ b/caffe2/operators/softmax_op.cc
@@ -12,7 +12,7 @@ bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
   const int N = X.size_to_dim(canonical_axis);
   const int D = X.size_from_dim(canonical_axis);
   Y->ResizeLike(X);
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   // First, get scales
   if (scale_.size() != N) {
     scale_.Resize(N);
@@ -64,7 +64,7 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
   if (N == 0) {
     return true;
   }
-  context_.Copy<float, CPUContext, CPUContext>(Y.size(), dYdata, dXdata);
+  context_.CopySameDevice<float>(Y.size(), dYdata, dXdata);
   float* scaledata = scale_.mutable_data<float>();
   for (int i = 0; i < N; ++i) {
     math::Dot<float, CPUContext>(D, Ydata + i * D, dYdata + i * D,
diff --git a/caffe2/operators/softmax_op.h b/caffe2/operators/softmax_op.h
index 9073a0e6a98f7..8fd70fded99b1 100644
--- a/caffe2/operators/softmax_op.h
+++ b/caffe2/operators/softmax_op.h
@@ -19,9 +19,9 @@ class SoftmaxOp final : public Operator<Context> {
 
  protected:
   int axis_;
-  Tensor<Context> scale_;
-  Tensor<Context> rowmax_;
-  Tensor<Context> sum_multiplier_;
+  Tensor scale_{Context::GetDeviceType()};
+  Tensor rowmax_{Context::GetDeviceType()};
+  Tensor sum_multiplier_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -35,8 +35,8 @@ class SoftmaxGradientOp final : public Operator<Context> {
 
  protected:
   int axis_;
-  Tensor<Context> scale_;
-  Tensor<Context> sum_multiplier_;
+  Tensor scale_{Context::GetDeviceType()};
+  Tensor sum_multiplier_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
index 08dbf6e7d07a4..05b91c3b4d164 100644
--- a/caffe2/operators/softmax_ops.cu
+++ b/caffe2/operators/softmax_ops.cu
@@ -243,7 +243,7 @@ void Softmax(
 
   math::RowwiseMax<float, CUDAContext>(N, D, logits, rowmax, context);
   // Put the intermediate result X - max(X) into Y
-  context->Copy<float, CUDAContext, CUDAContext>(size, logits, probs);
+  context->CopySameDevice<float>(size, logits, probs);
   // Subtract the scale
   math::Gemm<float, CUDAContext>(
       CblasNoTrans,
@@ -327,7 +327,7 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
       sum_multiplier_.data<float>(),
       losses_.mutable_data<float>(),
       rowmax_.mutable_data<float>(),
-      P->mutable_data<float>(),
+      P->template mutable_data<float>(),
       !label_prob_mode_, // logarithmic output
       &context_);
   // Compute label xent loss per example
@@ -346,7 +346,7 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
     // Since we had logarithmic output, we need to exponentiate
     // them again.
     math::Exp<float, CUDAContext>(
-        N * D, P->data<float>(), P->mutable_data<float>(), &context_);
+        N * D, P->data<float>(), P->template mutable_data<float>(), &context_);
   } else {
     ProbCrossEntropyKernel<<<
         std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -375,7 +375,7 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
   }
 
   // Sum of all losses
-  float* avg_loss_data = avg_loss->mutable_data<float>();
+  float* avg_loss_data = avg_loss->template mutable_data<float>();
   math::Sum<float, CUDAContext>(
       losses_.size(), losses_.data<float>(), avg_loss_data, &context_, &scratch_);
   // Average of input batch size
@@ -413,7 +413,7 @@ bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
   }
 
   const float* Xdata = X.data<float>();
-  float* Pdata = P->mutable_data<float>();
+  float* Pdata = P->template mutable_data<float>();
 
   // Softmax for each x,y location
   SpatialSoftmaxKernel<<<
@@ -424,7 +424,7 @@ bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
 
   // Cross entropy
   avg_loss->Resize(vector<TIndex>());
-  float* avg_loss_data = avg_loss->mutable_data<float>();
+  float* avg_loss_data = avg_loss->template mutable_data<float>();
   math::Set<float, CUDAContext>(1, 0.0f, avg_loss_data, &context_);
 
   const int* label_data = T.data<int>();
@@ -516,15 +516,19 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
     if (weights == nullptr) {
       // Copy softmax probabilities into dX
       if (!only_loss_) {
-        context_.Copy<float, CUDAContext, CUDAContext>(
-            P.size(), P.data<float>(), dX->mutable_data<float>());
+        context_.CopySameDevice<float>(
+            P.size(), P.data<float>(), dX->template mutable_data<float>());
       }
       LabelCrossEntropyGradientKernel<<<
           CAFFE_GET_BLOCKS(N),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context_.cuda_stream()>>>(
-          N, D, P.data<float>(), T.data<int>(), dX->mutable_data<float>());
+          N,
+          D,
+          P.data<float>(),
+          T.data<int>(),
+          dX->template mutable_data<float>());
     } else {
       // Weighted version gets the Pdata values internally
       LabelCrossEntropyGradientKernelWeighted<<<
@@ -536,7 +540,7 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
           D,
           P.data<float>(),
           T.data<int>(),
-          dX->mutable_data<float>(),
+          dX->template mutable_data<float>(),
           weights);
     }
   } else {
@@ -549,7 +553,7 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
         D,
         P.data<float>(),
         T.data<float>(),
-        dX->mutable_data<float>(),
+        dX->template mutable_data<float>(),
         weights);
   }
   float total_weight = N;
@@ -571,14 +575,14 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
         dX->size(),
         scale_ / total_weight,
         dX->data<float>(),
-        dX->mutable_data<float>(),
+        dX->template mutable_data<float>(),
         &context_);
   }
   math::Scale<float, CUDAContext>(
       dX->size(),
       d_avg_loss.data<float>(),
       dX->data<float>(),
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
       &context_);
 
   return true;
@@ -620,14 +624,14 @@ bool SpatialSoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
   }
 
   const float* Pdata = P.data<float>();
-  float* dX_data = dX->mutable_data<float>();
+  float* dX_data = dX->template mutable_data<float>();
   const int* label_data = T.data<int>();
   const float* d_avg_loss_data = d_avg_loss.data<float>();
 
   // Copy softmax probabilities into dX. All but the neuron
   // corresponding to the correct label has gradient equaling e(x_j)
   // which is the probability under softmax.
-  context_.Copy<float, CUDAContext, CUDAContext>(P.size(), Pdata, dX_data);
+  context_.CopySameDevice<float>(P.size(), Pdata, dX_data);
 
   math::Set<float, CUDAContext>(
       1, 0.0f, total_weight_ptr_.mutable_data<float>(), &context_);
@@ -661,14 +665,14 @@ bool SpatialSoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
         dX->size(),
         scale_ / h_total_weight,
         dX->data<float>(),
-        dX->mutable_data<float>(),
+        dX->template mutable_data<float>(),
         &context_);
   }
   math::Scale<float, CUDAContext>(
       dX->size(),
       d_avg_loss.data<float>(),
       dX->data<float>(),
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
       &context_);
 
   return true;
diff --git a/caffe2/operators/softmax_shared.cc b/caffe2/operators/softmax_shared.cc
index 14e823c1f9c19..c1b376187937a 100644
--- a/caffe2/operators/softmax_shared.cc
+++ b/caffe2/operators/softmax_shared.cc
@@ -16,7 +16,7 @@ void SoftmaxCPU(
     float* rowmax) {
   math::RowwiseMax<float, CPUContext>(N, D, Xdata, rowmax, &context);
   // Put the intermediate result X - max(X) into Y
-  context.template Copy<float, CPUContext, CPUContext>(N * D, Xdata, Ydata);
+  context.template CopyFromCPU<float>(N * D, Xdata, Ydata);
   // Subtract the max (for numerical reasons)
   math::Gemm<float, CPUContext>(
       CblasNoTrans,
diff --git a/caffe2/operators/softmax_with_loss_op.cc b/caffe2/operators/softmax_with_loss_op.cc
index 32cb2cec3c9ab..e2ea869528b96 100644
--- a/caffe2/operators/softmax_with_loss_op.cc
+++ b/caffe2/operators/softmax_with_loss_op.cc
@@ -169,7 +169,7 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
         D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
 
-  float* Pdata = P->mutable_data<float>();
+  float* Pdata = P->template mutable_data<float>();
   const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
 
   if (label_prob_mode_) {
@@ -253,7 +253,7 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
   }
 
   avg_loss->Resize(vector<TIndex>());
-  float* avg_loss_data = avg_loss->mutable_data<float>();
+  float* avg_loss_data = avg_loss->template mutable_data<float>();
   if (weight_sum != 0.0) {
     avg_loss_data[0] = loss_sum * scale_ / weight_sum;
   } else {
@@ -292,12 +292,12 @@ bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
   }
 
   const float* Pdata = P.data<float>();
-  float* dX_data = dX->mutable_data<float>();
+  float* dX_data = dX->template mutable_data<float>();
 
   // Copy softmax probabilities into dX. All but the neuron
   // corresponding to the correct label has gradient equaling e(x_j)
   // which is the probability under softmax.
-  context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
+  context_.CopyFromCPU<float>(P.size(), Pdata, dX_data);
 
   // Compute gradient for the matching labels.
   float total_weight = 0.0f;
diff --git a/caffe2/operators/softmax_with_loss_op.h b/caffe2/operators/softmax_with_loss_op.h
index 27e6db2d76fc5..911fa9e58691b 100644
--- a/caffe2/operators/softmax_with_loss_op.h
+++ b/caffe2/operators/softmax_with_loss_op.h
@@ -32,12 +32,13 @@ class SoftmaxWithLossOp final : public Operator<Context> {
   StorageOrder order_;
   int axis_;
 
-  Tensor<Context> losses_; // Per example loss
-  Tensor<Context> rowmax_; // per example row max
-  Tensor<Context> weights_; // unignored weights
-  Tensor<Context> sum_multiplier_; // Vector of ones for summing via dot prod
-  Tensor<Context> total_weight_ptr_;
-  Tensor<Context> scratch_;
+  Tensor losses_{Context::GetDeviceType()}; // Per example loss
+  Tensor rowmax_{Context::GetDeviceType()}; // per example row max
+  Tensor weights_{Context::GetDeviceType()}; // unignored weights
+  Tensor sum_multiplier_{
+      Context::GetDeviceType()}; // Vector of ones for summing via dot prod
+  Tensor total_weight_ptr_{Context::GetDeviceType()};
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -62,13 +63,13 @@ class SoftmaxWithLossGradientOp final : public Operator<Context> {
  protected:
   float scale_;
   int label_prob_mode_;
-  Tensor<Context> sum_multiplier_;
-  Tensor<Context> weights_; // unignored weights
-  Tensor<Context> total_weight_ptr_;
+  Tensor sum_multiplier_{Context::GetDeviceType()};
+  Tensor weights_{Context::GetDeviceType()}; // unignored weights
+  Tensor total_weight_ptr_{Context::GetDeviceType()};
   StorageOrder order_;
   bool only_loss_;
   int axis_;
-  Tensor<Context> scratch_;
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/softplus_op.cc b/caffe2/operators/softplus_op.cc
index 7d2efd578560a..bba4f461553b1 100644
--- a/caffe2/operators/softplus_op.cc
+++ b/caffe2/operators/softplus_op.cc
@@ -11,7 +11,7 @@ bool SoftplusOp<float, CPUContext>::RunOnDevice() {
   auto* Y = Output(0);
   Y->ResizeLike(X);
 
-  EigenVectorMap<float>(Y->mutable_data<float>(), X.size()) =
+  EigenVectorMap<float>(Y->template mutable_data<float>(), X.size()) =
       (ConstEigenVectorMap<float>(X.data<float>(), X.size()).array().exp() +
        1.0f)
           .log();
@@ -28,7 +28,7 @@ bool SoftplusGradientOp<float, CPUContext>::RunOnDevice() {
 
   const float* Ydata = Y.data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   EigenVectorArrayMap<float> dXvec(dXdata, dX->size());
   ConstEigenVectorArrayMap<float> Yvec(Ydata, Y.size());
   ConstEigenVectorArrayMap<float> dYvec(dYdata, dY.size());
diff --git a/caffe2/operators/softplus_op.cu b/caffe2/operators/softplus_op.cu
index 7e542f5a9b7c8..3aefb03a5850f 100644
--- a/caffe2/operators/softplus_op.cu
+++ b/caffe2/operators/softplus_op.cu
@@ -31,7 +31,7 @@ bool SoftplusOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->mutable_data<float>());
+      X.size(), X.data<float>(), Y->template mutable_data<float>());
   return true;
 }
 
@@ -48,7 +48,10 @@ bool SoftplusGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      Y.size(), Y.data<float>(), dY.data<float>(), dX->mutable_data<float>());
+      Y.size(),
+      Y.data<float>(),
+      dY.data<float>(),
+      dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/space_batch_op.h b/caffe2/operators/space_batch_op.h
index 4f42dc0e94838..4690b503c52a8 100644
--- a/caffe2/operators/space_batch_op.h
+++ b/caffe2/operators/space_batch_op.h
@@ -10,11 +10,11 @@ namespace caffe2 {
 
 template <typename Context>
 void spaceToBatch(
-    const Tensor<Context>& input,
+    const Tensor& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor<Context>* output,
+    Tensor* output,
     Context* /*context*/) {
   CAFFE_ENFORCE(input.ndim() == 4);
   CAFFE_ENFORCE(output->ndim() == 4);
@@ -60,11 +60,11 @@ void spaceToBatch(
 
 template <typename Context>
 void batchToSpace(
-    const Tensor<Context>& input,
+    const Tensor& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor<Context>* output,
+    Tensor* output,
     Context* /*context*/) {
   CAFFE_ENFORCE(input.ndim() == 4);
   CAFFE_ENFORCE(output->ndim() == 4);
diff --git a/caffe2/operators/space_batch_op_gpu.cu b/caffe2/operators/space_batch_op_gpu.cu
index 862440907fcf8..e9018c8a0e74e 100644
--- a/caffe2/operators/space_batch_op_gpu.cu
+++ b/caffe2/operators/space_batch_op_gpu.cu
@@ -48,13 +48,13 @@ __global__ void SpaceToBatch(
   }
 }
 
-template<>
+template <>
 void spaceToBatch<CUDAContext>(
-    const Tensor<CUDAContext>& input,
+    const Tensor& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor<CUDAContext>* output,
+    Tensor* output,
     CUDAContext* context) {
   const int output_batch = output->dim32(0);
   const int output_depth = output->dim32(1);
@@ -84,7 +84,7 @@ void spaceToBatch<CUDAContext>(
       pad_t,
       block_size,
       input.data<float>(),
-      output->mutable_data<float>());
+      output->template mutable_data<float>());
 }
 
 
@@ -133,11 +133,11 @@ __global__ void BatchToSpace(
 
 template <>
 void batchToSpace(
-    const Tensor<CUDAContext>& input,
+    const Tensor& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor<CUDAContext>* output,
+    Tensor* output,
     CUDAContext* context) {
   CAFFE_ENFORCE(input.ndim() == 4);
   CAFFE_ENFORCE(output->ndim() == 4);
@@ -170,7 +170,7 @@ void batchToSpace(
       pad_t,
       block_size,
       input.data<float>(),
-      output->mutable_data<float>());
+      output->template mutable_data<float>());
 }
 
 REGISTER_CUDA_OPERATOR(SpaceToBatch, SpaceToBatchOp<CUDAContext>);
diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h
index 5ea10d17c3f29..9e2da09fe65a6 100644
--- a/caffe2/operators/sparse_to_dense_mask_op.h
+++ b/caffe2/operators/sparse_to_dense_mask_op.h
@@ -104,7 +104,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase<Context> {
     int32_t sparse_indices_length = sparse_indices.dim32(0);
     const int32_t* lengths_vec = nullptr;
     auto* output = Output(OUTPUTVALUE);
-    Tensor<Context>* presence_mask = nullptr;
+    Tensor* presence_mask = nullptr;
     if (returnPresenceMask_) {
       presence_mask = Output(PRESENCEMASK);
     }
@@ -135,7 +135,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase<Context> {
     char* output_data =
         static_cast<char*>(output->raw_mutable_data(sparse_values.meta()));
     for (int i = 0; i < cols * rows; i++) {
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           default_value.meta(),
           block_size,
           default_val,
@@ -162,7 +162,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase<Context> {
         }
         int idx = this->getFeatureIdx(sparse_index);
         if (idx != -1) {
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               sparse_values.meta(),
               block_size,
               sparse_values_vec + (offset + c) * block_nbytes,
@@ -266,7 +266,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase<Context> {
         int idx = this->getFeatureIdx(sparse_indices_vec[offset + c]);
         if (idx != -1 && !gradient_used[idx]) {
           gradient_used[idx] = true;
-          context_.template CopyItems<Context, Context>(
+          context_.CopyItemsSameDevice(
               gradient_output.meta(),
               block_size,
               gradient_output_vec + (r * cols + idx) * block_nbytes,
diff --git a/caffe2/operators/sparse_to_dense_op.h b/caffe2/operators/sparse_to_dense_op.h
index 7fbfa38518c9f..6a9f2fddb9943 100644
--- a/caffe2/operators/sparse_to_dense_op.h
+++ b/caffe2/operators/sparse_to_dense_op.h
@@ -110,9 +110,9 @@ class SparseToDenseOp final : public Operator<Context> {
 
  private:
   int output_first_dim_;
-  Tensor<Context> scratch_;
-  Tensor<CPUContext> max_element_host_;
-  Tensor<Context> max_element_;
+  Tensor scratch_{Context::GetDeviceType()};
+  Tensor max_element_host_{CPU};
+  Tensor max_element_{Context::GetDeviceType()};
 
   INPUT_TAGS(INDICES, VALUES, DATA_TO_INFER_DIM);
 };
diff --git a/caffe2/operators/spatial_batch_norm_gradient_op.cc b/caffe2/operators/spatial_batch_norm_gradient_op.cc
index dd5434db725a7..5a9d55341f27e 100644
--- a/caffe2/operators/spatial_batch_norm_gradient_op.cc
+++ b/caffe2/operators/spatial_batch_norm_gradient_op.cc
@@ -48,8 +48,10 @@ bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
   // dX = (1. / N) * scale * inv_var * (N * dY - np.sum(dY, axis=0) - (X - mean)
   //   * inv_var * inv_var * np.sum(dY * (X - mean), axis=0))
 
-  EigenVectorArrayMap<float> dBias_arr(dBias->mutable_data<float>(), C);
-  EigenVectorArrayMap<float> dScale_arr(dScale->mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dBias_arr(
+      dBias->template mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dScale_arr(
+      dScale->template mutable_data<float>(), C);
 
   if (num_batches_ == 1) {
     dBias_arr.setZero();
@@ -63,7 +65,7 @@ bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
       ConstEigenArrayMap<float> X_arr(X.data<float>(), sample_size, N * C);
       ConstEigenArrayMap<float> dY_arr(dY.data<float>(), sample_size, N * C);
       EigenArrayMap<float> dX_arr(
-          dX->mutable_data<float>(), sample_size, N * C);
+          dX->template mutable_data<float>(), sample_size, N * C);
       dX_arr.setZero();
       if (N == 0) {
         return true;
@@ -94,7 +96,7 @@ bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
       ConstEigenArrayMap<float> X_arr(X.data<float>(), C, N * sample_size);
       ConstEigenArrayMap<float> dY_arr(dY.data<float>(), C, N * sample_size);
       EigenArrayMap<float> dX_arr(
-          dX->mutable_data<float>(), C, N * sample_size);
+          dX->template mutable_data<float>(), C, N * sample_size);
       dX_arr.setZero();
       if (N == 0) {
         return true;
diff --git a/caffe2/operators/spatial_batch_norm_op.cc b/caffe2/operators/spatial_batch_norm_op.cc
index 671493a1df010..09f2b04fd9f25 100644
--- a/caffe2/operators/spatial_batch_norm_op.cc
+++ b/caffe2/operators/spatial_batch_norm_op.cc
@@ -45,7 +45,7 @@ bool SpatialBNOp<CPUContext>::RunOnDevice() {
     Output(SAVED_MEAN)->Resize(C);
     Output(SAVED_INV_VAR)->Resize(C);
     EigenVectorArrayMap<float> mean(
-        Output(SAVED_MEAN)->mutable_data<float>(), C);
+        Output(SAVED_MEAN)->template mutable_data<float>(), C);
     EigenVectorArrayMap<float> var(
         Output(SAVED_INV_VAR)->mutable_data<float>(), C);
     if (N > 0) {
@@ -131,7 +131,7 @@ bool SpatialBNOp<CPUContext>::RunOnDevice() {
     inv_std = (var_arr + epsilon_).sqrt().inverse();
   } else {
     EigenVectorArrayMap<float> saved_inv_std(
-        Output(SAVED_INV_VAR)->mutable_data<float>(), C);
+        Output(SAVED_INV_VAR)->template mutable_data<float>(), C);
     saved_inv_std = (saved_inv_std + epsilon_).inverse().sqrt();
     inv_std = saved_inv_std;
   }
diff --git a/caffe2/operators/spatial_softmax_with_loss_op.cc b/caffe2/operators/spatial_softmax_with_loss_op.cc
index 1288bc7d90554..02779fa598bf6 100644
--- a/caffe2/operators/spatial_softmax_with_loss_op.cc
+++ b/caffe2/operators/spatial_softmax_with_loss_op.cc
@@ -14,33 +14,33 @@ REGISTER_CPU_OPERATOR(
 OPERATOR_SCHEMA(SpatialSoftmaxWithLoss)
     .NumInputs(2, 3)
     .NumOutputs(2)
-    .TensorInferenceFunction(
-        [](const OperatorDef& def, const vector<TensorShape>& in) {
-          ArgumentHelper helper(def);
-          vector<TensorShape> out(2);
+    .TensorInferenceFunction([](const OperatorDef& def,
+                                const vector<TensorShape>& in) {
+      ArgumentHelper helper(def);
+      vector<TensorShape> out(2);
 
-          auto logits = in[0]; // Tensor with Shape [batch_size, num_classes]
-          auto labels = in[1]; // Tensor with shape [batch_size, ]
-          auto batch_size = logits.dims().Get(0);
-          auto num_classes = logits.dims().Get(1);
+      auto logits = in[0]; // Tensor with Shape [batch_size, num_classes]
+      auto labels = in[1]; // Tensor with shape [batch_size, ]
+      auto batch_size = logits.dims().Get(0);
+      auto num_classes = logits.dims().Get(1);
 
-          CAFFE_ENFORCE_EQ(logits.dims_size(), 4);
-          CAFFE_ENFORCE_EQ(labels.dims_size(), 3);
-          out[0].set_data_type(logits.data_type());
-          out[0].add_dims(batch_size);
-          out[0].add_dims(num_classes);
-          out[0].add_dims(in[0].dims(2));
-          out[0].add_dims(in[0].dims(3));
-          // Output 2 is scalar shape, so no dims added
-          return out;
-        })
+      CAFFE_ENFORCE_EQ(logits.dims_size(), 4);
+      CAFFE_ENFORCE_EQ(labels.dims_size(), 3);
+      out[0].set_data_type(logits.data_type());
+      out[0].add_dims(batch_size);
+      out[0].add_dims(num_classes);
+      out[0].add_dims(in[0].dims(2));
+      out[0].add_dims(in[0].dims(3));
+      // Output 2 is scalar shape, so no dims added
+      return out;
+    })
     .SetDoc(R"DOC(
 Combined Spatial Softmax and Cross-Entropy loss operator.
 Similar to SoftmaxWithLoss, this operator computes the spatial softmax
 normalized values for each layer in the batch of the given input, after which
 cross-entropy loss is computed. This operator is numerically more stable than
 separate Softmax and CrossEntropy ops. The inputs are a 2-D tensor
-(Tensor<float>) of size (batch_size x input_feature_dimensions) and tensor of
+(Tensor) of size (batch_size x input_feature_dimensions) and tensor of
 labels (ground truth).
 Output is tensor with the probability for each label in a pixel for each example
 (N x D x W x H) and averaged loss (scalar).
@@ -78,7 +78,7 @@ bool SpatialSoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
         D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
 
-  float* Pdata = P->mutable_data<float>();
+  float* Pdata = P->template mutable_data<float>();
   const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
   CAFFE_ENFORCE_EQ(X.ndim(), 4);
   CAFFE_ENFORCE_EQ(T.ndim(), 3);
@@ -120,7 +120,7 @@ bool SpatialSoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
 
   // Compute the avg cross-entropy loss
   avg_loss->Resize(vector<TIndex>());
-  float* avg_loss_data = avg_loss->mutable_data<float>();
+  float* avg_loss_data = avg_loss->template mutable_data<float>();
   const int* label_data = T.data<int>();
 
   float sum_label_xent = 0.0f;
@@ -175,13 +175,13 @@ bool SpatialSoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
   int W = X.dim32(3);
 
   const float* Pdata = P.data<float>();
-  float* dX_data = dX->mutable_data<float>();
+  float* dX_data = dX->template mutable_data<float>();
   const int* label_data = T.data<int>();
 
   // Copy softmax probabilities into dX. All but the neuron
   // corresponding to the correct label has gradient equaling e(x_j)
   // which is the probability under softmax.
-  context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
+  context_.CopyFromCPU<float>(P.size(), Pdata, dX_data);
 
   float total_weight = 0.0f;
   for (int y = 0; y < H; ++y) {
@@ -228,7 +228,7 @@ bool SpatialSoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
       dX->size(),
       d_avg_loss.data<float>(),
       dX->data<float>(),
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
       &context_);
   return true;
 }
diff --git a/caffe2/operators/spatial_softmax_with_loss_op.h b/caffe2/operators/spatial_softmax_with_loss_op.h
index d466063d45c1f..0c1d69087e681 100644
--- a/caffe2/operators/spatial_softmax_with_loss_op.h
+++ b/caffe2/operators/spatial_softmax_with_loss_op.h
@@ -28,12 +28,13 @@ class SpatialSoftmaxWithLossOp final : public Operator<Context> {
   float scale_;
   StorageOrder order_;
 
-  Tensor<Context> losses_; // Per example loss
-  Tensor<Context> rowmax_; // per example row max
-  Tensor<Context> weights_; // unignored weights
-  Tensor<Context> sum_multiplier_; // Vector of ones for summing via dot prod
-  Tensor<Context> total_weight_ptr_;
-  Tensor<Context> scratch_;
+  Tensor losses_{Context::GetDeviceType()}; // Per example loss
+  Tensor rowmax_{Context::GetDeviceType()}; // per example row max
+  Tensor weights_{Context::GetDeviceType()}; // unignored weights
+  Tensor sum_multiplier_{
+      Context::GetDeviceType()}; // Vector of ones for summing via dot prod
+  Tensor total_weight_ptr_{Context::GetDeviceType()};
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -55,12 +56,12 @@ class SpatialSoftmaxWithLossGradientOp final : public Operator<Context> {
 
  protected:
   float scale_;
-  Tensor<Context> sum_multiplier_;
-  Tensor<Context> weights_; // unignored weights
-  Tensor<Context> total_weight_ptr_;
+  Tensor sum_multiplier_{Context::GetDeviceType()};
+  Tensor weights_{Context::GetDeviceType()}; // unignored weights
+  Tensor total_weight_ptr_{Context::GetDeviceType()};
   StorageOrder order_;
   bool only_loss_;
-  Tensor<Context> scratch_;
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/stats_ops.cc b/caffe2/operators/stats_ops.cc
index 64a0c1a888800..508dd1ae82060 100644
--- a/caffe2/operators/stats_ops.cc
+++ b/caffe2/operators/stats_ops.cc
@@ -35,9 +35,9 @@ class StatRegistryExportOp : public Operator<CPUContext> {
     keys->Resize(data.size());
     values->Resize(data.size());
     timestamps->Resize(data.size());
-    auto* pkeys = keys->mutable_data<std::string>();
-    auto* pvals = values->mutable_data<int64_t>();
-    auto* ptimestamps = timestamps->mutable_data<int64_t>();
+    auto* pkeys = keys->template mutable_data<std::string>();
+    auto* pvals = values->template mutable_data<int64_t>();
+    auto* ptimestamps = timestamps->template mutable_data<int64_t>();
     int i = 0;
     for (const auto& stat : data) {
       pkeys[i] = std::move(stat.key);
@@ -153,7 +153,7 @@ struct TimerGetAndEndOp : public Operator<CPUContext> {
   bool RunOnDevice() override {
     int64_t nanos = OperatorBase::Input<TimerInstance*>(0)->get_ns();
     OperatorBase::Input<TimerInstance*>(0)->end();
-    auto* res = OperatorBase::Output<TensorCPU>(0);
+    auto* res = Output(0);
     res->Resize(1);
     res->template mutable_data<int64_t>()[0] = nanos;
     return true;
@@ -166,7 +166,7 @@ struct TimerGetOp : public Operator<CPUContext> {
 
   bool RunOnDevice() override {
     int64_t nanos = OperatorBase::Input<TimerInstance*>(0)->get_ns();
-    auto* res = OperatorBase::Output<TensorCPU>(0);
+    auto* res = Output(0);
     res->Resize();
     res->template mutable_data<int64_t>()[0] = nanos;
     return true;
diff --git a/caffe2/operators/string_ops.cc b/caffe2/operators/string_ops.cc
index 819bb6a6c5b09..672ca24d073e3 100644
--- a/caffe2/operators/string_ops.cc
+++ b/caffe2/operators/string_ops.cc
@@ -15,7 +15,7 @@ bool StringJoinOp<CPUContext>::DoRunWithType() {
   int rowSize = (input.ndim() == 2) ? input.dim(1) : 1;
   if (this->axis_ == 0) {
     output->Resize(input.dim(0));
-    auto* outputData = output->mutable_data<std::string>();
+    auto* outputData = output->template mutable_data<std::string>();
 
     int offset = 0;
     for (int i = 0; i < input.dim(0); ++i) {
@@ -29,7 +29,7 @@ bool StringJoinOp<CPUContext>::DoRunWithType() {
     }
   } else if (this->axis_ == 1) {
     output->Resize(input.dim(1));
-    auto* outputData = output->mutable_data<std::string>();
+    auto* outputData = output->template mutable_data<std::string>();
 
     for (int j = 0; j < input.dim(1); ++j) {
       std::stringstream stream;
diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc
index 3d6fb4720ddb2..ece70ffd2425e 100644
--- a/caffe2/operators/string_ops_test.cc
+++ b/caffe2/operators/string_ops_test.cc
@@ -9,7 +9,7 @@ class StringJoinOpTest : public testing::Test {
  public:
   bool runOp(const TensorCPU& input) {
     auto* blob = ws_.CreateBlob("X");
-    auto* tensor = blob->GetMutable<TensorCPU>();
+    auto* tensor = blob->GetMutableTensor(CPU);
     tensor->ResizeLike(input);
     tensor->ShareData(input);
 
@@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test {
   const std::string* checkAndGetOutput(int outputSize) {
     const auto* output = ws_.GetBlob("Y");
     EXPECT_NE(output, nullptr);
-    EXPECT_TRUE(output->IsType<TensorCPU>());
+    EXPECT_TRUE(output->IsType<Tensor>(CPU));
     const auto& outputTensor = output->Get<TensorCPU>();
     EXPECT_EQ(outputTensor.ndim(), 1);
     EXPECT_EQ(outputTensor.dim(0), outputSize);
@@ -42,9 +42,9 @@ TEST_F(StringJoinOpTest, testString1DJoin) {
   std::vector<std::string> input = {"a", "xx", "c"};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(input.size());
-  auto* data = tensor->mutable_data<std::string>();
+  auto* data = tensor->template mutable_data<std::string>();
   for (int i = 0; i < input.size(); ++i) {
     *data++ = input[i];
   }
@@ -62,9 +62,9 @@ TEST_F(StringJoinOpTest, testString2DJoin) {
                                                  {"dd", "ee", "ff"}};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(input.size(), input[0].size());
-  auto* data = tensor->mutable_data<std::string>();
+  auto* data = tensor->template mutable_data<std::string>();
   for (int i = 0; i < input.size(); ++i) {
     for (int j = 0; j < input[0].size(); ++j) {
       *data++ = input[i][j];
@@ -82,9 +82,9 @@ TEST_F(StringJoinOpTest, testFloat1DJoin) {
   std::vector<float> input = {3.90f, 5.234f, 8.12f};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(input.size());
-  auto* data = tensor->mutable_data<float>();
+  auto* data = tensor->template mutable_data<float>();
   for (int i = 0; i < input.size(); ++i) {
     *data++ = input[i];
   }
@@ -102,9 +102,9 @@ TEST_F(StringJoinOpTest, testFloat2DJoin) {
                                            {4.67f, 5.90f, 6.32f}};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(input.size(), input[0].size());
-  auto* data = tensor->mutable_data<float>();
+  auto* data = tensor->template mutable_data<float>();
   for (int i = 0; i < input.size(); ++i) {
     for (int j = 0; j < input[0].size(); ++j) {
       *data++ = input[i][j];
@@ -122,9 +122,9 @@ TEST_F(StringJoinOpTest, testLong2DJoin) {
   std::vector<std::vector<int64_t>> input = {{100, 200}, {1000, 2000}};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(input.size(), input[0].size());
-  auto* data = tensor->mutable_data<int64_t>();
+  auto* data = tensor->template mutable_data<int64_t>();
   for (int i = 0; i < input.size(); ++i) {
     for (int j = 0; j < input[0].size(); ++j) {
       *data++ = input[i][j];
diff --git a/caffe2/operators/stump_func_op.cu b/caffe2/operators/stump_func_op.cu
index 2ea3108e73ad3..9e38da2bcebd0 100644
--- a/caffe2/operators/stump_func_op.cu
+++ b/caffe2/operators/stump_func_op.cu
@@ -42,7 +42,7 @@ bool StumpFuncOp<float, float, CUDAContext>::RunOnDevice() {
   const float* in_data = in.data<float>();
   auto* out = Output(0);
   out->ResizeLike(in);
-  float* out_data = out->mutable_data<float>();
+  float* out_data = out->template mutable_data<float>();
   StumpFuncKernel<<<CAFFE_GET_BLOCKS(in.size()), CAFFE_CUDA_NUM_THREADS,
     0, context_.cuda_stream()>>>(
       in.size(), threshold_, low_value_, high_value_, in_data, out_data);
diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc
index ca4a762587bfc..8f1e0895a2859 100644
--- a/caffe2/operators/stylizer_ops.cc
+++ b/caffe2/operators/stylizer_ops.cc
@@ -82,10 +82,10 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
     auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
         "noise_size", 491 /* prime to avoid artifacts */);
 
-    if (!noiseBlob->IsType<TensorCPU>()) {
+    if (!noiseBlob->IsType<Tensor>(CPU)) {
       // Initialize random noise on first use.
       // Cache it to maintain temporal consistency.
-      auto* t = noiseBlob->template GetMutable<TensorCPU>();
+      auto* t = noiseBlob->GetMutableTensor(CPU);
 
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
       // Noise space is larger for vectorized code due to the
@@ -115,13 +115,13 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
         X.data<uint8_t>(),
         mean.data<float>(),
         noise.data<float>(),
-        Y->mutable_data<float>());
+        Y->template mutable_data<float>());
 
     return true;
   }
 
 #if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
-  void initNoiseCPU(Tensor<CPUContext>* noise, int size) {
+  void initNoiseCPU(Tensor* noise, int size) {
     noise->Resize(size);
 
     math::RandGaussian<float, CPUContext>(
@@ -134,7 +134,7 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
 #endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)
 
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
-  void initNoiseCPUNeon(Tensor<CPUContext>* noise, int size) {
+  void initNoiseCPUNeon(Tensor* noise, int size) {
     // For ARM NEON, we read in multiples of kNeonNoiseReadSize since
     // the inner loop is vectorized. Round up to the next highest
     // multiple of kNeonNoiseReadSize
@@ -429,7 +429,7 @@ class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
         W,
         X.data<float>(),
         mean.data<float>(),
-        Y->mutable_data<uint8_t>());
+        Y->template mutable_data<uint8_t>());
 
     return true;
   }
diff --git a/caffe2/operators/summarize_op.cc b/caffe2/operators/summarize_op.cc
index 6ae7ed0bfc43d..5ba3b4447bcca 100644
--- a/caffe2/operators/summarize_op.cc
+++ b/caffe2/operators/summarize_op.cc
@@ -33,7 +33,7 @@ bool SummarizeOp<float, CPUContext>::RunOnDevice() {
   if (OutputSize()) {
     auto* Y = Output(0);
     Y->Resize(NUM_STATS);
-    float* Ydata = Y->mutable_data<float>();
+    float* Ydata = Y->template mutable_data<float>();
     Ydata[MIN_IDX] = min;
     Ydata[MAX_IDX] = max;
     Ydata[MEAN_IDX] = static_cast<float>(mean);
@@ -50,7 +50,7 @@ OPERATOR_SCHEMA(Summarize)
     .NumInputs(1)
     .NumOutputs(0, 1)
     .SetDoc(R"DOC(
-Summarize computes four statistics of the input tensor (Tensor<float>)- min,
+Summarize computes four statistics of the input tensor (Tensor)- min,
 max, mean and standard deviation. The output will be written to a 1-D tensor of
 size 4 if an output tensor is provided. Else, if the argument 'to_file' is
 greater than 0, the values are written to a log file in the root folder.
@@ -59,11 +59,11 @@ greater than 0, the values are written to a log file in the root folder.
         "to_file",
         "(int, default 0) flag to indicate if the summarized "
         "statistics have to be written to a log file.")
-    .Input(0, "data", "The input data as Tensor<float>.")
+    .Input(0, "data", "The input data as Tensor.")
     .Output(
         0,
         "output",
-        "1-D tensor (Tensor<float>) of size 4 containing min, "
+        "1-D tensor (Tensor) of size 4 containing min, "
         "max, mean and standard deviation");
 
 SHOULD_NOT_DO_GRADIENT(Summarize);
diff --git a/caffe2/operators/summarize_op.cu b/caffe2/operators/summarize_op.cu
index 89dd4c01003df..13c1a1b8793e1 100644
--- a/caffe2/operators/summarize_op.cu
+++ b/caffe2/operators/summarize_op.cu
@@ -96,12 +96,12 @@ bool SummarizeOp<float, CUDAContext>::RunOnDevice() {
                  << standard_deviation << std::endl;
   }
   if (OutputSize()) {
-    auto* Y = OperatorBase::Output<TensorCUDA>(0);
+    auto* Y = Output(0);
     Y->Resize(4);
     float output_buffer[NUM_STATS] = {result.min, result.max, result.mean,
                                standard_deviation};
-    context_.Copy<float, CPUContext, CUDAContext>(
-        NUM_STATS, output_buffer, Y->mutable_data<float>());
+    context_.CopyFromCPU<float>(
+        NUM_STATS, output_buffer, Y->template mutable_data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/swish_op.cc b/caffe2/operators/swish_op.cc
index a636d23d85f7c..f68b86c3f0d85 100644
--- a/caffe2/operators/swish_op.cc
+++ b/caffe2/operators/swish_op.cc
@@ -58,8 +58,8 @@ OPERATOR_SCHEMA(Swish)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .SetDoc(R"DOC(
-Swish takes one input data (Tensor<T>) and produces one output data
-(Tensor<T>) where the swish function, y = x / (1 + exp(-x)), is applied to the
+Swish takes one input data (Tensor) and produces one output data
+(Tensor) where the swish function, y = x / (1 + exp(-x)), is applied to the
 tensor elementwise.
 )DOC")
     .Input(0, "X", "1D input tensor")
diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h
index e9e55b8fa7876..f8ff48588f30d 100644
--- a/caffe2/operators/tensor_protos_db_input.h
+++ b/caffe2/operators/tensor_protos_db_input.h
@@ -43,7 +43,7 @@ TensorProtosDBInput<Context>::TensorProtosDBInput(
 template <class Context>
 bool TensorProtosDBInput<Context>::Prefetch() {
   const db::DBReader& reader = OperatorBase::Input<db::DBReader>(0);
-  TensorDeserializer<CPUContext> deserializer;
+  TensorDeserializer deserializer;
   if (batch_size_ == 0) {
     // We do not need to construct a batch. As a result, we will simply
     // deserialize everything into the target prefetched blob.
@@ -56,11 +56,13 @@ bool TensorProtosDBInput<Context>::Prefetch() {
         protos.mutable_protos(i)->clear_device_detail();
       }
       deserializer.Deserialize(
-          protos.protos(i),
-          prefetched_blobs_[i].template GetMutable<TensorCPU>());
+          protos.protos(i), prefetched_blobs_[i].GetMutableTensor(CPU));
     }
   } else {
-    vector<TensorCPU> temp_tensors(OutputSize());
+    vector<Tensor> temp_tensors;
+    for (int i = 0; i < OutputSize(); ++i) {
+      temp_tensors.emplace_back(CPU);
+    }
     for (int item_id = 0; item_id < batch_size_; ++item_id) {
       reader.Read(&key_, &value_);
       TensorProtos protos;
@@ -72,18 +74,18 @@ bool TensorProtosDBInput<Context>::Prefetch() {
           vector<int> dims(
               protos.protos(i).dims().begin(), protos.protos(i).dims().end());
           dims.insert(dims.begin(), batch_size_);
-          prefetched_blobs_[i].template GetMutable<TensorCPU>()->Resize(dims);
+          prefetched_blobs_[i].GetMutableTensor(CPU)->Resize(dims);
         }
       }
       for (int i = 0; i < protos.protos_size(); ++i) {
-        TensorCPU* dst = prefetched_blobs_[i].template GetMutable<TensorCPU>();
+        TensorCPU* dst = prefetched_blobs_[i].GetMutableTensor(CPU);
         TensorCPU& src = temp_tensors[i];
         if (protos.protos(i).has_device_detail()) {
           protos.mutable_protos(i)->clear_device_detail();
         }
         deserializer.Deserialize(protos.protos(i), &src);
         DCHECK_EQ(src.size() * batch_size_, dst->size());
-        this->context_.template CopyItems<CPUContext, CPUContext>(
+        this->context_.CopyItemsSameDevice(
             src.meta(),
             src.size(),
             src.raw_data(),
@@ -98,8 +100,9 @@ bool TensorProtosDBInput<Context>::Prefetch() {
 template <class Context>
 bool TensorProtosDBInput<Context>::CopyPrefetched() {
   for (int i = 0; i < OutputSize(); ++i) {
-    OperatorBase::Output<Tensor<Context>>(i)->CopyFrom(
-        prefetched_blobs_[i].template Get<TensorCPU>(), &this->context_);
+    OperatorBase::template Output<Tensor>(i, Context::GetDeviceType())
+        ->CopyFrom(
+            prefetched_blobs_[i].template Get<TensorCPU>(), &this->context_);
   }
   return true;
 }
diff --git a/caffe2/operators/thresholded_relu_op.cc b/caffe2/operators/thresholded_relu_op.cc
index 8b5e6b514478c..bba4d496e65db 100644
--- a/caffe2/operators/thresholded_relu_op.cc
+++ b/caffe2/operators/thresholded_relu_op.cc
@@ -12,11 +12,11 @@ bool ThresholdedReluOp<float, CPUContext>::RunOnDevice() {
   Y->ResizeLike(X);
 
   ConstEigenVectorArrayMap<float> Xvec(X.data<float>(), X.size());
-  EigenVectorArrayMap<float> Yvec(Y->mutable_data<float>(), Y->size());
+  EigenVectorArrayMap<float> Yvec(Y->template mutable_data<float>(), Y->size());
   Yvec = (Xvec > alpha_).select(Xvec, 0.f);
   /* Naive implementation
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
   for (int i = 0; i < X.size(); ++i) {
     Xdata[i] -= alpha_;
     Ydata[i] = std::max(Xdata[i], 0.0f);
@@ -35,7 +35,7 @@ bool ThresholdedReluGradientOp<float, CPUContext>::RunOnDevice() {
 
   const float* Ydata = Y.data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
+  float* dXdata = dX->template mutable_data<float>();
   EigenVectorArrayMap<float> dXvec(dXdata, dX->size());
   ConstEigenVectorArrayMap<float> Yvec(Ydata, Y.size());
   ConstEigenVectorArrayMap<float> dYvec(dYdata, dY.size());
diff --git a/caffe2/operators/thresholded_relu_op.cu b/caffe2/operators/thresholded_relu_op.cu
index a12ee62d42b28..5a5027c7faed0 100644
--- a/caffe2/operators/thresholded_relu_op.cu
+++ b/caffe2/operators/thresholded_relu_op.cu
@@ -30,7 +30,7 @@ bool ThresholdedReluOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->mutable_data<float>(), alpha_);
+      X.size(), X.data<float>(), Y->template mutable_data<float>(), alpha_);
   return true;
 }
 
@@ -47,7 +47,10 @@ bool ThresholdedReluGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      Y.size(), Y.data<float>(), dY.data<float>(), dX->mutable_data<float>());
+      Y.size(),
+      Y.data<float>(),
+      dY.data<float>(),
+      dX->template mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/tile_op.h b/caffe2/operators/tile_op.h
index 046aaa55f7a94..bde0b41ebe8b1 100644
--- a/caffe2/operators/tile_op.h
+++ b/caffe2/operators/tile_op.h
@@ -34,7 +34,7 @@ class TileOp : public Operator<Context> {
           "Input `tiles` should be a vector of size 1.");
 
       const auto& input1 = Input(1);
-      context_.template CopyItems<Context, CPUContext>(
+      context_.CopyItemsToCPU(
           input1.meta(),
           1,
           static_cast<const char*>(input1.raw_data()),
@@ -46,7 +46,7 @@ class TileOp : public Operator<Context> {
             "Input `axis` should be a vector of size 1.");
 
         const auto& input2 = Input(2);
-        context_.template CopyItems<Context, CPUContext>(
+        context_.CopyItemsToCPU(
             input2.meta(),
             1,
             static_cast<const char*>(input2.raw_data()),
@@ -114,8 +114,7 @@ class TileOp : public Operator<Context> {
       char* output_data) {
     for (auto i = 0; i < outer_dim; ++i) {
       for (auto t = 0; t < tiles_; ++t) {
-        context_.template CopyItems<Context, Context>(
-            meta, inner_dim, input_data, output_data);
+        context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data);
         output_data += inner_dim * item_size;
       }
       input_data += inner_dim * item_size;
@@ -149,7 +148,7 @@ class TileGradientOp : public Operator<Context> {
           "Input `tiles` should be a vector of size 1.");
 
       const auto& input1 = Input(1);
-      context_.template CopyItems<Context, CPUContext>(
+      context_.CopyItemsToCPU(
           input1.meta(),
           1,
           static_cast<const char*>(input1.raw_data()),
@@ -161,7 +160,7 @@ class TileGradientOp : public Operator<Context> {
             "Input `axis` should be a vector of size 1.");
 
         const auto& input2 = Input(2);
-        context_.template CopyItems<Context, CPUContext>(
+        context_.CopyItemsToCPU(
             input2.meta(),
             1,
             static_cast<const char*>(input2.raw_data()),
@@ -231,8 +230,7 @@ class TileGradientOp : public Operator<Context> {
       const char* input_data,
       char* output_data) {
     for (auto i = 0; i < outer_dim; ++i) {
-      context_.template CopyItems<Context, Context>(
-          meta, inner_dim, input_data, output_data);
+      context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data);
       input_data += inner_dim * item_size;
       for (auto t = 1; t < tiles_; ++t) {
         math::Axpy<T, Context>(
diff --git a/caffe2/operators/top_k.cu b/caffe2/operators/top_k.cu
index ddcb7c65d52a1..6562b7fa5030f 100644
--- a/caffe2/operators/top_k.cu
+++ b/caffe2/operators/top_k.cu
@@ -166,18 +166,18 @@ class TopKCudaOp : public Operator<Context> {
   int axis_;
 
   // Buffers for CUDAContext.
-  Tensor<Context> input_transposed_buffer_;
-  Tensor<Context> values_transposed_buffer_;
-  Tensor<Context> indices_transposed_buffer_;
+  Tensor input_transposed_buffer_{CUDA};
+  Tensor values_transposed_buffer_{CUDA};
+  Tensor indices_transposed_buffer_{CUDA};
 
   // Shape tensors on device for CUDAContext.
-  Tensor<Context> input_dims_device_;
-  Tensor<Context> input_transposed_dims_device_;
-  Tensor<Context> input_axes_device_;
+  Tensor input_dims_device_{CUDA};
+  Tensor input_transposed_dims_device_{CUDA};
+  Tensor input_axes_device_{CUDA};
 
-  Tensor<Context> output_dims_device_;
-  Tensor<Context> output_transposed_dims_device_;
-  Tensor<Context> output_transposed_axes_device_;
+  Tensor output_dims_device_{CUDA};
+  Tensor output_transposed_dims_device_{CUDA};
+  Tensor output_transposed_axes_device_{CUDA};
 };
 
 template <typename T, typename Context>
diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h
index 13196bf3761b7..6586b014689de 100644
--- a/caffe2/operators/tt_linear_op.h
+++ b/caffe2/operators/tt_linear_op.h
@@ -52,7 +52,7 @@ class TTLinearOp final : public Operator<Context> {
     int cores_idx = 0;
 
     // Temporary buffer to facilitate multiplication of TT-cores with input
-    auto Y_buf = Y_temp_->GetMutable<Tensor<Context>>();
+    auto Y_buf = Y_temp_->GetMutableTensor(Context::GetDeviceType());
     Y_buf->ResizeLike(X);
     Y_buf->CopyFrom(X);
 
@@ -104,7 +104,7 @@ class TTLinearOp final : public Operator<Context> {
 
       // Resize operation
       Y_buf->Resize(Y->dim32(0), Y->dim32(1));
-      context_.template Copy<float, CPUContext, CPUContext>(
+      context_.template CopyFromCPU<float>(
           Y->size(),
           Y->template data<float>(),
           Y_buf->template mutable_data<float>());
@@ -160,7 +160,7 @@ class TTLinearOp final : public Operator<Context> {
   }
 
  protected:
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
   std::vector<int> inp_sizes_;
   std::vector<int> out_sizes_;
   std::vector<int> tt_ranks_;
@@ -181,7 +181,7 @@ class TTLinearGradientOp : public Operator<Context> {
   }
 
  protected:
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/unique_ops.cu b/caffe2/operators/unique_ops.cu
index 992488f0c3dd1..90252bf401209 100644
--- a/caffe2/operators/unique_ops.cu
+++ b/caffe2/operators/unique_ops.cu
@@ -73,8 +73,7 @@ bool UniqueOp<CUDAContext>::DoRunWithType() {
   const T* input = inputTensor.template data<T>();
   thrust_unique_buffer_.Resize(N);
   auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
-  context_.template CopyItems<CUDAContext, CUDAContext>(
-      inputTensor.meta(), N, input, buffer);
+  context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);
 
   // Create two vectors of {0, 1, ..., N-1} on CUDA device
   thrust::device_vector<int> order1(N), order2(N);
@@ -115,8 +114,7 @@ bool UniqueOp<CUDAContext>::DoRunWithType() {
 
   uniqueTensor->Resize(K);
   T* unique = uniqueTensor->template mutable_data<T>();
-  context_.template CopyItems<CUDAContext, CUDAContext>(
-      thrust_unique_buffer_.meta(), K, buffer, unique);
+  context_.CopyItemsSameDevice(thrust_unique_buffer_.meta(), K, buffer, unique);
 
   // Compute the remapping. For example, for the number 1, if we look at
   // order2[0] and order2[1], we know that input2[0:2) are all 1. They are all
diff --git a/caffe2/operators/unique_ops.h b/caffe2/operators/unique_ops.h
index d8af029f16e9c..5def615fbfb42 100644
--- a/caffe2/operators/unique_ops.h
+++ b/caffe2/operators/unique_ops.h
@@ -47,9 +47,9 @@ class UniqueOp : public Operator<Context> {
 
  private:
   vector<int> order_;
-  Tensor<Context> thrust_unique_buffer_;
-  Tensor<Context> cuda_order_buffer_;
-  Tensor<Context> second_order_buffer_;
+  Tensor thrust_unique_buffer_{Context::GetDeviceType()};
+  Tensor cuda_order_buffer_{Context::GetDeviceType()};
+  Tensor second_order_buffer_{Context::GetDeviceType()};
 
  public:
   OUTPUT_TAGS(UNIQUE, REMAPPING);
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 1abf2130953a7..b20ac903999e9 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -1276,7 +1276,7 @@ template <typename T>
 bool RangeOp<CPUContext>::DoRunOnDevice(
     const T& start,
     const T& step,
-    Tensor<CPUContext>* output) {
+    Tensor* output) {
   auto* output_data = output->template mutable_data<T>();
   for (int i = 0; i < output->size(); ++i) {
     output_data[i] = i * step + start;
diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu
index a340734e690b4..c97f3a72e50d8 100644
--- a/caffe2/operators/utility_ops.cu
+++ b/caffe2/operators/utility_ops.cu
@@ -46,7 +46,7 @@ class CopyOnDeviceLikeOp<CUDAContext, CUDAContext, CUDAContext>
 
   bool RunOnDevice() override {
     auto& input = Input(0);
-    auto* output = OperatorBase::Output<Tensor<CUDAContext>>(0);
+    auto* output = OperatorBase::Output<Tensor>(0, CUDA);
     CUDAContext context(GetGPUIDForPointer(Input(1).raw_data()));
     output->ResizeLike(input);
     context.template CopyItems<CUDAContext, CUDAContext>(
@@ -143,7 +143,7 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
               << std::endl;
 
     for (int j = 0; j < InputSize(); j++) {
-      TensorCPU cpu_X;
+      Tensor cpu_X(CPU);
       cpu_X.ResizeLike(Input(j));
       // Hack to cause allocaiton happen here, so it won't happen
       // when we do CopyFrom. We need the mutex then because host->gpu
@@ -192,7 +192,7 @@ ElwiseMaxKernel(const float* X, const float* Y, float* maxout, const int N) {
 
 template <>
 bool MaxOp<float, CUDAContext>::Compute() {
-  float* output_data = Output(0)->mutable_data<float>();
+  float* output_data = Output(0)->template mutable_data<float>();
   const int N = Input(0).size();
 
   // Run pairwise-maxes
@@ -223,7 +223,7 @@ ElwiseMinKernel(const float* X, const float* Y, float* minout, const int N) {
 
 template <>
 bool MinOp<float, CUDAContext>::Compute() {
-  float* output_data = Output(0)->mutable_data<float>();
+  float* output_data = Output(0)->template mutable_data<float>();
   const int N = Input(0).size();
 
   // Run pairwise-mines
@@ -274,7 +274,7 @@ bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
         output.data<float>(),
         input.data<float>(),
         grad_output.data<float>(),
-        grad_input->mutable_data<float>());
+        grad_input->template mutable_data<float>());
   }
   return true;
 }
@@ -299,7 +299,7 @@ __global__ void GatherKernel(
 template <>
 bool GatherOp<CUDAContext>::RunOnDevice() {
   return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<TensorCUDA>(INDICES));
+      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
 }
 
 template <>
@@ -501,13 +501,14 @@ template <typename T>
 bool RangeOp<CUDAContext>::DoRunOnDevice(
     const T& start,
     const T& step,
-    Tensor<CUDAContext>* output) {
+    Tensor* output) {
   int N = output->size();
   RangeKernel<<<
       CAFFE_GET_BLOCKS(N),
       CAFFE_CUDA_NUM_THREADS,
       0,
-      context_.cuda_stream()>>>(N, output->mutable_data<T>(), start, step);
+      context_.cuda_stream()>>>(
+      N, output->template mutable_data<T>(), start, step);
   return true;
 }
 
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index a0eb0f3c531f0..ce9a1c0c70279 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -26,7 +26,7 @@ class NanCheckOp final : public Operator<Context> {
 
  private:
   TensorPrinter tensorPrinter_;
-  Tensor<Context> scratch_;
+  Tensor scratch_{Context::GetDeviceType()};
 };
 
 struct GetNanCheckGradient : public GradientMakerBase {
@@ -54,7 +54,7 @@ class WallClockTimeOp final : public Operator<Context> {
             std::chrono::high_resolution_clock::now().time_since_epoch())
             .count());
 
-    TensorCPU* output = OperatorBase::Output<TensorCPU>(0);
+    TensorCPU* output = Output(0);
     output->Resize();
     *output->template mutable_data<int64_t>() = nanoseconds;
 
@@ -90,8 +90,8 @@ class PrintOp final : public Operator<Context> {
       return true;
     }
 
-    if (!OperatorBase::InputIsType<Tensor<Context>>(0) &&
-        !OperatorBase::InputIsType<TensorCPU>(0)) {
+    if (!OperatorBase::InputIsType<Tensor>(0, Context::GetDeviceType()) &&
+        !OperatorBase::InputIsType<Tensor>(0, CPU)) {
       LOG(INFO) << "Blob of type: "
                 << OperatorBase::Inputs().at(0)->meta().name();
       return true;
@@ -112,9 +112,9 @@ class PrintOp final : public Operator<Context> {
         unsigned char,
         std::string>;
 
-    if (OperatorBase::InputIsType<TensorCPU>(0)) {
+    if (OperatorBase::InputIsType<Tensor>(0, CPU)) {
       return DispatchHelper<Types>::call(
-          this, OperatorBase::Input<TensorCPU>(0));
+          this, OperatorBase::Input<Tensor>(0, CPU));
     } else {
       return DispatchHelper<Types>::call(this, Input(0));
     }
@@ -127,9 +127,9 @@ class PrintOp final : public Operator<Context> {
     // pointing to the right instantiation. Note that tensor_copy_if_needed
     // will handle memory deallocation itself so no smart pointer is needed.
     const TensorCPU* tensor;
-    TensorCPU tensor_copy_if_needed;
-    if (OperatorBase::InputIsType<TensorCPU>(0)) {
-      tensor = &OperatorBase::Input<TensorCPU>(0);
+    Tensor tensor_copy_if_needed(CPU);
+    if (OperatorBase::InputIsType<Tensor>(0, CPU)) {
+      tensor = &OperatorBase::Input<Tensor>(0, CPU);
     } else {
       tensor_copy_if_needed.CopyFrom(Input(0), &context_);
       // Make sure that the copy is finished.
@@ -215,7 +215,7 @@ class FlattenToVecOp : public Operator<Context> {
         input.dims().size(), 1, "The rank of the tensor must be >= 1.");
     output->Resize(input.size());
 
-    context_.template CopyItems<Context, Context>(
+    context_.CopyItemsSameDevice(
         input.meta(),
         input.size(),
         input.raw_data(),
@@ -237,7 +237,7 @@ class ResizeLikeOp : public Operator<Context> {
     auto* output = Output(0);
     CAFFE_ENFORCE_EQ(input0.size(), input1.size());
     output->ResizeLike(Input(1));
-    context_.template CopyItems<Context, Context>(
+    context_.CopyItemsSameDevice(
         input0.meta(),
         input0.size(),
         input0.raw_data(),
@@ -532,10 +532,10 @@ class ScatterWeightedSumOp : public Operator<Context> {
     }
     return true;
   }
-  Tensor<CPUContext> x_data_host_;
-  Tensor<CPUContext> weights_host_;
-  Tensor<Context> x_data_device_;
-  Tensor<Context> weights_device_;
+  Tensor x_data_host_{CPU};
+  Tensor weights_host_{CPU};
+  Tensor x_data_device_{Context::GetDeviceType()};
+  Tensor weights_device_{Context::GetDeviceType()};
 };
 
 /**
@@ -663,7 +663,7 @@ class ScatterAssignOp : public Operator<Context> {
       // double-checking the indices, but it's fine as it's DCHECK only
       DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
                                   << ", range 0 to " << N;
-      context_.template Copy<T, Context, Context>(
+      context_.template CopySameDevice<T>(
           block_size, slicesData + block_size * i, data + block_size * idx);
     }
   }
@@ -678,8 +678,8 @@ class CopyOp : public Operator<Context> {
   USE_SIMPLE_CTOR_DTOR(CopyOp);
 
   bool RunOnDevice() override {
-    auto& input = OperatorBase::Input<Tensor<SrcContext>>(0);
-    auto* output = OperatorBase::Output<Tensor<DstContext>>(0);
+    auto& input = OperatorBase::Input<Tensor>(0, SrcContext::GetDeviceType());
+    auto* output = OperatorBase::Output<Tensor>(0, DstContext::GetDeviceType());
     output->ResizeLike(input);
     this->context_.template CopyItems<SrcContext, DstContext>(
         input.meta(),
@@ -943,7 +943,7 @@ class HasElementsOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& input = Input(0);
-    auto* output = OperatorBase::Output<TensorCPU>(0);
+    auto* output = Output(0);
     output->Resize(std::vector<TIndex>{});
     *output->template mutable_data<bool>() = input.size() > 0;
     return true;
@@ -958,7 +958,7 @@ class IsEmptyOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& input = Input(0);
-    auto* output = OperatorBase::Output<TensorCPU>(0);
+    auto* output = Output(0);
     output->Resize(std::vector<TIndex>{});
     *output->template mutable_data<bool>() = (input.size() == 0);
     return true;
@@ -1026,7 +1026,7 @@ class GatherOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(INDICES));
+        this, OperatorBase::Input<Tensor>(INDICES, CPU));
   }
 
   template <typename Index>
@@ -1059,7 +1059,7 @@ class GatherOp : public Operator<Context> {
           " data_dim=",
           data.dim(0));
       auto src = src_base + idx * block_bytesize;
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           data.meta(), block_size, src, out + block_bytesize * i);
     }
     return true;
@@ -1076,7 +1076,7 @@ class GatherRangesOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(RANGES));
+        this, OperatorBase::Input<Tensor>(RANGES, CPU));
   }
 
   template <typename Index>
@@ -1123,7 +1123,7 @@ class GatherRangesOp : public Operator<Context> {
       auto rangeSizeBytes = rangeLength * itemsize;
       CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize);
       CAFFE_ENFORCE(rangeStart + rangeLength <= data.size());
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           data.meta(),
           rangeLength,
           rawData + rangeStart * itemsize,
@@ -1155,7 +1155,7 @@ class LengthsGatherOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(INDICES));
+        this, OperatorBase::Input<Tensor>(INDICES, CPU));
   }
 
   template <typename Index>
@@ -1202,7 +1202,7 @@ class LengthsGatherOp : public Operator<Context> {
     for (size_t i = 0; i < indices.size(); ++i) {
       auto idx = indices_data[i];
       auto length = lengths_data[idx];
-      context_.template CopyItems<Context, Context>(
+      context_.CopyItemsSameDevice(
           items.meta(),
           length * block_size,
           src_base + offsets_[idx] * block_bytesize,
@@ -1252,7 +1252,7 @@ class UnsafeCoalesceOp final : public Operator<Context> {
     size_t coalesced_offset = 0;
     for (auto i = 0; i < InputSize(); ++i) {
       const auto input_nbytes = Input(i).nbytes();
-      context_.template CopyBytes<Context, Context>(
+      context_.CopyBytesSameDevice(
           input_nbytes,
           (const uint8_t*)Input(i).raw_data(),
           coalesced->template mutable_data<uint8_t>() + coalesced_offset);
@@ -1353,7 +1353,7 @@ class RangeOp : public Operator<Context> {
     if (std::is_same<Context, TensorCPU>::value) {
       return Input(index).template data<T>()[0];
     } else {
-      local_.template CopyFrom<Context>(Input(index));
+      local_.CopyFrom(Input(index));
       return local_.template data<T>()[0];
     }
   }
@@ -1409,11 +1409,11 @@ class RangeOp : public Operator<Context> {
   }
 
   template <typename T>
-  bool DoRunOnDevice(const T& start, const T& step, Tensor<Context>* output);
+  bool DoRunOnDevice(const T& start, const T& step, Tensor* output);
 
  private:
   // local CPU tensor for copying constants.
-  TensorCPU local_;
+  Tensor local_{CPU};
 };
 
 class ThrowExceptionOp : public Operator<CPUContext> {
diff --git a/caffe2/operators/utility_ops_gpu_test.cc b/caffe2/operators/utility_ops_gpu_test.cc
index 4b9b10eafbc82..fb8baba549768 100644
--- a/caffe2/operators/utility_ops_gpu_test.cc
+++ b/caffe2/operators/utility_ops_gpu_test.cc
@@ -19,10 +19,10 @@ static void AddConstInput(
   option.set_device_type(CUDA);
   CUDAContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
+  auto* tensor = blob->GetMutableTensor(CUDA);
   tensor->Resize(shape);
   math::Set<float, CUDAContext>(
-      tensor->size(), value, tensor->mutable_data<float>(), &context);
+      tensor->size(), value, tensor->template mutable_data<float>(), &context);
   return;
 }
 
@@ -43,7 +43,7 @@ TEST(UtilityOpGPUTest, testReshapeWithScalar) {
   unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
   EXPECT_TRUE(op->Run());
   Blob* XNew = ws.GetBlob("XNew");
-  const Tensor<CUDAContext>& XNewTensor = XNew->Get<Tensor<CUDAContext>>();
+  const Tensor& XNewTensor = XNew->Get<Tensor>();
   EXPECT_EQ(1, XNewTensor.ndim());
   EXPECT_EQ(1, XNewTensor.size());
 }
diff --git a/caffe2/operators/utility_ops_test.cc b/caffe2/operators/utility_ops_test.cc
index 74705173fa3de..7b4bcb3144f3e 100644
--- a/caffe2/operators/utility_ops_test.cc
+++ b/caffe2/operators/utility_ops_test.cc
@@ -16,10 +16,10 @@ static void AddConstInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
   math::Set<float, CPUContext>(
-      tensor->size(), value, tensor->mutable_data<float>(), &context);
+      tensor->size(), value, tensor->template mutable_data<float>(), &context);
   return;
 }
 
@@ -37,7 +37,7 @@ TEST(UtilityOpTest, testReshapeWithScalar) {
   unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
   EXPECT_TRUE(op->Run());
   Blob* XNew = ws.GetBlob("XNew");
-  const TensorCPU& XNewTensor = XNew->Get<Tensor<CPUContext>>();
+  const TensorCPU& XNewTensor = XNew->Get<Tensor>();
   EXPECT_EQ(1, XNewTensor.ndim());
   EXPECT_EQ(1, XNewTensor.size());
 }
diff --git a/caffe2/operators/weighted_multi_sampling_op.cc b/caffe2/operators/weighted_multi_sampling_op.cc
index 6f565c6ec1219..57d50ed34b53d 100644
--- a/caffe2/operators/weighted_multi_sampling_op.cc
+++ b/caffe2/operators/weighted_multi_sampling_op.cc
@@ -84,14 +84,14 @@ OPERATOR_SCHEMA(WeightedMultiSampling)
     .SetDoc(R"DOC(
 The operator performs sampling based on the input sampling weights.
 All weights are cummulative probability thus sorted. The output is
-a 1-D tensor (Tensor<int>). If two inputs are given, the second input
+a 1-D tensor (Tensor). If two inputs are given, the second input
 is used to provide shape of the output sample tensor. Otherwise, we use
 argument `num_samples` to determine the number of samples to generate.
 )DOC")
     .Input(
         0,
         "sampling_cdf",
-        "An optional 1-D Tensor<float>."
+        "An optional 1-D Tensor."
         "Input cumulative sampling probability (such as [0.2, 0.5, 0.8, 1.5])."
         " All weights must be non-negative numbers. Note that the last value of"
         " CDF is not necessary 1. If the last value is not 1, all values in"
@@ -105,7 +105,7 @@ argument `num_samples` to determine the number of samples to generate.
         "sampled_indexes",
         "The output tensor contains indices sampled from distribution given"
         "by the weight vector in the input tensor"
-        "The output is a 1-D Tensor<int> of size determined by argument"
+        "The output is a 1-D Tensor of size determined by argument"
         "`num_samples` or the second input tensor.")
     .Arg("num_samples", "number of samples to sample from the input data");
 
diff --git a/caffe2/operators/weighted_sample_op.cc b/caffe2/operators/weighted_sample_op.cc
index 2ffd35f6e6419..c8b278b30f8c9 100644
--- a/caffe2/operators/weighted_sample_op.cc
+++ b/caffe2/operators/weighted_sample_op.cc
@@ -85,33 +85,33 @@ OPERATOR_SCHEMA(WeightedSample)
     .SetDoc(R"DOC(
 The operator performs sampling based on the input sampling weights for
 each batch. All weights must be non-negative numbers.
-The input is a 2-D tensor (Tensor<float>) of size (batch_size x weights_dim).
+The input is a 2-D tensor (Tensor) of size (batch_size x weights_dim).
 For each batch, an index is randomly sampled from the distribution given by
 the weights of the corresponding batch.
-The output is a 1-D tensor (Tensor<int>) of size (batch_size x 1) and
+The output is a 1-D tensor (Tensor) of size (batch_size x 1) and
 contains the index(es) of the sampled output.
 )DOC")
     .Input(
         0,
         "sampling_weights",
-        "A 2-D Tensor<float> of size (batch_size x weights_dim)."
+        "A 2-D Tensor of size (batch_size x weights_dim)."
         "All weights must be non-negative numbers.")
     .Input(
         1,
         "sampling_values",
-        "An optional 2-D Tensor<float> of size (batch_size x weights_dim)."
+        "An optional 2-D Tensor of size (batch_size x weights_dim)."
         "Its values correspond to the sampling weights.")
     .Output(
         0,
         "sampled_indexes",
         "The output tensor contains index(es) sampled from distribution given"
         "by the weight vector(s) in the input tensor"
-        "The output is a 1-D Tensor<int> of size (batch_size x 1)")
+        "The output is a 1-D Tensor of size (batch_size x 1)")
     .Output(
         1,
         "sampled_values",
         "The output tensor contains value(s) selected by the sampled index(es)"
-        "It is a 1-D Tensor<float> of size (batch_size x 1)");
+        "It is a 1-D Tensor of size (batch_size x 1)");
 
 SHOULD_NOT_DO_GRADIENT(WeightedSample);
 } // namespace caffe2
diff --git a/caffe2/operators/weighted_sample_op.cu b/caffe2/operators/weighted_sample_op.cu
index fa247c61256a3..ba44868aa3b46 100644
--- a/caffe2/operators/weighted_sample_op.cu
+++ b/caffe2/operators/weighted_sample_op.cu
@@ -58,7 +58,7 @@ bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
 
     const float* in_weights_data = in_weights.data<float>();
     const float* in_val_data = nullptr;
-    int* out_idx_data = out_idx->mutable_data<int>();
+    int* out_idx_data = out_idx->template mutable_data<int>();
     float* out_val_data = nullptr;
 
     if (OutputSize() == 2) {
@@ -71,7 +71,7 @@ bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
 
       auto* out_val = Output(1);
       out_val->Resize(batch_size, 1);
-      out_val_data = out_val->mutable_data<float>();
+      out_val_data = out_val->template mutable_data<float>();
     }
 
     float* unif_samples_data = unif_samples_.mutable_data<float>();
@@ -92,11 +92,11 @@ bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
         out_val_data);
   } else {
     out_idx->Resize(0);
-    out_idx->mutable_data<int>();
+    out_idx->template mutable_data<int>();
     if (OutputSize() == 2) {
       auto* out_val = Output(1);
       out_val->Resize(0);
-      out_val->mutable_data<float>();
+      out_val->template mutable_data<float>();
     }
   }
 
diff --git a/caffe2/operators/weighted_sample_op.h b/caffe2/operators/weighted_sample_op.h
index e870511f2c28a..ac5a7cdd57699 100644
--- a/caffe2/operators/weighted_sample_op.h
+++ b/caffe2/operators/weighted_sample_op.h
@@ -22,7 +22,7 @@ class WeightedSampleOp final : public Operator<Context> {
 
  private:
   vector<float> cum_mass_;
-  Tensor<Context> unif_samples_;
+  Tensor unif_samples_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/while_op.h b/caffe2/operators/while_op.h
index dff6f1190be6c..258862b690e4a 100644
--- a/caffe2/operators/while_op.h
+++ b/caffe2/operators/while_op.h
@@ -35,7 +35,7 @@ class WhileOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     CAFFE_ENFORCE(
-        this->template InputIsType<Tensor<Context>>(0),
+        this->template InputIsType<Tensor>(0, Context::GetDeviceType()),
         "Invalid condition in While operator: tensor expected");
 
     const auto& condition = Input(0);
diff --git a/caffe2/operators/workspace_ops.cc b/caffe2/operators/workspace_ops.cc
index d9775aa3a7752..24655af325591 100644
--- a/caffe2/operators/workspace_ops.cc
+++ b/caffe2/operators/workspace_ops.cc
@@ -15,7 +15,8 @@ class GetAllBlobNamesOp final : public Operator<CPUContext> {
     auto* out = Output(0);
     const auto& blobs = include_shared_ ? ws_->Blobs() : ws_->LocalBlobs();
     out->Resize(blobs.size());
-    std::copy(blobs.begin(), blobs.end(), out->mutable_data<std::string>());
+    std::copy(
+        blobs.begin(), blobs.end(), out->template mutable_data<std::string>());
     return true;
   }
 
diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc
index dcab984f0244c..8a1b736399562 100644
--- a/caffe2/opt/fusion.cc
+++ b/caffe2/opt/fusion.cc
@@ -40,10 +40,10 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) {
       continue;
     }
 
-#define EXPOSE_TENSOR_DATA(name, index, inputs)                              \
-  auto name = repr::nn::get<repr::Tensor>(inputs[index]);                    \
-  assert(ws->HasBlob(name->getName()) && "Blob not in workspace");           \
-  auto name##Tensor = ws->GetBlob(name->getName())->GetMutable<TensorCPU>(); \
+#define EXPOSE_TENSOR_DATA(name, index, inputs)                            \
+  auto name = repr::nn::get<repr::Tensor>(inputs[index]);                  \
+  assert(ws->HasBlob(name->getName()) && "Blob not in workspace");         \
+  auto name##Tensor = ws->GetBlob(name->getName())->GetMutableTensor(CPU); \
   auto name##Data = name##Tensor->mutable_data<float>();
 
     EXPOSE_TENSOR_DATA(filter, 1, convInputs);
diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc
index c1c6c310786c5..75baec0e9be66 100644
--- a/caffe2/opt/onnxifi_transformer.cc
+++ b/caffe2/opt/onnxifi_transformer.cc
@@ -175,11 +175,11 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp(
 
       // Feed into workspace as CPU Tensors
       auto* blob = ws->CreateBlob(t.name());
-      auto* cpu_tensor = blob->GetMutable<TensorCPU>();
+      auto* cpu_tensor = blob->GetMutableTensor(CPU);
       std::vector<TIndex> dims;
       std::copy(t.dims().begin(), t.dims().end(), dims.begin());
       cpu_tensor->Resize(dims);
-      context.template CopyBytes<CPUContext, CPUContext>(
+      context.CopyBytesSameDevice(
           cpu_tensor->size() * sizeof(float),
           static_cast<const void*>(t.raw_data().data()),
           cpu_tensor->raw_mutable_data(TypeMeta::Make<float>()));
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index 70bc635193f19..3412e5c306fba 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -62,7 +62,7 @@ CAFFE_DEFINE_TYPED_REGISTRY(
     BlobFeederBase,
     std::unique_ptr);
 
-REGISTER_BLOB_FETCHER((TypeMeta::Id<TensorCPU>()), TensorFetcher<CPUContext>);
+REGISTER_BLOB_FETCHER((TypeMeta::Id<Tensor>()), TensorFetcher);
 REGISTER_BLOB_FEEDER(CPU, TensorFeeder<CPUContext>);
 
 Workspace* GetCurrentWorkspace() {
@@ -326,7 +326,7 @@ void addObjectMethods(py::module& m) {
           })
       .def(
           "tensor",
-          [](Blob* blob) { return py::cast(blob->GetMutable<TensorCPU>()); },
+          [](Blob* blob) { return py::cast(blob->GetMutableTensor(CPU)); },
           py::return_value_policy::reference_internal)
       .def(
           "_feed",
@@ -403,7 +403,7 @@ void addObjectMethods(py::module& m) {
               // keep this behavior for backward compatibility
               t->mutable_data<float>();
             }
-            auto res = TensorFetcher<CPUContext>().FetchTensor(*t, false);
+            auto res = TensorFetcher().FetchTensor(*t, false);
             return res.obj;
           },
           "Return numpy array pointing to this tensor's data if possible. "
@@ -422,17 +422,17 @@ void addObjectMethods(py::module& m) {
       .def(
           "fetch",
           [](TensorCPU* t) {
-            auto res = TensorFetcher<CPUContext>().FetchTensor(*t, true);
+            auto res = TensorFetcher().FetchTensor(*t, true);
             return res.obj;
           },
           "Copy data from this tensor into a new numpy array.")
       .def(
           "init",
-          [](TensorCPU* t, std::vector<TIndex> dims, int caffe_type) {
+          [](Tensor* t, std::vector<TIndex> dims, int caffe_type) {
             const auto& meta =
                 DataTypeToTypeMeta((TensorProto::DataType)caffe_type);
             CAFFE_ENFORCE(
-                !TensorFetcher<CPUContext>().NeedsCopy(meta),
+                !TensorFetcher().NeedsCopy(t, meta),
                 "Cannot init tensor of this type. Use `feed` instead.");
             t->Resize(dims);
             t->raw_mutable_data(meta);
@@ -725,14 +725,15 @@ void addObjectMethods(py::module& m) {
             for (const auto pair : inputs) {
               const auto& name = pair.first;
               const auto& input = pair.second;
+              tensors_data.emplace(name, Tensor(CPU));
               CAFFE_ENFORCE(
                   PyArray_Check(input.ptr()),
                   "Input must be of type numpy array.");
               PyArrayObject* array =
                   reinterpret_cast<PyArrayObject*>(input.ptr());
               TensorFeeder<CPUContext>().FeedTensor(
-                  DeviceOption(), array, &tensors_data[name]);
-              tensors.insert(std::make_pair(name, &tensors_data[name]));
+                  DeviceOption(), array, &tensors_data.at(name));
+              tensors.insert(std::make_pair(name, &tensors_data.at(name)));
             }
 
 
@@ -740,8 +741,7 @@ void addObjectMethods(py::module& m) {
             instance.RunMap(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(
-                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
+              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
             }
             return pyout;
           })
@@ -750,7 +750,10 @@ void addObjectMethods(py::module& m) {
           [](caffe2::onnx::Caffe2BackendRep& instance,
              std::vector<py::object> inputs) -> std::vector<py::object> {
             Predictor::TensorVector tensors;
-            std::vector<TensorCPU> tensors_data(inputs.size());
+            std::vector<TensorCPU> tensors_data;
+            for (auto i = 0; i < inputs.size(); ++i) {
+              tensors_data.emplace_back(caffe2::CPU);
+            }
             for (auto i = 0; i < inputs.size(); ++i) {
               auto input = inputs[i];
               CAFFE_ENFORCE(
@@ -766,8 +769,7 @@ void addObjectMethods(py::module& m) {
             instance.Run(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(
-                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
+              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
             }
             return pyout;
           });
@@ -847,7 +849,10 @@ void addObjectMethods(py::module& m) {
           [](Predictor& instance,
              std::vector<py::object> inputs) -> std::vector<py::object> {
             Predictor::TensorVector tensors;
-            std::vector<TensorCPU> tensors_data(inputs.size());
+            std::vector<Tensor> tensors_data;
+            for (auto i = 0; i < inputs.size(); ++i) {
+              tensors_data.emplace_back(CPU);
+            }
             for (auto i = 0; i < inputs.size(); ++i) {
               auto input = inputs[i];
               CAFFE_ENFORCE(
@@ -863,8 +868,7 @@ void addObjectMethods(py::module& m) {
             instance.run(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(
-                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
+              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
             }
             return pyout;
           })
@@ -877,21 +881,21 @@ void addObjectMethods(py::module& m) {
             for (const auto pair : inputs) {
               const auto& name = pair.first;
               const auto& input = pair.second;
+              tensors_data.emplace(name, Tensor(CPU));
               CAFFE_ENFORCE(
                   PyArray_Check(input.ptr()),
                   "Input must be of type numpy array.");
               PyArrayObject* array =
                   reinterpret_cast<PyArrayObject*>(input.ptr());
               TensorFeeder<CPUContext>().FeedTensor(
-                  DeviceOption(), array, &tensors_data[name]);
-              tensors.insert(std::make_pair(name, &tensors_data[name]));
+                  DeviceOption(), array, &tensors_data.at(name));
+              tensors.insert(std::make_pair(name, &tensors_data.at(name)));
             }
             std::vector<TensorCPU*> out;
             instance.run_map(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(
-                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
+              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
             }
             return pyout;
           });
diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h
index f46972a05561c..894c420afa94b 100644
--- a/caffe2/python/pybind_state.h
+++ b/caffe2/python/pybind_state.h
@@ -90,19 +90,20 @@ static_assert(
 int CaffeToNumpyType(const TypeMeta& meta);
 const TypeMeta& NumpyTypeToCaffe(int numpy_type);
 
-template <class Context>
 class TensorFetcher : public BlobFetcherBase {
  public:
   pybind11::object Fetch(const Blob& blob) override {
-    return FetchTensor(blob.Get<Tensor<Context>>(), true).obj;
+    return FetchTensor(blob.Get<Tensor>(), true).obj;
   }
 
-  bool NeedsCopy(const TypeMeta& meta) const {
-    return !std::is_same<Context, CPUContext>::value ||
+  // Checks whether the data with type `meta` needs to be copied in the context
+  // of `tensor`
+  bool NeedsCopy(const Tensor* tensor, const TypeMeta& meta) const {
+    return tensor->GetStaticContext() != GetCPUStaticContext() ||
         CaffeToNumpyType(meta) == NPY_OBJECT;
   }
 
-  FetchedBlob FetchTensor(const Tensor<Context>& tensor, bool force_copy) {
+  FetchedBlob FetchTensor(const Tensor& tensor, bool force_copy) {
     FetchedBlob result;
     CAFFE_ENFORCE_GE(tensor.size(), 0, "Trying to fetch unitilized tensor");
     const int numpy_type = CaffeToNumpyType(tensor.meta());
@@ -115,7 +116,7 @@ class TensorFetcher : public BlobFetcherBase {
     for (const auto dim : tensor.dims()) {
       npy_dims.push_back(dim);
     }
-    result.copied = force_copy || NeedsCopy(tensor.meta());
+    result.copied = force_copy || NeedsCopy(&tensor, tensor.meta());
     void* outPtr;
     if (result.copied) {
       result.obj = py::reinterpret_steal<py::object>(
@@ -123,7 +124,7 @@ class TensorFetcher : public BlobFetcherBase {
       outPtr = static_cast<void*>(
           PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.obj.ptr())));
     } else {
-      outPtr = const_cast<Tensor<Context>&>(tensor).raw_mutable_data();
+      outPtr = const_cast<Tensor&>(tensor).raw_mutable_data();
       result.obj = py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData(
           tensor.ndim(), npy_dims.data(), numpy_type, outPtr));
     }
@@ -146,10 +147,9 @@ class TensorFetcher : public BlobFetcherBase {
     }
 
     if (result.copied) {
-      Context context;
-      context.template CopyBytes<Context, CPUContext>(
-          tensor.nbytes(), tensor.raw_data(), outPtr);
-      context.FinishDeviceComputation();
+      auto context = tensor.GetStaticContext()->CreateContext();
+      context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr);
+      context->FinishDeviceComputation();
     }
     return result;
   }
@@ -161,7 +161,7 @@ class TensorFeeder : public BlobFeederBase {
   void FeedTensor(
       const DeviceOption& option,
       PyArrayObject* original_array,
-      Tensor<Context>* tensor) {
+      Tensor* tensor) {
     PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
     auto g = MakeGuard([&]() { Py_XDECREF(array); });
 
@@ -220,7 +220,7 @@ class TensorFeeder : public BlobFeederBase {
             "instead of unicode strings.");
         break;
       default:
-        context.template CopyBytes<CPUContext, Context>(
+        context.CopyBytesFromCPU(
             tensor->size() * meta.itemsize(),
             static_cast<void*>(PyArray_DATA(array)),
             tensor->raw_mutable_data(meta));
@@ -230,7 +230,10 @@ class TensorFeeder : public BlobFeederBase {
 
   virtual void
   Feed(const DeviceOption& option, PyArrayObject* original_array, Blob* blob) {
-    FeedTensor(option, original_array, blob->GetMutable<Tensor<Context>>());
+    FeedTensor(
+        option,
+        original_array,
+        blob->GetMutableTensor(Context::GetDeviceType()));
   }
 };
 
@@ -316,29 +319,26 @@ class PythonOpBase : public Operator<Context> {
         const auto* blob = &InputBlob(i);
         // Allow CPU tensors in addition to operator context's tensors
         py::object py_obj;
-        if (blob->template IsType<Tensor<CPUContext>>()) {
+        if (blob->template IsType<Tensor>()) {
           if (use_dlpack) {
             DLPackWrapper<CPUContext> wrapper(
-                const_cast<Tensor<CPUContext>*>(
-                    &blob->template Get<Tensor<CPUContext>>()),
-                cpu_option);
+                const_cast<Tensor*>(&blob->template Get<Tensor>()), cpu_option);
             // copy wrapper
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                &blob->template Get<Tensor<CPUContext>>(),
+                &blob->template Get<Tensor>(),
                 py::return_value_policy::reference);
           }
         } else {
           if (use_dlpack) {
             DLPackWrapper<Context> wrapper(
-                const_cast<Tensor<Context>*>(
-                    &blob->template Get<Tensor<Context>>()),
+                const_cast<Tensor*>(&blob->template Get<Tensor>()),
                 this->device_option());
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                &blob->template Get<Tensor<Context>>(),
+                &blob->template Get<Tensor>(),
                 py::return_value_policy::reference);
           }
         }
@@ -365,31 +365,31 @@ class PythonOpBase : public Operator<Context> {
 
         // make sure output blob is initialized before creating the binding
         if (forced_cpu_outputs_.count(i)) {
-          blob->template GetMutable<Tensor<CPUContext>>();
+          blob->GetMutableTensor(Context::GetDeviceType());
         } else {
-          blob->template GetMutable<Tensor<Context>>();
+          blob->GetMutableTensor(Context::GetDeviceType());
         }
 
         py::object py_obj;
-        if (blob->template IsType<Tensor<CPUContext>>()) {
+        if (blob->template IsType<Tensor>()) {
           if (use_dlpack) {
             DLPackWrapper<CPUContext> wrapper(
-                blob->template GetMutable<Tensor<CPUContext>>(), cpu_option);
+                blob->GetMutableTensor(Context::GetDeviceType()), cpu_option);
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                blob->template GetMutable<Tensor<CPUContext>>(),
+                blob->GetMutableTensor(Context::GetDeviceType()),
                 py::return_value_policy::reference);
           }
         } else {
           if (use_dlpack) {
             DLPackWrapper<Context> wrapper(
-                blob->template GetMutable<Tensor<Context>>(),
+                blob->GetMutableTensor(Context::GetDeviceType()),
                 this->device_option());
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                blob->template GetMutable<Tensor<Context>>(),
+                blob->GetMutableTensor(Context::GetDeviceType()),
                 py::return_value_policy::reference);
           }
         }
diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h
index 1ba3f0fa24476..37bf82e90bc30 100644
--- a/caffe2/python/pybind_state_dlpack.h
+++ b/caffe2/python/pybind_state_dlpack.h
@@ -23,7 +23,7 @@ const TypeMeta& DLTypeToCaffe(const DLDataType& dl_type);
 template <class Context>
 class DLPackWrapper {
  public:
-  DLPackWrapper(Tensor<Context>* tensor, DeviceOption device_option)
+  DLPackWrapper(Tensor* tensor, DeviceOption device_option)
       : tensor(tensor), device_option(device_option) {}
 
   py::object data() {
@@ -120,7 +120,7 @@ class DLPackWrapper {
         });
   }
 
-  Tensor<Context>* tensor;
+  Tensor* tensor;
   DeviceOption device_option;
   DLManagedTensor managed_tensor;
 };
diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc
index 9ceec10dbd71c..0a8b10aa54e93 100644
--- a/caffe2/python/pybind_state_gpu.cc
+++ b/caffe2/python/pybind_state_gpu.cc
@@ -31,7 +31,6 @@ REGISTER_CUDA_OPERATOR(
     PythonDLPackGradient,
     PythonGradientOp<CUDAContext, true>);
 
-REGISTER_BLOB_FETCHER((TypeMeta::Id<TensorCUDA>()), TensorFetcher<CUDAContext>);
 REGISTER_BLOB_FEEDER(CUDA, TensorFeeder<CUDAContext>);
 
 namespace py = pybind11;
diff --git a/caffe2/python/pybind_state_hip.cc b/caffe2/python/pybind_state_hip.cc
index b770ea00001e3..bb4b4c715c5a8 100644
--- a/caffe2/python/pybind_state_hip.cc
+++ b/caffe2/python/pybind_state_hip.cc
@@ -20,7 +20,6 @@ REGISTER_HIP_OPERATOR(
 REGISTER_HIP_OPERATOR(PythonDLPack, PythonOp<HIPContext, true>);
 REGISTER_HIP_OPERATOR(PythonDLPackGradient, PythonGradientOp<HIPContext, true>);
 
-REGISTER_BLOB_FETCHER((TypeMeta::Id<TensorHIP>()), TensorFetcher<HIPContext>);
 REGISTER_BLOB_FEEDER(HIP, TensorFeeder<HIPContext>);
 
 namespace py = pybind11;
diff --git a/caffe2/python/pybind_state_int8.cc b/caffe2/python/pybind_state_int8.cc
index 683a4cee503ca..1df33130acb05 100644
--- a/caffe2/python/pybind_state_int8.cc
+++ b/caffe2/python/pybind_state_int8.cc
@@ -45,8 +45,7 @@ class Int8TensorFetcher : public BlobFetcherBase {
     void* ptr = static_cast<void*>(
         PyArray_DATA(reinterpret_cast<PyArrayObject*>(data_array.ptr())));
     CPUContext context;
-    context.template CopyBytes<CPUContext, CPUContext>(
-        src.t.nbytes(), src.t.raw_data(), ptr);
+    context.CopyBytesSameDevice(src.t.nbytes(), src.t.raw_data(), ptr);
     context.FinishDeviceComputation();
 
     auto result = pybind11::cast<pybind11::object>(
diff --git a/caffe2/queue/blobs_queue_db.h b/caffe2/queue/blobs_queue_db.h
index 7d4ac146f05df..317f371ac48cf 100644
--- a/caffe2/queue/blobs_queue_db.h
+++ b/caffe2/queue/blobs_queue_db.h
@@ -16,8 +16,8 @@ namespace {
 const std::string& GetStringFromBlob(Blob* blob) {
   if (blob->template IsType<string>()) {
     return blob->template Get<string>();
-  } else if (blob->template IsType<Tensor<CPUContext>>()) {
-    return *blob->template Get<Tensor<CPUContext>>().template data<string>();
+  } else if (blob->template IsType<Tensor>()) {
+    return *blob->template Get<Tensor>().template data<string>();
   } else {
     CAFFE_THROW("Unsupported Blob type");
   }
diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h
index 4ed6acaa1b69a..8e924176a02b0 100644
--- a/caffe2/queue/queue_ops.h
+++ b/caffe2/queue/queue_ops.h
@@ -146,7 +146,7 @@ class SafeDequeueBlobsOp final : public Operator<Context> {
       }
       for (int col = 0; col < size; ++col) {
         auto* out = this->Output(col);
-        const auto& in = blobPtrs_.at(col)->template Get<Tensor<Context>>();
+        const auto& in = blobPtrs_.at(col)->template Get<Tensor>();
         if (i == 0) {
           out->CopyFrom(in);
         } else {
diff --git a/caffe2/queue/rebatching_queue.cc b/caffe2/queue/rebatching_queue.cc
index 6be252f44a11f..cfb43a99f491b 100644
--- a/caffe2/queue/rebatching_queue.cc
+++ b/caffe2/queue/rebatching_queue.cc
@@ -50,12 +50,12 @@ void concat(
         continue;
       }
 
-      context.CopyItems<CPUContext, CPUContext>(
+      context.CopyItemsToCPU(
           input.meta(),
           input.size(),
           input.raw_data() /* src */,
           destinations[j] /* dst */
-          );
+      );
 
       destinations[j] =
           (char*)destinations[j] + input.size() * input.itemsize();
@@ -84,8 +84,8 @@ std::vector<std::vector<TensorCPU>> split(
     CAFFE_ENFORCE_EQ(input.dims().at(0), outputSize);
 
     for (int i = 0; i < outputSize; ++i) {
-      outputs[i].push_back(TensorCPU(outputDims));
-      context.CopyItems<CPUContext, CPUContext>(
+      outputs[i].push_back(Tensor(outputDims, CPU));
+      context.CopyItemsToCPU(
           input.meta(),
           innerSize,
           (char*)input.raw_data() + i * innerSize * itemSize /* src */,
diff --git a/caffe2/queue/rebatching_queue_ops.h b/caffe2/queue/rebatching_queue_ops.h
index 80749a42692e7..5c9059c05b6eb 100644
--- a/caffe2/queue/rebatching_queue_ops.h
+++ b/caffe2/queue/rebatching_queue_ops.h
@@ -30,7 +30,7 @@ class EnqueueRebatchingQueueOp : public Operator<CPUContext> {
     auto& queue = Inputs()[0]->template Get<RebatchingQueuePtr>();
     CHECK(queue);
     CAFFE_ENFORCE_EQ(InputSize(), queue->numBlobs() + 1);
-    std::vector<const TensorCPU*> inputTensors;
+    std::vector<const Tensor*> inputTensors;
     inputTensors.reserve(InputSize() - 1);
     for (int i = 1; i < InputSize(); ++i) {
       inputTensors.push_back(&Input(i));
@@ -54,7 +54,7 @@ class DequeueRebatchingQueueOp : public Operator<CPUContext> {
     auto& queue = Inputs()[0]->template Get<RebatchingQueuePtr>();
     CHECK(queue);
 
-    std::vector<TensorCPU*> outputTensors;
+    std::vector<Tensor*> outputTensors;
     outputTensors.reserve(OutputSize());
     for (int i = 0; i < OutputSize(); ++i) {
       outputTensors.push_back(Output(i));
diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h
index bb30247ca5f84..c25509b3d00af 100644
--- a/caffe2/sgd/adam_op.h
+++ b/caffe2/sgd/adam_op.h
@@ -88,7 +88,7 @@ class AdamOp final : public Operator<Context> {
         epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5f)) {}
   bool RunOnDevice() override {
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<TensorCPU>(ITER));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(ITER, CPU));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(PARAM).size());
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENT_1).size());
@@ -98,7 +98,7 @@ class AdamOp final : public Operator<Context> {
     Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2));
 
     const auto iter =
-        OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
+        OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
 
     const auto t = iter + 1;
     const auto correction =
@@ -177,7 +177,7 @@ class SparseAdamOp final : public Operator<Context> {
   bool DoRunWithType() {
     const auto* lr = Input(LR).template data<T>();
     const auto iter =
-        OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
+        OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
 
     const auto t = iter + 1;
     const auto correction =
@@ -287,7 +287,7 @@ class RowWiseSparseAdamOp final : public Operator<Context> {
   bool DoRunWithType() {
     const auto* lr = Input(LR).template data<T>();
     const auto iter =
-        OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
+        OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
 
     const auto t = iter + 1;
     const auto correction =
diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu
index 8eb1b8835c96d..8fdde749a4636 100644
--- a/caffe2/sgd/adam_op_gpu.cu
+++ b/caffe2/sgd/adam_op_gpu.cu
@@ -129,7 +129,7 @@ bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
   auto N = Input(GRAD).size();
   auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim());
   const auto iter =
-      OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
+    OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
   const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) /
       (1.0f - std::pow(beta1_, iter + 1));
 
diff --git a/caffe2/sgd/fp16_momentum_sgd_op.h b/caffe2/sgd/fp16_momentum_sgd_op.h
index 85a9d53396fcd..556b8a21f0524 100644
--- a/caffe2/sgd/fp16_momentum_sgd_op.h
+++ b/caffe2/sgd/fp16_momentum_sgd_op.h
@@ -35,9 +35,10 @@ class FP16MomentumSGDUpdateOp final : public Operator<Context> {
         fp32_update_(OperatorBase::GetSingleArgument<int>("fp32_update", 0)) {}
 
   bool RunOnDevice() override {
+    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
diff --git a/caffe2/sgd/fp32_momentum_sgd_op.h b/caffe2/sgd/fp32_momentum_sgd_op.h
index 25ca516eeeeea..d94de7b7ac262 100644
--- a/caffe2/sgd/fp32_momentum_sgd_op.h
+++ b/caffe2/sgd/fp32_momentum_sgd_op.h
@@ -31,9 +31,10 @@ class FP32MomentumSGDUpdateOp final : public Operator<Context> {
         nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
 
   bool RunOnDevice() override {
+    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
diff --git a/caffe2/sgd/iter_op.h b/caffe2/sgd/iter_op.h
index 13681d7db341d..91709f47f3453 100644
--- a/caffe2/sgd/iter_op.h
+++ b/caffe2/sgd/iter_op.h
@@ -38,19 +38,20 @@ class IterOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     if (InputSize() == 0) {
-      if (!OperatorBase::OutputIsType<TensorCPU>(0)) {
+      LOG(INFO) << "[Input size is zero]";
+      if (!OperatorBase::OutputIsType<Tensor>(0, CPU)) {
         // This is the first run; set the iter to start with 0.
         LOG(ERROR) << "You are using an old definition of IterOp that will "
                       "be deprecated soon. More specifically, IterOp now "
                       "requires an explicit in-place input and output.";
 
-        auto* output = OperatorBase::Output<TensorCPU>(0);
+        auto* output = OperatorBase::Output<Tensor>(0, CPU);
         VLOG(1) << "Initializing iter counter.";
         output->Resize(1);
         output->template mutable_data<int64_t>()[0] = 0;
       }
     }
-    IncrementIter(OperatorBase::Output<TensorCPU>(0));
+    IncrementIter(OperatorBase::Output<Tensor>(0, CPU));
     return true;
   }
 };
@@ -67,7 +68,7 @@ class AtomicIterOp final : public Operator<Context> {
   bool RunOnDevice() override {
     auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
     std::lock_guard<std::mutex> lg(*mutex);
-    IncrementIter(OperatorBase::Output<TensorCPU>(0));
+    IncrementIter(OperatorBase::Output<Tensor>(0, CPU));
     CAFFE_EVENT(stats_, num_iter);
     return true;
   }
diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h
index 0a47b6c5fd6d5..bd813ce653dfb 100644
--- a/caffe2/sgd/learning_rate_op.h
+++ b/caffe2/sgd/learning_rate_op.h
@@ -27,12 +27,12 @@ class LearningRateOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     int64_t iter =
-        OperatorBase::Input<TensorCPU>(0).template data<int64_t>()[0];
+        OperatorBase::Input<Tensor>(0, CPU).template data<int64_t>()[0];
     T learning_rate = cur_base_lr_ * (*functor_)(iter);
     // Write to output.
     auto* output = Output(0);
     output->Resize(vector<TIndex>());
-    context_.template Copy<T, CPUContext, Context>(
+    context_.template CopyFromCPU<T>(
         1, &learning_rate, Output(0)->template mutable_data<T>());
     return true;
   }
diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h
index 23da3d420c82b..f3f75f642164e 100644
--- a/caffe2/sgd/momentum_sgd_op.h
+++ b/caffe2/sgd/momentum_sgd_op.h
@@ -45,9 +45,10 @@ class MomentumSGDOp final : public Operator<Context> {
         nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
 
   bool RunOnDevice() override {
+    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
@@ -84,9 +85,10 @@ class MomentumSGDUpdateOp final : public Operator<Context> {
         nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
 
   bool RunOnDevice() override {
+    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
     CAFFE_ENFORCE_EQ(Input(LR).size(), 1);
     CAFFE_ENFORCE_EQ(Input(GRAD).size(), Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
diff --git a/caffe2/sgd/yellowfin_op.h b/caffe2/sgd/yellowfin_op.h
index 02403ea53692d..06ecc177c8b69 100644
--- a/caffe2/sgd/yellowfin_op.h
+++ b/caffe2/sgd/yellowfin_op.h
@@ -126,21 +126,21 @@ CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
 CAFFE2_YF_READ_INPUT(GRAD, grad)
 #undef CAFFE2_YF_READ_OUTPUT
 
-    CAFFE_ENFORCE(OperatorBase::InputIsType<TensorCPU>(ITER));
-    CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1);
-    CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1);
-    CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim());
-    CAFFE_ENFORCE_EQ(param_tensor.ndim(), g_avg_tensor.ndim());
-    CAFFE_ENFORCE_EQ(param_tensor.ndim(), g2_avg_tensor.ndim());
-    CAFFE_ENFORCE_EQ(param_tensor.ndim(), grad_tensor.ndim());
-    for (int i = 0; i < param_tensor.ndim(); ++i) {
-      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
-      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
-      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
-      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
+CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(ITER, CPU));
+CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1);
+CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1);
+CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim());
+CAFFE_ENFORCE_EQ(param_tensor.ndim(), g_avg_tensor.ndim());
+CAFFE_ENFORCE_EQ(param_tensor.ndim(), g2_avg_tensor.ndim());
+CAFFE_ENFORCE_EQ(param_tensor.ndim(), grad_tensor.ndim());
+for (int i = 0; i < param_tensor.ndim(); ++i) {
+  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
+  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
+  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
+  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
     }
 
-    iter_ = OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
+    iter_ = OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
 
     D_ = param_tensor.size();
 
@@ -229,8 +229,8 @@ CAFFE2_YF_READ_INPUT(GRAD, grad)
   int D_;
 
 // Temporary memory on device, listed all variables used in calculations
-#define CAFFE2_YF_DEFINE_TENSOR(NAME) \
-  Tensor<Context> NAME##_tensor_;     \
+#define CAFFE2_YF_DEFINE_TENSOR(NAME)              \
+  Tensor NAME##_tensor_{Context::GetDeviceType()}; \
   T* NAME##_;
 
   CAFFE2_YF_DEFINE_TENSOR(aux_vector)
@@ -255,7 +255,7 @@ CAFFE2_YF_READ_INPUT(GRAD, grad)
   CAFFE2_YF_DEFINE_TENSOR(mu_deb)
   CAFFE2_YF_DEFINE_TENSOR(variance)
 
-  Tensor<Context> scratch_tensor_;
+  Tensor scratch_tensor_{Context::GetDeviceType()};
 
 #undef CAFFE2_YF_DEFINE_TENSOR
 
diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
index 6d42cf6f2bd5b..616587a39fda1 100644
--- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
+++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
@@ -438,9 +438,9 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
   }
 
   bool RunOnDeviceWithOrderNCHW() override {
-    const Tensor<CPUContext>& X = Input(0);
+    const Tensor& X = Input(0);
     auto& filter = Input(1);
-    Tensor<CPUContext>* Y = Output(0);
+    Tensor* Y = Output(0);
     const int N = X.dim32(0), C = X.dim32(1);
     CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
     const int M = filter.dim32(0);
@@ -536,7 +536,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
   }
 
  private:
-  Tensor<CPUContext> bias_;
+  Tensor bias_{CPU};
 };
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, DEPTHWISE_3x3, Depthwise3x3ConvOp);
diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
index 0a759c81aa9de..476930ce4f904 100644
--- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
+++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
@@ -19,7 +19,7 @@ void AddNoiseInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc
index 8e1a0b264c2e5..05c945106c52d 100644
--- a/caffe2/share/contrib/nnpack/conv_op.cc
+++ b/caffe2/share/contrib/nnpack/conv_op.cc
@@ -197,7 +197,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
   initNNPACK();
   pthreadpool_t pool = reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
 
-  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor<CPUContext>* buffer) {
+  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
     if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
       transformedFilters_.resize(group_);
 
@@ -231,11 +231,11 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
             (transformedFilterSize + sizeof(float) - 1) / sizeof(float);
 
         for (auto g = 0; g < group_; g++) {
-          transformedFilters_[g] =
-              ws_->CreateBlob(
-                     "__transformed_kernel_" +
-                     to_string(__sync_fetch_and_add(&precomputed_transform_id, 1)))
-                  ->GetMutable<TensorCPU>();
+          transformedFilters_[g] = ws_->CreateBlob(
+                                          "__transformed_kernel_" +
+                                          to_string(__sync_fetch_and_add(
+                                              &precomputed_transform_id, 1)))
+                                       ->GetMutableTensor(CPU);
           transformedFilters_[g]->Resize(transformedFilterElements);
 
           status = nnp_convolution_inference(
diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc
index c94faaa029c57..ddc451264abca 100644
--- a/caffe2/share/contrib/nnpack/nnpack_test.cc
+++ b/caffe2/share/contrib/nnpack/nnpack_test.cc
@@ -19,7 +19,7 @@ void AddNoiseInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/utils/filler.h b/caffe2/utils/filler.h
index a2aa32fb56db7..8312ed33ddd6f 100644
--- a/caffe2/utils/filler.h
+++ b/caffe2/utils/filler.h
@@ -13,14 +13,14 @@ template <class Context_t>
 class TensorFiller {
  public:
   template <class Type>
-  void Fill(Tensor<Context_t>* tensor) const {
+  void Fill(Tensor* tensor) const {
     CAFFE_ENFORCE(context_, "context is null");
     CAFFE_ENFORCE(tensor, "tensor is null");
     auto min = static_cast<Type>(min_);
     auto max = static_cast<Type>(max_);
     CAFFE_ENFORCE_LE(min, max);
 
-    Tensor<Context_t> temp_tensor(shape_);
+    Tensor temp_tensor(shape_, Context_t::GetDeviceType());
     tensor->swap(temp_tensor);
     Type* data = tensor->template mutable_data<Type>();
     Context_t* context = static_cast<Context_t*>(context_);
diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc
index ae54faa4e628f..f962c20b58126 100644
--- a/caffe2/utils/hip/math_blas_hip_test.cc
+++ b/caffe2/utils/hip/math_blas_hip_test.cc
@@ -26,13 +26,13 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
   vector<int> shapeX{5, 10};
   vector<int> shapeW{10, 6};
   vector<int> shapeY{5, 6};
-  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
+  auto* tensorX = blobX->GetMutableTensor(HIP);
   tensorX->Resize(shapeX);
-  auto* tensorW = blobW->GetMutable<Tensor<HIPContext>>();
+  auto* tensorW = blobW->GetMutableTensor(HIP);
   tensorW->Resize(shapeW);
-  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
+  auto* tensorY = blobY->GetMutableTensor(HIP);
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
+  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorX->size(), 50);
@@ -60,7 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -81,7 +81,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -102,7 +102,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -126,13 +126,13 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
   vector<int> shapeX{5, 10};
   vector<int> shapeW{6, 10};
   vector<int> shapeY{5, 6};
-  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
+  auto* tensorX = blobX->GetMutableTensor(HIP);
   tensorX->Resize(shapeX);
-  auto* tensorW = blobW->GetMutable<Tensor<HIPContext>>();
+  auto* tensorW = blobW->GetMutableTensor(HIP);
   tensorW->Resize(shapeW);
-  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
+  auto* tensorY = blobY->GetMutableTensor(HIP);
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
+  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorX->size(), 50);
@@ -160,7 +160,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -181,7 +181,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -201,7 +201,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -225,13 +225,13 @@ TEST(MathROCBLASTest, GemvNoTrans) {
   vector<int> shapeA{5, 10};
   vector<int> shapeX{10};
   vector<int> shapeY{5};
-  auto* tensorA = blobA->GetMutable<Tensor<HIPContext>>();
+  auto* tensorA = blobA->GetMutableTensor(HIP);
   tensorA->Resize(shapeA);
-  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
+  auto* tensorX = blobX->GetMutableTensor(HIP);
   tensorX->Resize(shapeX);
-  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
+  auto* tensorY = blobY->GetMutableTensor(HIP);
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
+  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorA->size(), 50);
@@ -256,7 +256,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -274,7 +274,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -292,7 +292,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -315,13 +315,13 @@ TEST(MathROCBLASTest, GemvTrans) {
   vector<int> shapeA{6, 10};
   vector<int> shapeX{6};
   vector<int> shapeY{10};
-  auto* tensorA = blobA->GetMutable<Tensor<HIPContext>>();
+  auto* tensorA = blobA->GetMutableTensor(HIP);
   tensorA->Resize(shapeA);
-  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
+  auto* tensorX = blobX->GetMutableTensor(HIP);
   tensorX->Resize(shapeX);
-  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
+  auto* tensorY = blobY->GetMutableTensor(HIP);
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
+  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorA->size(), 60);
@@ -346,7 +346,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
@@ -364,7 +364,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
@@ -382,7 +382,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
+  tensorY_host->CopyFrom(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
diff --git a/caffe2/utils/hip/math_hip.cc b/caffe2/utils/hip/math_hip.cc
index dfe5beb72df8e..59e93a7c8b4d5 100644
--- a/caffe2/utils/hip/math_hip.cc
+++ b/caffe2/utils/hip/math_hip.cc
@@ -705,7 +705,7 @@ DEFINE_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
       const int N,                                                      \
       const T* src,                                                     \
       T* dst,                                                           \
-      Tensor<HIPContext>* scratch_ptr,                                  \
+      Tensor* scratch_ptr,                                              \
       HIPContext* context) {                                            \
     size_t memRequired = 0;                                             \
     cub::DeviceReduce::func(                                            \
@@ -1385,7 +1385,7 @@ void Dot<float, HIPContext>(
   float result;
   ROCBLAS_ENFORCE(
       rocblas_sdot(context->rocblas_handle(), n, a, 1, b, 1, &result));
-  context->Copy<float, CPUContext, HIPContext>(1, &result, y);
+  context->CopyFromCPU<float>(1, &result, y);
 }
 
 template <>
@@ -1474,7 +1474,7 @@ void SumGenericIter(
     IterT it,
     T*& dest,
     HIPContext* context,
-    Tensor<HIPContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   size_t memRequired = 0;
   cub::DeviceReduce::Sum(
       nullptr, memRequired, it, dest, N, context->hip_stream());
@@ -1503,7 +1503,7 @@ void Sum<float, HIPContext>(
     const float* x,
     float* y,
     HIPContext* context,
-    Tensor<HIPContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<float>(N, x, y, context, scratch_ptr);
   } else {
@@ -1526,7 +1526,7 @@ void Sum<int32_t, HIPContext>(
     const int32_t* x,
     int32_t* y,
     HIPContext* context,
-    Tensor<HIPContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<int32_t>(N, x, y, context, scratch_ptr);
   } else {
@@ -1559,7 +1559,7 @@ struct FloatTransform {
       const T* x,                                                         \
       T* y,                                                               \
       HIPContext* context,                                                \
-      Tensor<HIPContext>* scratch_ptr) {                                  \
+      Tensor* scratch_ptr) {                                              \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {                \
       FloatTransform<T> transform;                                        \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \
@@ -1606,7 +1606,7 @@ void SumSqr<float, HIPContext>(
     const float* x,
     float* y,
     HIPContext* context,
-    Tensor<HIPContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SqrTransform<float> transform;
     cub::TransformInputIterator<float, SqrTransform<float>, const float*> it(
@@ -1633,7 +1633,7 @@ void SumSqr<float, HIPContext>(
       const T* x,                                                     \
       T* y,                                                           \
       HIPContext* context,                                            \
-      Tensor<HIPContext>* scratch_ptr) {                              \
+      Tensor* scratch_ptr) {                              \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {            \
       FloatTransform<T> float_transform;                              \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*> \
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index 75ae1778aa3c8..b25dffc7a723b 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -19,7 +19,6 @@ extern "C" {
 
 namespace caffe2 {
 
-template <class Context>
 class Tensor;
 
 // An empty class as a placeholder for a math function that has no specific
@@ -168,7 +167,7 @@ void ReduceMin(
     const int N,
     const T* x,
     T* y,
-    Tensor<Context>* scratch_ptr,
+    Tensor* scratch_ptr,
     Context* context);
 
 template <typename T, class Context>
@@ -176,7 +175,7 @@ void ReduceMax(
     const int N,
     const T* x,
     T* y,
-    Tensor<Context>* scratch_ptr,
+    Tensor* scratch_ptr,
     Context* context);
 
 template <typename T, class Context>
@@ -441,7 +440,7 @@ void Sum(
     const T* x,
     T* y,
     Context* context,
-    Tensor<Context>* scratch_ptr = nullptr);
+    Tensor* scratch_ptr = nullptr);
 
 // Sum of squares of vector x, and writes the result to a single value y.
 template <typename T, class Context>
@@ -450,7 +449,7 @@ void SumSqr(
     const T* x,
     T* y,
     Context* context,
-    Tensor<Context>* scratch_ptr = nullptr);
+    Tensor* scratch_ptr = nullptr);
 
 // Select does index selection of the rows a N*D matrix x, and gives the N
 // dimensional vector y that contains the selected data.
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
index 9290fcd4260eb..e01af7bf2f88d 100644
--- a/caffe2/utils/math_cpu.cc
+++ b/caffe2/utils/math_cpu.cc
@@ -807,28 +807,28 @@ DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)
 // Eigen or via custom code.
 ////////////////////////////////////////////////////////////////////////////////
 
-#define CAFFE2_SPECIALIZED_REDUCEMIN(T)    \
-  template <>                              \
-  void ReduceMin<T, CPUContext>(           \
-      const int N,                         \
-      const T* x,                          \
-      T* y,                                \
-      Tensor<CPUContext>* /*scratch_ptr*/, \
-      CPUContext* /*context*/) {           \
-    *y = *std::min_element(x, x + N);      \
+#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
+  template <>                           \
+  void ReduceMin<T, CPUContext>(        \
+      const int N,                      \
+      const T* x,                       \
+      T* y,                             \
+      Tensor* /*scratch_ptr*/,          \
+      CPUContext* /*context*/) {        \
+    *y = *std::min_element(x, x + N);   \
   }
 CAFFE2_SPECIALIZED_REDUCEMIN(float)
 #undef CAFFE2_SPECIALIZED_REDUCEMIN
 
-#define CAFFE2_SPECIALIZED_REDUCEMAX(T)    \
-  template <>                              \
-  void ReduceMax<T, CPUContext>(           \
-      const int N,                         \
-      const T* x,                          \
-      T* y,                                \
-      Tensor<CPUContext>* /*scratch_ptr*/, \
-      CPUContext* /*context*/) {           \
-    *y = *std::max_element(x, x + N);      \
+#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
+  template <>                           \
+  void ReduceMax<T, CPUContext>(        \
+      const int N,                      \
+      const T* x,                       \
+      T* y,                             \
+      Tensor* /*scratch_ptr*/,          \
+      CPUContext* /*context*/) {        \
+    *y = *std::max_element(x, x + N);   \
   }
 CAFFE2_SPECIALIZED_REDUCEMAX(float)
 CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
@@ -1899,7 +1899,7 @@ void RandGaussian<float, CPUContext>(
       const T* x,                            \
       T* y,                                  \
       CPUContext* /* unused */,              \
-      Tensor<CPUContext>* /* unused */) {    \
+      Tensor* /* unused */) {                \
     *y = ConstEigenVectorMap<T>(x, N).sum(); \
   }
 
@@ -1915,7 +1915,7 @@ void SumSqr<float, CPUContext>(
     const float* x,
     float* y,
     CPUContext* /*context*/ /* unused */,
-    Tensor<CPUContext>* /*scratch_ptr*/ /* unused */) {
+    Tensor* /*scratch_ptr*/ /* unused */) {
   *y = ConstEigenVectorMap<float>(x, N).squaredNorm();
 }
 
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 94d3233886360..40b75e5d5732a 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -639,7 +639,7 @@ DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
       const int N,                                                      \
       const T* src,                                                     \
       T* dst,                                                           \
-      Tensor<CUDAContext>* scratch_ptr,                                 \
+      Tensor* scratch_ptr,                                              \
       CUDAContext* context) {                                           \
     size_t memRequired = 0;                                             \
     cub::DeviceReduce::func(                                            \
@@ -1627,7 +1627,7 @@ void Dot<float, CUDAContext>(
     CUDAContext* context) {
   float result;
   CUBLAS_ENFORCE(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, &result));
-  context->Copy<float, CPUContext, CUDAContext>(1, &result, y);
+  context->CopyFromCPU<float>(1, &result, y);
 }
 
 template <>
@@ -1713,7 +1713,7 @@ void SumGenericIter(
     IterT it,
     T*& dest,
     CUDAContext* context,
-    Tensor<CUDAContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   size_t memRequired = 0;
   cub::DeviceReduce::Sum(
       nullptr, memRequired, it, dest, N, context->cuda_stream());
@@ -1742,7 +1742,7 @@ void Sum<float, CUDAContext>(
     const float* x,
     float* y,
     CUDAContext* context,
-    Tensor<CUDAContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<float>(N, x, y, context, scratch_ptr);
   } else {
@@ -1757,7 +1757,7 @@ void Sum<int32_t, CUDAContext>(
     const int32_t* x,
     int32_t* y,
     CUDAContext* context,
-    Tensor<CUDAContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<int32_t>(N, x, y, context, scratch_ptr);
   } else {
@@ -1782,7 +1782,7 @@ struct FloatTransform {
       const T* x,                                                         \
       T* y,                                                               \
       CUDAContext* context,                                               \
-      Tensor<CUDAContext>* scratch_ptr) {                                 \
+      Tensor* scratch_ptr) {                                              \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {                \
       FloatTransform<T> transform;                                        \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \
@@ -1814,7 +1814,7 @@ void SumSqr<float, CUDAContext>(
     const float* x,
     float* y,
     CUDAContext* context,
-    Tensor<CUDAContext>* scratch_ptr) {
+    Tensor* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SqrTransform<float> transform;
     cub::TransformInputIterator<float, SqrTransform<float>, const float*> it(
@@ -1833,7 +1833,7 @@ void SumSqr<float, CUDAContext>(
       const T* x,                                                       \
       T* y,                                                             \
       CUDAContext* context,                                             \
-      Tensor<CUDAContext>* scratch_ptr) {                               \
+      Tensor* scratch_ptr) {                                            \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {              \
       FloatTransform<T> float_transform;                                \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*>   \
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index 330f34181918c..eaf3ef1aac212 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -41,9 +41,9 @@ void executeGpuBinaryOpTest(
   Blob* bloby = ws.CreateBlob("Y");
   Blob* bloby_host = ws.CreateBlob("Y_host");
 
-  auto* tensorx0 = blobx0->GetMutable<Tensor<CUDAContext>>();
-  auto* tensorx1 = blobx1->GetMutable<Tensor<CUDAContext>>();
-  auto* tensory = bloby->GetMutable<Tensor<CUDAContext>>();
+  auto* tensorx0 = blobx0->GetMutableTensor(CUDA);
+  auto* tensorx1 = blobx1->GetMutableTensor(CUDA);
+  auto* tensory = bloby->GetMutableTensor(CUDA);
 
   vector<int> shapex0_vector{shapex0};
   vector<int> shapex1_vector{shapex1};
@@ -71,8 +71,8 @@ void executeGpuBinaryOpTest(
   context.FinishDeviceComputation();
 
   // Copy result to CPU so we can inspect it
-  auto* tensory_host = bloby_host->GetMutable<Tensor<CPUContext>>();
-  tensory_host->CopyFrom<CUDAContext, CUDAContext>(*tensory, &context);
+  auto* tensory_host = bloby_host->GetMutableTensor(CPU);
+  tensory_host->CopyFrom(*tensory, &context);
   context.FinishDeviceComputation();
 
   for (int i = 0; i < shapey; ++i) {
@@ -94,7 +94,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   vector<int> shapex{33 * 9, 25};
   vector<int> shapey{33, 25};
 
-  auto* tensorx = blobx->GetMutable<Tensor<CUDAContext>>();
+  auto* tensorx = blobx->GetMutableTensor(CUDA);
   tensorx->Resize(shapex);
   int stripe = 33 * 25;
   vector<float> tot(33, 0.0);
@@ -110,7 +110,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
     }
   }
 
-  auto* tensory = bloby->GetMutable<Tensor<CUDAContext>>();
+  auto* tensory = bloby->GetMutableTensor(CUDA);
   tensory->Resize(shapey);
   math::Set<float, CUDAContext>(
       stripe, 0.0, tensory->mutable_data<float>(), &context);
@@ -125,8 +125,8 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   context.FinishDeviceComputation();
 
   // Copy result to CPU so we can inspect it
-  auto* tensory_host = bloby_host->GetMutable<Tensor<CPUContext>>();
-  tensory_host->CopyFrom<CUDAContext, CUDAContext>(*tensory, &context);
+  auto* tensory_host = bloby_host->GetMutableTensor(CPU);
+  tensory_host->CopyFrom(*tensory, &context);
   context.FinishDeviceComputation();
 
   for (int k = 0; k < 33; k++) {
@@ -149,7 +149,7 @@ TEST(MathUtilGPUTest, testReduceMin) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor<CUDAContext> aux;
+        Tensor aux(CUDA);
         math::ReduceMin<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 11.0f; });
@@ -165,7 +165,7 @@ TEST(MathUtilGPUTest, testReduceMin) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor<CUDAContext> aux;
+        Tensor aux(CUDA);
         math::ReduceMin<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 11.0f; });
@@ -184,7 +184,7 @@ TEST(MathUtilGPUTest, testReduceMax) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor<CUDAContext> aux;
+        Tensor aux(CUDA);
         math::ReduceMax<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 11.0f; });
@@ -200,7 +200,7 @@ TEST(MathUtilGPUTest, testReduceMax) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor<CUDAContext> aux;
+        Tensor aux(CUDA);
         math::ReduceMax<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 17.0f; });
@@ -258,9 +258,9 @@ class GemmBatchedGPUTest
     Blob* X_blob = ws_.CreateBlob("X");
     Blob* W_blob = ws_.CreateBlob("W");
     Blob* Y_blob = ws_.CreateBlob("Y");
-    X_ = X_blob->GetMutable<Tensor<CUDAContext>>();
-    W_ = W_blob->GetMutable<Tensor<CUDAContext>>();
-    Y_ = Y_blob->GetMutable<Tensor<CUDAContext>>();
+    X_ = X_blob->GetMutableTensor(CUDA);
+    W_ = W_blob->GetMutableTensor(CUDA);
+    Y_ = Y_blob->GetMutableTensor(CUDA);
     X_->Resize(std::vector<TIndex>{3, 5, 10});
     W_->Resize(std::vector<TIndex>{3, 6, 10});
     Y_->Resize(std::vector<TIndex>{3, 5, 6});
@@ -326,7 +326,7 @@ class GemmBatchedGPUTest
   }
 
   void VerifyOutput(const float value) const {
-    TensorCPU Y_cpu(*Y_);
+    Tensor Y_cpu(*Y_, CPU);
     for (int i = 0; i < Y_cpu.size(); ++i) {
       EXPECT_FLOAT_EQ(value, Y_cpu.template data<float>()[i]);
     }
@@ -335,9 +335,9 @@ class GemmBatchedGPUTest
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor<CUDAContext>* X_ = nullptr;
-  Tensor<CUDAContext>* W_ = nullptr;
-  Tensor<CUDAContext>* Y_ = nullptr;
+  Tensor* X_ = nullptr;
+  Tensor* W_ = nullptr;
+  Tensor* Y_ = nullptr;
   bool trans_X_;
   bool trans_W_;
 };
@@ -381,8 +381,8 @@ class ReduceTensorGPUTest : public testing::Test {
     cuda_context_ = make_unique<CUDAContext>(option_);
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
-    Y_ = blob_y->GetMutable<Tensor<CUDAContext>>();
+    X_ = blob_x->GetMutableTensor(CUDA);
+    Y_ = blob_y->GetMutableTensor(CUDA);
   }
 
   void SetUpData(
@@ -396,14 +396,14 @@ class ReduceTensorGPUTest : public testing::Test {
     X_->Resize(X_dims);
     Y_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->Copy<float, CPUContext, CUDAContext>(
+    cuda_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutable<TensorCPU>();
-    Y_host->CopyFrom<CUDAContext, CUDAContext>(*Y_, cuda_context_.get());
+    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
+    Y_host->CopyFrom(*Y_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
@@ -433,8 +433,8 @@ class ReduceTensorGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor<CUDAContext>* X_ = nullptr;
-  Tensor<CUDAContext>* Y_ = nullptr;
+  Tensor* X_ = nullptr;
+  Tensor* Y_ = nullptr;
 };
 
 TEST_F(ReduceTensorGPUTest, ReduceMinGPUTest) {
@@ -661,8 +661,8 @@ class BroadcastGPUTest : public testing::Test {
     cuda_context_ = make_unique<CUDAContext>(option_);
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
-    Y_ = blob_y->GetMutable<Tensor<CUDAContext>>();
+    X_ = blob_x->GetMutableTensor(CUDA);
+    Y_ = blob_y->GetMutableTensor(CUDA);
   }
 
   void SetUpData(
@@ -672,14 +672,14 @@ class BroadcastGPUTest : public testing::Test {
     X_->Resize(X_dims);
     Y_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->Copy<float, CPUContext, CUDAContext>(
+    cuda_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutable<TensorCPU>();
-    Y_host->CopyFrom<CUDAContext, CUDAContext>(*Y_, cuda_context_.get());
+    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
+    Y_host->CopyFrom(*Y_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
@@ -707,8 +707,8 @@ class BroadcastGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor<CUDAContext>* X_ = nullptr;
-  Tensor<CUDAContext>* Y_ = nullptr;
+  Tensor* X_ = nullptr;
+  Tensor* Y_ = nullptr;
 };
 
 TEST_F(BroadcastGPUTest, BroadcastGPUFloatTest) {
@@ -737,9 +737,9 @@ class MomentsGPUTest : public testing::Test {
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_mean = ws_.CreateBlob("mean");
     Blob* blob_variance = ws_.CreateBlob("variance");
-    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
-    mean_ = blob_mean->GetMutable<Tensor<CUDAContext>>();
-    variance_ = blob_variance->GetMutable<Tensor<CUDAContext>>();
+    X_ = blob_x->GetMutableTensor(CUDA);
+    mean_ = blob_mean->GetMutableTensor(CUDA);
+    variance_ = blob_variance->GetMutableTensor(CUDA);
   }
 
   void SetUpData(
@@ -754,7 +754,7 @@ class MomentsGPUTest : public testing::Test {
     mean_->Resize(Y_dims);
     variance_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->Copy<float, CPUContext, CUDAContext>(
+    cuda_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
@@ -762,12 +762,11 @@ class MomentsGPUTest : public testing::Test {
       const std::vector<float>& mean_data,
       const std::vector<float>& variance_data) {
     Blob* blob_mean_host = ws_.CreateBlob("mean_host");
-    auto* mean_host = blob_mean_host->GetMutable<TensorCPU>();
-    mean_host->CopyFrom<CUDAContext, CUDAContext>(*mean_, cuda_context_.get());
+    auto* mean_host = blob_mean_host->GetMutableTensor(CPU);
+    mean_host->CopyFrom(*mean_, cuda_context_.get());
     Blob* blob_variance_host = ws_.CreateBlob("variance_host");
-    auto* variance_host = blob_variance_host->GetMutable<TensorCPU>();
-    variance_host->CopyFrom<CUDAContext, CUDAContext>(
-        *variance_, cuda_context_.get());
+    auto* variance_host = blob_variance_host->GetMutableTensor(CPU);
+    variance_host->CopyFrom(*variance_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
 
     ASSERT_EQ(mean_data.size(), mean_host->size());
@@ -802,9 +801,9 @@ class MomentsGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor<CUDAContext>* X_ = nullptr;
-  Tensor<CUDAContext>* mean_ = nullptr;
-  Tensor<CUDAContext>* variance_ = nullptr;
+  Tensor* X_ = nullptr;
+  Tensor* mean_ = nullptr;
+  Tensor* variance_ = nullptr;
 };
 
 TEST_F(MomentsGPUTest, MomentsGPUFloatTest) {
@@ -865,8 +864,8 @@ class TransposeGPUTest : public testing::Test {
     cuda_context_ = make_unique<CUDAContext>(option_);
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
-    Y_ = blob_y->GetMutable<Tensor<CUDAContext>>();
+    X_ = blob_x->GetMutableTensor(CUDA);
+    Y_ = blob_y->GetMutableTensor(CUDA);
   }
 
   void SetUpData(
@@ -881,14 +880,14 @@ class TransposeGPUTest : public testing::Test {
     X_->Resize(X_dims);
     Y_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->Copy<float, CPUContext, CUDAContext>(
+    cuda_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutable<TensorCPU>();
-    Y_host->CopyFrom<CUDAContext, CUDAContext>(*Y_, cuda_context_.get());
+    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
+    Y_host->CopyFrom(*Y_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
@@ -916,8 +915,8 @@ class TransposeGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor<CUDAContext>* X_ = nullptr;
-  Tensor<CUDAContext>* Y_ = nullptr;
+  Tensor* X_ = nullptr;
+  Tensor* Y_ = nullptr;
 };
 
 TEST_F(TransposeGPUTest, TransposeGPUFloatTest) {
diff --git a/caffe2/utils/math_test.cc b/caffe2/utils/math_test.cc
index 6d3444553d51f..ed08aedf1954c 100644
--- a/caffe2/utils/math_test.cc
+++ b/caffe2/utils/math_test.cc
@@ -16,9 +16,9 @@ namespace caffe2 {
 TEST(MathTest, GemmNoTransNoTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  TensorCPU X(std::vector<int>{5, 10});
-  TensorCPU W(std::vector<int>{10, 6});
-  TensorCPU Y(std::vector<int>{5, 6});
+  Tensor X(std::vector<int>{5, 10}, CPU);
+  Tensor W(std::vector<int>{10, 6}, CPU);
+  Tensor Y(std::vector<int>{5, 6}, CPU);
   EXPECT_EQ(X.size(), 50);
   EXPECT_EQ(W.size(), 60);
   math::Set<float, CPUContext>(
@@ -91,9 +91,9 @@ TEST(MathTest, GemmNoTransNoTrans) {
 TEST(MathTest, GemmNoTransTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  TensorCPU X(std::vector<int>{5, 10});
-  TensorCPU W(std::vector<int>{6, 10});
-  TensorCPU Y(std::vector<int>{5, 6});
+  Tensor X(std::vector<int>{5, 10}, CPU);
+  Tensor W(std::vector<int>{6, 10}, CPU);
+  Tensor Y(std::vector<int>{5, 6}, CPU);
   EXPECT_EQ(X.size(), 50);
   EXPECT_EQ(W.size(), 60);
   math::Set<float, CPUContext>(
@@ -243,9 +243,9 @@ class GemmBatchedTest
 
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
-  TensorCPU X_;
-  TensorCPU W_;
-  TensorCPU Y_;
+  Tensor X_{CPU};
+  Tensor W_{CPU};
+  Tensor Y_{CPU};
   bool trans_X_;
   bool trans_W_;
 };
@@ -278,9 +278,9 @@ INSTANTIATE_TEST_CASE_P(
 TEST(MathTest, GemvNoTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  TensorCPU A(std::vector<int>{5, 10});
-  TensorCPU X(std::vector<int>{10});
-  TensorCPU Y(std::vector<int>{5});
+  Tensor A(std::vector<int>{5, 10}, CPU);
+  Tensor X(std::vector<int>{10}, CPU);
+  Tensor Y(std::vector<int>{5}, CPU);
   EXPECT_EQ(A.size(), 50);
   EXPECT_EQ(X.size(), 10);
   math::Set<float, CPUContext>(
@@ -344,9 +344,9 @@ TEST(MathTest, GemvNoTrans) {
 TEST(MathTest, GemvTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  TensorCPU A(std::vector<int>{6, 10});
-  TensorCPU X(std::vector<int>{6});
-  TensorCPU Y(std::vector<int>{10});
+  Tensor A(std::vector<int>{6, 10}, CPU);
+  Tensor X(std::vector<int>{6}, CPU);
+  Tensor Y(std::vector<int>{10}, CPU);
   EXPECT_EQ(A.size(), 60);
   EXPECT_EQ(X.size(), 6);
   math::Set<float, CPUContext>(
@@ -445,7 +445,7 @@ class ReduceTensorTest : public testing::Test {
     X_.Resize(X_dims);
     Y_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->Copy<float, CPUContext, CPUContext>(
+    cpu_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     reduce_func(
         X_dims.size(),
@@ -463,8 +463,8 @@ class ReduceTensorTest : public testing::Test {
 
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
-  TensorCPU X_;
-  TensorCPU Y_;
+  Tensor X_{CPU};
+  Tensor Y_{CPU};
 };
 
 TEST_F(ReduceTensorTest, ReduceMinTest) {
@@ -679,7 +679,7 @@ class BroadcastTest : public testing::Test {
     X_.Resize(X_dims);
     Y_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->Copy<float, CPUContext, CPUContext>(
+    cpu_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     math::Broadcast<float, CPUContext>(
         X_dims.size(),
@@ -698,8 +698,8 @@ class BroadcastTest : public testing::Test {
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
 
-  TensorCPU X_;
-  TensorCPU Y_;
+  Tensor X_{CPU};
+  Tensor Y_{CPU};
 };
 
 TEST_F(BroadcastTest, BroadcastFloatTest) {
@@ -735,7 +735,7 @@ class MomentsTest : public testing::Test {
     mean_.Resize(Y_dims);
     variance_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->Copy<float, CPUContext, CPUContext>(
+    cpu_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     math::Moments<float, CPUContext>(
         X_dims.size(),
@@ -759,9 +759,9 @@ class MomentsTest : public testing::Test {
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
 
-  TensorCPU X_;
-  TensorCPU mean_;
-  TensorCPU variance_;
+  Tensor X_{CPU};
+  Tensor mean_{CPU};
+  Tensor variance_{CPU};
 };
 
 TEST_F(MomentsTest, MomentsFloatTest) {
@@ -828,7 +828,7 @@ class TransposeTest : public testing::Test {
     X_.Resize(X_dims);
     Y_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->Copy<float, CPUContext, CPUContext>(
+    cpu_context_->CopyFromCPU<float>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     math::Transpose<float, CPUContext>(
         X_dims.size(),
@@ -846,8 +846,8 @@ class TransposeTest : public testing::Test {
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
 
-  TensorCPU X_;
-  TensorCPU Y_;
+  Tensor X_{CPU};
+  Tensor Y_{CPU};
 };
 
 TEST_F(TransposeTest, TransposeFloatTest) {
diff --git a/caffe2/utils/smart_tensor_printer.cc b/caffe2/utils/smart_tensor_printer.cc
index feb669e4bbae0..b7fa7ef0d3365 100644
--- a/caffe2/utils/smart_tensor_printer.cc
+++ b/caffe2/utils/smart_tensor_printer.cc
@@ -33,7 +33,7 @@ struct ProxyPrinter {
         char>>::call(this, tensor->meta());
   }
 
-  const Tensor<CPUContext>* tensor;
+  const Tensor* tensor;
   TensorPrinter* tensorPrinter;
 };
 }
@@ -52,7 +52,7 @@ SmartTensorPrinter::SmartTensorPrinter(
     int limit)
     : tensorPrinter_(tensor_name, file_name, limit) {}
 
-void SmartTensorPrinter::Print(const Tensor<CPUContext>& tensor) {
+void SmartTensorPrinter::Print(const Tensor& tensor) {
   ProxyPrinter printer;
 
   printer.tensor = &tensor;
@@ -71,7 +71,7 @@ SmartTensorPrinter& SmartTensorPrinter::DefaultTensorPrinter() {
 #endif
 }
 
-void SmartTensorPrinter::PrintTensor(const Tensor<CPUContext>& tensor) {
+void SmartTensorPrinter::PrintTensor(const Tensor& tensor) {
   DefaultTensorPrinter().Print(tensor);
 }
 }
diff --git a/caffe2/utils/smart_tensor_printer.h b/caffe2/utils/smart_tensor_printer.h
index f99226d696438..224f7d91e0e98 100644
--- a/caffe2/utils/smart_tensor_printer.h
+++ b/caffe2/utils/smart_tensor_printer.h
@@ -27,19 +27,17 @@ class SmartTensorPrinter {
       const std::string& file_name,
       int limit);
 
-  void Print(const Tensor<CPUContext>& tensor);
+  void Print(const Tensor& tensor);
 
-  template <class Context>
-  void PrintMeta(const Tensor<Context>& tensor) {
+  void PrintMeta(const Tensor& tensor) {
     tensorPrinter_.PrintMeta(tensor);
   }
 
   // Uses a default constructed SmartTensorPrinter
-  static void PrintTensor(const Tensor<CPUContext>& tensor);
+  static void PrintTensor(const Tensor& tensor);
 
   // Uses a default constructed SmartTensorPrinter
-  template <class Context>
-  void PrintTensorMeta(const Tensor<Context>& tensor) {
+  void PrintTensorMeta(const Tensor& tensor) {
     DefaultTensorPrinter().PrintMeta(tensor);
   }
 
diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc
index d5681e2e0b07f..82a59ad60aa95 100644
--- a/caffe2/utils/smart_tensor_printer_test.cc
+++ b/caffe2/utils/smart_tensor_printer_test.cc
@@ -30,7 +30,7 @@ void printTensorAndCheck(const std::vector<T>& values) {
   testing::internal::CaptureStderr();
   CPUContext cpuContext;
 
-  Tensor<CPUContext> tensor(
+  Tensor tensor(
       std::vector<TIndex>{static_cast<TIndex>(values.size())},
       values,
       &cpuContext);
diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h
index 216b039501d33..3034e1bd4adbe 100644
--- a/caffe2/video/video_input_op.h
+++ b/caffe2/video/video_input_op.h
@@ -52,14 +52,14 @@ class VideoInputOp final : public PrefetchOperator<Context> {
 
   const db::DBReader* reader_;
   CPUContext cpu_context_;
-  TensorCPU prefetched_clip_rgb_;
-  TensorCPU prefetched_clip_of_;
-  TensorCPU prefetched_label_;
-  TensorCPU prefetched_video_id_;
-  Tensor<Context> prefetched_clip_rgb_on_device_;
-  Tensor<Context> prefetched_clip_of_on_device_;
-  Tensor<Context> prefetched_label_on_device_;
-  Tensor<Context> prefetched_video_id_on_device_;
+  Tensor prefetched_clip_rgb_{CPU};
+  Tensor prefetched_clip_of_{CPU};
+  Tensor prefetched_label_{CPU};
+  Tensor prefetched_video_id_{CPU};
+  Tensor prefetched_clip_rgb_on_device_{Context::GetDeviceType()};
+  Tensor prefetched_clip_of_on_device_{Context::GetDeviceType()};
+  Tensor prefetched_label_on_device_{Context::GetDeviceType()};
+  Tensor prefetched_video_id_on_device_{Context::GetDeviceType()};
   int batch_size_;
   int clip_per_video_;
   std::vector<float> mean_rgb_;
@@ -826,7 +826,8 @@ template <class Context>
 bool VideoInputOp<Context>::CopyPrefetched() {
   int index = 0;
   if (get_rgb_) {
-    auto* clip_rgb_output = OperatorBase::Output<Tensor<Context>>(index++);
+    auto* clip_rgb_output =
+        OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
     if (std::is_same<Context, CPUContext>::value) {
       clip_rgb_output->CopyFrom(prefetched_clip_rgb_, &context_);
     } else {
@@ -834,21 +835,24 @@ bool VideoInputOp<Context>::CopyPrefetched() {
     }
   }
   if (get_optical_flow_) {
-    auto* clip_of_output = OperatorBase::Output<Tensor<Context>>(index++);
+    auto* clip_of_output =
+        OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
     if (std::is_same<Context, CPUContext>::value) {
       clip_of_output->CopyFrom(prefetched_clip_of_, &context_);
     } else {
       clip_of_output->CopyFrom(prefetched_clip_of_on_device_, &context_);
     }
   }
-  auto* label_output = OperatorBase::Output<Tensor<Context>>(index++);
+  auto* label_output =
+      OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
   if (std::is_same<Context, CPUContext>::value) {
     label_output->CopyFrom(prefetched_label_, &context_);
   } else {
     label_output->CopyFrom(prefetched_label_on_device_, &context_);
   }
   if (get_video_id_) {
-    auto* video_id_output = OperatorBase::Output<Tensor<Context>>(index);
+    auto* video_id_output =
+        OperatorBase::Output<Tensor>(index, Context::GetDeviceType());
     if (std::is_same<Context, CPUContext>::value) {
       video_id_output->CopyFrom(prefetched_video_id_, &context_);
     } else {
diff --git a/modules/detectron/group_spatial_softmax_op.h b/modules/detectron/group_spatial_softmax_op.h
index 6bced40dc0532..2109aca13fe80 100644
--- a/modules/detectron/group_spatial_softmax_op.h
+++ b/modules/detectron/group_spatial_softmax_op.h
@@ -68,7 +68,7 @@ class GroupSpatialSoftmaxGradientOp final : public Operator<Context> {
  protected:
   int num_classes_;
   StorageOrder order_;
-  Tensor<Context> sum_probs_;
+  Tensor sum_probs_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/select_smooth_l1_loss_op.h b/modules/detectron/select_smooth_l1_loss_op.h
index 04908ef3af3ad..131be9e0993c5 100644
--- a/modules/detectron/select_smooth_l1_loss_op.h
+++ b/modules/detectron/select_smooth_l1_loss_op.h
@@ -45,7 +45,7 @@ class SelectSmoothL1LossOp final : public Operator<Context> {
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
   int dim_; // dimension for 1 anchor prediction
-  Tensor<Context> buff_; // Buffer for element-wise differences
+  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
 };
 
 template <typename T, class Context>
@@ -69,7 +69,7 @@ class SelectSmoothL1LossGradientOp final : public Operator<Context> {
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
   int dim_; // dimension for 1 anchor prediction
-  Tensor<Context> buff_; // Buffer for element-wise differences
+  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.h b/modules/detectron/sigmoid_cross_entropy_loss_op.h
index 34acd6886a716..bb0e923ddb93d 100644
--- a/modules/detectron/sigmoid_cross_entropy_loss_op.h
+++ b/modules/detectron/sigmoid_cross_entropy_loss_op.h
@@ -44,9 +44,9 @@ class SigmoidCrossEntropyLossOp final : public Operator<Context> {
  protected:
   float scale_;
   int normalize_;
-  Tensor<Context> losses_;
-  Tensor<Context> counts_;
-  Tensor<Context> normalizer_;
+  Tensor losses_{Context::GetDeviceType()};
+  Tensor counts_{Context::GetDeviceType()};
+  Tensor normalizer_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -69,8 +69,8 @@ class SigmoidCrossEntropyLossGradientOp final : public Operator<Context> {
  protected:
   float scale_;
   int normalize_;
-  Tensor<Context> counts_;
-  Tensor<Context> normalizer_;
+  Tensor counts_{Context::GetDeviceType()};
+  Tensor normalizer_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/sigmoid_focal_loss_op.h b/modules/detectron/sigmoid_focal_loss_op.h
index d59df8f4b2fa1..2a07abc5afe72 100644
--- a/modules/detectron/sigmoid_focal_loss_op.h
+++ b/modules/detectron/sigmoid_focal_loss_op.h
@@ -47,8 +47,8 @@ class SigmoidFocalLossOp final : public Operator<Context> {
   int num_classes_;
   float gamma_;
   float alpha_;
-  Tensor<Context> losses_;
-  Tensor<Context> counts_;
+  Tensor losses_{Context::GetDeviceType()};
+  Tensor counts_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -74,8 +74,8 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> {
   int num_classes_;
   float gamma_;
   float alpha_;
-  Tensor<Context> counts_;
-  Tensor<Context> weights_; // unignored weights
+  Tensor counts_{Context::GetDeviceType()};
+  Tensor weights_{Context::GetDeviceType()}; // unignored weights
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/smooth_l1_loss_op.h b/modules/detectron/smooth_l1_loss_op.h
index 283be2eb73134..013645ebc08ad 100644
--- a/modules/detectron/smooth_l1_loss_op.h
+++ b/modules/detectron/smooth_l1_loss_op.h
@@ -44,7 +44,7 @@ class SmoothL1LossOp final : public Operator<Context> {
  protected:
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
-  Tensor<Context> buff_; // Buffer for element-wise differences
+  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
 };
 
 template <typename T, class Context>
@@ -67,7 +67,7 @@ class SmoothL1LossGradientOp final : public Operator<Context> {
  protected:
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
-  Tensor<Context> buff_; // Buffer for element-wise differences
+  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/softmax_focal_loss_op.h b/modules/detectron/softmax_focal_loss_op.h
index 98750dd189bf1..ac9b0e39a0780 100644
--- a/modules/detectron/softmax_focal_loss_op.h
+++ b/modules/detectron/softmax_focal_loss_op.h
@@ -52,7 +52,7 @@ class SoftmaxFocalLossOp final : public Operator<Context> {
   float alpha_;
   int num_classes_;
   StorageOrder order_;
-  Tensor<Context> losses_;
+  Tensor losses_{Context::GetDeviceType()};
 };
 
 template <typename T, class Context>
@@ -83,7 +83,7 @@ class SoftmaxFocalLossGradientOp final : public Operator<Context> {
   float alpha_;
   int num_classes_;
   StorageOrder order_;
-  Tensor<Context> buff_;
+  Tensor buff_{Context::GetDeviceType()};
 };
 
 } // namespace caffe2

From 716f7d657daca1a79fb35192b8151e874115befc Mon Sep 17 00:00:00 2001
From: Gregory Chanan <gchanan@fb.com>
Date: Thu, 26 Jul 2018 10:36:25 -0700
Subject: [PATCH 04/10] Remove Broadcast.py. (#9843)

Summary:
I don't think this file is used anywhere, I guess we'll find out!

(Weirdly this failed lint on one of my PRs even though it shouldn't).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9843

Differential Revision: D9003949

Pulled By: gchanan

fbshipit-source-id: 26d580d1e7cdd30e82e5f4176244e51fd7cd616d
---
 tools/cwrap/plugins/Broadcast.py | 362 -------------------------------
 tools/cwrap/plugins/__init__.py  |   1 -
 2 files changed, 363 deletions(-)
 delete mode 100644 tools/cwrap/plugins/Broadcast.py

diff --git a/tools/cwrap/plugins/Broadcast.py b/tools/cwrap/plugins/Broadcast.py
deleted file mode 100644
index 5b0a74167839c..0000000000000
--- a/tools/cwrap/plugins/Broadcast.py
+++ /dev/null
@@ -1,362 +0,0 @@
-from . import CWrapPlugin
-from string import Template
-
-# Arguments to the Broadcast Plugin:
-# broadcast: args_to_broadcast_against [inplace] [fallback]
-# [args_to_broadcast_against]: either a single argument (e.g. "arg1") or a comma-separated
-#                              list of two arguments (e.g. "tensor1,tensor2") indicating
-#                              arguments to broadcast specified argument (usually "self") against
-# [inplace] will generate code for in-place function, which doesn't allow the in-place
-#           argument to be broadcast
-# [fallback] if tensors aren't broadcastable, preserves "element number" pointwise behavior,
-#            where only number of elements need to match, and tensors are viewed as 1-dimensional.
-# [dims] specify if the tensors shouldn't be broadcast to a specific tensor or tensors, but a combination
-#        of individual dimension sizes of a set of tensors.  For example: addbmm(C,A,B) a.k.a. [C + A @ B]
-#        broadcasts C to the first dimension of A and the second dimension of B.  Each dimension is specified as
-#        [arg].dim[#] and dimensions are comma-separated.  So, to specify that the tensor should be
-#        broadcast to 3-dimensions with sizes:
-#        tensor0->size[0] x tensor1->size[1] x tensor2->size[2]
-#        you would write:
-#        dims:tensor0.dim0,tensor1.dim1,tensor2.dim2
-# [types] if the tensors should be of different types than THTensor, specify as X where
-#         the actual type to use is THXTensor (i.e. Byte for THByteTensor).  If the type
-#         should be THTensor, use 'Real'
-
-# For out of place:
-# Two args: expand the two args together
-# Three args (fused kernels): (e.g. addcmul) expand all three args together
-# Sketch of proof that this is the same:
-# consider addcmul, under expansion we want: a + (b * c) = (a + b * c) [all expanded together]
-# Let e(i, j) be the expansion of i with j, e(i, j, k) be the expansion of i with j,k
-#
-# Then a + (b * c) = e(a, e(b,c) * e(c,b)) + e(e(b,c)   * e(c,b), a)
-#                  = e(a, e(b,c))          + e(e(b,c)   * e(c,b), a)    (only size matters for second param)
-#                  = e(a,b,c)              + e(e(b,c)   * e(c,b), a)    (by associativity of max in expand)
-#                  = e(a,b,c)              + e(b,c,a)   * e(c,b,a)      (see L1)
-# which is a + b * c all expanded together
-#
-# L1: Show e(i * j, a) = e(i,a) * e(j,a) where i,j have same size
-# Consider any index _{ s_0, ..., s_n}
-# e(i * j, a) = (i*j)_{f(s_0), ...,f(s_n)} where f is the expansion of that dimension with a
-#             = i_{f(s_0), ..., f(s_n)} * j_{f(s_0), ..., f(s_n)} by definition of pointwise operator
-#             = e(i,a) * e(j,a)
-
-
-class Broadcast(CWrapPlugin):
-
-    # Save and restore passed in arguments in case later plugins use
-    POST_TEMPLATE = Template(
-        """${arg_op_other} = ${arg_op_other}_save;\n""")
-
-    def getPreArgStringTemplate(self, type=None):
-        if type is None:
-            ret = """THTensor *${arg_op_other}_save = ${arg_op_other};
-                     THTensorPtr ${arg_op_other}_guard(nullptr);\n"""
-        else:
-            cpu_t = "TH" + type + "Tensor"
-            gpu_t = "THCuda" + type + "Tensor"
-            ret = ("#if !IS_CUDA\n" +
-                   cpu_t + " *${arg_op_other}_save = ${arg_op_other};\n" +
-                   cpu_t + "Ptr ${arg_op_other}_guard(nullptr);\n" +
-                   "#else\n" +
-                   gpu_t + " *${arg_op_other}_save = ${arg_op_other};\n" +
-                   "THPPointer<" + gpu_t + "> ${arg_op_other}_guard(nullptr);\n" +
-                   "#endif\n")
-        return Template(ret)
-
-    def getNewForExpand(self, type):
-        if type is None:
-            ret = """THTensor_(new)(LIBRARY_STATE_NOARGS);\n"""
-        else:
-            cpu_t = "TH" + type + "Tensor"
-            gpu_t = "THCuda" + type + "Tensor"
-            ret = ("#if !IS_CUDA\n" +
-                   cpu_t + "_new(LIBRARY_STATE_NOARGS);\n" +
-                   "#else\n" +
-                   gpu_t + "_new(LIBRARY_STATE_NOARGS);\n" +
-                   "#endif\n")
-        return ret
-
-    def getExpandTemplate(self, same_size_check, expand_call, success_code, raise_errors):
-        if not raise_errors:
-            return Template(
-                "bool try_expand = !" + same_size_check + "\n" +
-                "if (try_expand) {\n" +
-                "bool expand_success = false;\n" +
-                "try {\n" +
-                expand_call +
-                "\nexpand_success = true;\n" +
-                "}\n"
-                "catch (std::exception &e) {}\n" +
-                "if(expand_success) {\n" +
-                success_code +
-                "\n}" +
-                "\n}\n")
-        else:
-            return Template(
-                "bool try_expand = !" + same_size_check + "\n" +
-                "if (try_expand) {\n" +
-                expand_call + "\n" +
-                success_code + "\n"
-                "}\n")
-
-    def getOutPlacePreExpand2Template(self, type_op_a, type_op_other, raise_errors):
-        size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension,
-                                            ${arg_op_other}->size, ${arg_op_other}->nDimension);"""
-        expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" +
-                       "${arg_op_other}_guard = \n" + self.getNewForExpand(type_op_other) + "\n" +
-                       """expand_outplace2(LIBRARY_STATE ${arg_op_a}_guard.get(), ${arg_op_other}_guard.get(),
-                                           ${arg_op_a}, ${arg_op_other},
-                                           \"${op_a}\", \"${op_other}\", !${raise_errors});""")
-        success_code = """${arg_op_a} = ${arg_op_a}_guard.get();
-                          ${arg_op_other} = ${arg_op_other}_guard.get();"""
-        return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors)
-
-    def getOutPlacePreExpand3Template(self, type_op_a, type_op_other1, type_op_other2, raise_errors):
-        size_check = """(THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension,
-                                             ${arg_op_other1}->size, ${arg_op_other1}->nDimension) &&
-                        THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension,
-                                            ${arg_op_other2}->size, ${arg_op_other2}->nDimension));"""
-        expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" +
-                       "${arg_op_other1}_guard = \n" + self.getNewForExpand(type_op_other1) + "\n" +
-                       "${arg_op_other2}_guard = \n" + self.getNewForExpand(type_op_other2) + "\n" +
-                       """expand_outplace3(LIBRARY_STATE ${arg_op_a}_guard.get(),
-                                          ${arg_op_other1}_guard.get(), ${arg_op_other2}_guard.get(),
-                                          ${arg_op_a}, ${arg_op_other1}, ${arg_op_other2},
-                                          \"${op_a}\", \"${op_other1}\", \"${op_other2}\", !${raise_errors});""")
-        success_code = """${arg_op_a} = ${arg_op_a}_guard.get();
-                          ${arg_op_other1} = ${arg_op_other1}_guard.get();
-                          ${arg_op_other2} = ${arg_op_other2}_guard.get();"""
-        return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors)
-
-    OUT_PLACE_PRE_EXPAND_PRE_DIM_TEMPLATE = Template(
-        """if(THTensor_(nDimension)(LIBRARY_STATE ${arg_op_dim}) <= ${arg_op_dim_value}) {
-             THError("Argument %s requires at least %d dimensions, but only has %d",
-                     "${op_dim}", ${arg_op_dim_value} + 1, THTensor_(nDimension)(LIBRARY_STATE ${arg_op_dim}));
-           }
-           int64_t ${arg_op_a}_dim${idx}_size = THTensor_(size)(LIBRARY_STATE ${arg_op_dim}, ${arg_op_dim_value});\n""")
-
-    OUT_PLACE_PRE_EXPAND1_DIM_TEMPLATE = Template(
-        """THLongStoragePtr ${arg_op_a}_storage(THLongStorage_newWithSize1(${arg_op_a}_dim0_size));\n""")
-
-    OUT_PLACE_PRE_EXPAND2_DIM_TEMPLATE = Template(
-        """THLongStoragePtr ${arg_op_a}_storage(
-               THLongStorage_newWithSize2(${arg_op_a}_dim0_size, ${arg_op_a}_dim1_size));\n""")
-
-    OUT_PLACE_PRE_EXPAND3_DIM_TEMPLATE = Template(
-        """THLongStoragePtr ${arg_op_a}_storage(
-               THLongStorage_newWithSize3(${arg_op_a}_dim0_size, ${arg_op_a}_dim1_size, ${arg_op_a}_dim2_size));\n""")
-
-    def getOutPlacePreExpandPostDimTemplate(self, type_op_a, raise_errors):
-        size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension,
-                                            ${arg_op_a}_storage->data, ${arg_op_a}_storage->size);"""
-        expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" +
-                       """expand(LIBRARY_STATE ${arg_op_a}_guard.get(), ${arg_op_a}, ${arg_op_a}_storage);""")
-        success_code = """${arg_op_a} = ${arg_op_a}_guard.get();"""
-        return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors)
-
-    OUT_PLACE_PRE_TEMPLATE = Template(
-        """${code_arg_op_a}${code_arg_op_other1}${code_arg_op_other2}
-           ${expand_code}""")
-
-    def getInPlacePreExpand1Template(self, type_op_other, raise_errors):
-        size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension,
-                                            ${arg_op_other}->size, ${arg_op_other}->nDimension);"""
-        expand_code = ("${arg_op_other}_guard = \n" + self.getNewForExpand(type_op_other) + "\n" +
-                       """expand_inplace1(LIBRARY_STATE ${arg_op_other}_guard.get(), ${arg_op_other}, ${arg_op_a},
-                                         \"${op_other}\", \"${op_a}\", !${raise_errors});""")
-        success_code = """${arg_op_other} = ${arg_op_other}_guard.get();"""
-        return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors)
-
-    def getInPlacePreExpand2Template(self, type_op_other1, type_op_other2, raise_errors):
-        size_check = """(THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension,
-                                             ${arg_op_other1}->size, ${arg_op_other1}->nDimension) &&
-                         THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension,
-                                             ${arg_op_other2}->size, ${arg_op_other2}->nDimension));"""
-        expand_code = ("${arg_op_other1}_guard = \n" + self.getNewForExpand(type_op_other1) + "\n" +
-                       "${arg_op_other2}_guard = \n" + self.getNewForExpand(type_op_other2) + "\n" +
-                       """expand_inplace2(LIBRARY_STATE ${arg_op_other1}_guard.get(), ${arg_op_other2}_guard.get(),
-                                         ${arg_op_other1}, ${arg_op_other2}, ${arg_op_a},
-                                         \"${op_other1}\", \"${op_other2}\", \"${op_a}\", !${raise_errors});""")
-        success_code = """${arg_op_other1} = ${arg_op_other1}_guard.get();
-                          ${arg_op_other2} = ${arg_op_other2}_guard.get();"""
-        return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors)
-
-    IN_PLACE_PRE_TEMPLATE = Template(
-        """${code_arg_op_other1}${code_arg_op_other2}
-           ${expand_code}""")
-
-    def initialize(self, cwrap):
-        self.cwrap = cwrap
-
-    # Arguments:
-    # [0]: name of tensor to broadcast with (possibly two comma separated)
-    # [1] inplace (optional).  In place operations only broadcast on second tensor argument
-    # [2] fallback (optional).  Will fallback to applying to tensor of equal nElem if broadcast fails
-    def process_option_code_template(self, template, option):
-        new_code_pre = []
-        new_code_post = []
-        for _, arg in enumerate(option['arguments']):
-            if 'broadcast' not in arg:
-                continue
-
-            params = arg.get('broadcast').split(" ")
-            op_a = arg.get('assign_name', arg['name'])
-            in_place = "inplace" in params
-            raise_errors = "false" if "fallback" in params else "true"
-
-            param_others = params[0].split(",")
-            if len(param_others) > 2:
-                raise ValueError('Broadcast only supports up to 2 secondary parameters')
-            op_b = param_others[0]
-            op_c = param_others[1] if len(param_others) == 2 else None
-            arg_op_b = "arg_" + op_b
-            arg_op_a = "arg_" + op_a
-            arg_op_c = ("arg_" + op_c) if op_c else None
-
-            dims_kvs = []
-            for p in params:
-                if p.startswith("dims:"):
-                    assert(raise_errors == "true")
-                    if len(dims_kvs) != 0:
-                        raise ValueError("multiple specifications of dims")
-                    dims = p[len("dims:"):].split(",")
-                    for dim in dims:
-                        batchdim = dim.split(".")
-                        assert len(batchdim) == 2
-                        assert batchdim[1].startswith("dim")
-                        dim_val = batchdim[1][len("dim"):]
-                        dims_kvs.append({"op": batchdim[0], "arg_op": "arg_" + batchdim[0], "val": dim_val})
-
-            assert len(dims_kvs) <= 3
-            for p in params[1:]:
-                if p != "inplace" and p != "fallback" and not p.startswith("dims:") and not p.startswith("types:"):
-                    raise ValueError("invalid parameter {}".format(p))
-
-            type_op_b = None
-            type_op_c = None
-            for p in params:
-                if p.startswith("types:"):
-                    if not in_place and len(dims_kvs) > 0:
-                        raise ValueError("type specification not supported yet for out-of-place functions "
-                                         "that specify explicit dimensions")
-                    types = p[len("types:"):].split(",")
-                    assert(len(types) == (2 if op_c else 1))
-                    type_op_b = None if types[0] == "Real" else types[0]
-                    if op_c:
-                        type_op_c = None if types[1] == "Real" else types[1]
-
-            op_b_mapping = {
-                "op_a": op_a,
-                "op_other": op_b,
-                "arg_op_a": arg_op_a,
-                "arg_op_other": arg_op_b,
-                "raise_errors": raise_errors
-            }
-            op_c_mapping = {
-                "op_a": op_a,
-                "op_other": op_c,
-                "arg_op_a": arg_op_a,
-                "arg_op_other": arg_op_c,
-                "raise_errors": raise_errors
-            }
-            raise_errors_s = raise_errors == "true"
-
-            if in_place:
-                code_arg_op_other1 = self.getPreArgStringTemplate(type=type_op_b).substitute(op_b_mapping)
-                code_arg_op_other2 = (
-                    self.getPreArgStringTemplate(type=type_op_c).substitute(op_c_mapping) if op_c else "")
-
-                if op_c:
-                    expand_code = self.getInPlacePreExpand2Template(type_op_b, type_op_c, raise_errors_s).substitute(
-                        op_b_mapping,
-                        op_other1=op_b,
-                        op_other2=op_c,
-                        arg_op_other1=arg_op_b,
-                        arg_op_other2=arg_op_c)
-                else:
-                    expand_code = self.getInPlacePreExpand1Template(type_op_b, raise_errors_s).substitute(op_b_mapping)
-
-                new_code_pre.append(self.IN_PLACE_PRE_TEMPLATE.substitute(
-                    arg_op_a=arg_op_a,
-                    code_arg_op_other1=code_arg_op_other1,
-                    code_arg_op_other2=code_arg_op_other2,
-                    expand_code=expand_code,
-                    raise_errors=raise_errors))
-                new_code_pre.append("")
-
-                post_code = self.POST_TEMPLATE.substitute(op_b_mapping)
-                if op_c:
-                    post_code += self.POST_TEMPLATE.substitute(op_c_mapping)
-
-                new_code_post.append(post_code)
-                new_code_post.append("")
-            else:
-                if len(dims_kvs) != 0:
-                    code_arg_op_a = self.getPreArgStringTemplate().substitute(arg_op_other=arg_op_a)
-                    code_arg_op_other1 = ""
-                    code_arg_op_other2 = ""
-                    expand_code = ""
-                    for idx, kv in enumerate(dims_kvs):
-                        expand_code += self.OUT_PLACE_PRE_EXPAND_PRE_DIM_TEMPLATE.substitute(
-                            arg_op_a=arg_op_a,
-                            op_dim=kv["op"],
-                            arg_op_dim=kv["arg_op"],
-                            arg_op_dim_value=kv["val"],
-                            idx=idx)
-
-                    if len(dims_kvs) == 1:
-                        expand_code += self.OUT_PLACE_PRE_EXPAND1_DIM_TEMPLATE.substitute(
-                            arg_op_a=arg_op_a,
-                            arg_op_dim0=dims_kvs[0]["arg_op"])
-                    elif len(dims_kvs) == 2:
-                        expand_code += self.OUT_PLACE_PRE_EXPAND2_DIM_TEMPLATE.substitute(
-                            arg_op_a=arg_op_a,
-                            arg_op_dim0=dims_kvs[0]["arg_op"],
-                            arg_op_dim1=dims_kvs[1]["arg_op"])
-                    else:
-                        expand_code += self.OUT_PLACE_PRE_EXPAND3_DIM_TEMPLATE.substitute(
-                            arg_op_a=arg_op_a,
-                            arg_op_dim0=dims_kvs[0]["arg_op"],
-                            arg_op_dim1=dims_kvs[1]["arg_op"],
-                            arg_op_dim2=dims_kvs[2]["arg_op"])
-                    expand_code += self.getOutPlacePreExpandPostDimTemplate(None, raise_errors_s).substitute(
-                        arg_op_a=arg_op_a,
-                        raise_errors=raise_errors)
-                    post_code = self.POST_TEMPLATE.substitute(arg_op_other=arg_op_a)
-
-                else:
-                    code_arg_op_a = self.getPreArgStringTemplate().substitute(arg_op_other=arg_op_a)
-                    code_arg_op_other1 = self.getPreArgStringTemplate(type=type_op_b).substitute(op_b_mapping)
-                    code_arg_op_other2 = (self.getPreArgStringTemplate(type=type_op_c).substitute(op_c_mapping)
-                                          if op_c else "")
-
-                    if op_c:
-                        expand_template = self.getOutPlacePreExpand3Template(None, type_op_b, type_op_c, raise_errors_s)
-                        expand_code = expand_template.substitute(
-                            op_b_mapping,
-                            op_other1=op_b,
-                            op_other2=op_c,
-                            arg_op_other1=arg_op_b,
-                            arg_op_other2=arg_op_c)
-
-                    else:
-                        expand_code = self.getOutPlacePreExpand2Template(None, type_op_b, raise_errors_s).substitute(
-                            op_b_mapping)
-
-                    post_code = self.POST_TEMPLATE.substitute(arg_op_other=arg_op_a)
-                    post_code += self.POST_TEMPLATE.substitute(op_b_mapping)
-                    post_code += self.POST_TEMPLATE.substitute(op_c_mapping) if op_c else ""
-
-                new_code_pre.append(self.OUT_PLACE_PRE_TEMPLATE.substitute(
-                    code_arg_op_a=code_arg_op_a,
-                    code_arg_op_other1=code_arg_op_other1,
-                    code_arg_op_other2=code_arg_op_other2,
-                    expand_code=expand_code))
-                new_code_pre.append("")
-
-                new_code_post.append(post_code)
-                new_code_post.append("")
-
-        template = new_code_pre + template + new_code_post
-        return template
diff --git a/tools/cwrap/plugins/__init__.py b/tools/cwrap/plugins/__init__.py
index 7efb4a51bf1ce..53789a0bed989 100644
--- a/tools/cwrap/plugins/__init__.py
+++ b/tools/cwrap/plugins/__init__.py
@@ -432,4 +432,3 @@ def process_pre_arg_assign(self, template, option):
 from .AutoGPU import AutoGPU
 from .CuDNNPlugin import CuDNNPlugin
 from .WrapDim import WrapDim
-from .Broadcast import Broadcast

From 6f10944f88d4f4ee5c25fd184ad419d8400cb36a Mon Sep 17 00:00:00 2001
From: Junjie Bai <bai@in.tum.de>
Date: Thu, 26 Jul 2018 10:51:25 -0700
Subject: [PATCH 05/10] Re-enable rocm tests that have been fixed in rocm 1.8.2
 (#9862)

Summary:
petrex
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9862

Differential Revision: D9012520

Pulled By: bddppq

fbshipit-source-id: cdcc184e23befa8dbd1bc44d59bd25766aac33d0
---
 .jenkins/caffe2/test.sh | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 9b6b28f2842b8..60502343472ae 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -100,6 +100,7 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then
   # Unknown reasons, need to debug
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/arg_ops_test.py")
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py")
+  rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py")
 
   # Need to go through roi ops to replace max(...) with fmaxf(...)
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/roi_align_rotated_op_test.py")
@@ -107,12 +108,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then
   # Our cuda top_k op has some asm code, the hipified version doesn't
   # compile yet, so we don't have top_k operator for now
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py")
-
-  # These are fixed in rocm 1.8.2, re-enable them once our CI docker images are upgraded
-  rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/recurrent_net_executor_test.py")
-  rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py")
-  rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/conv_test.py")
-  rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/group_conv_test.py")
 fi
 
 # Python tests

From e39c8043dc97950318ff83eb8502c52f8f4b63d9 Mon Sep 17 00:00:00 2001
From: Adam Paszke <adam.paszke@gmail.com>
Date: Thu, 26 Jul 2018 11:45:41 -0700
Subject: [PATCH 06/10] Make GraphExecutors work on Stacks instead of
 variable_tensor_lists (#9763)

Summary:
This is blocking the IR operator unification, because I need to be able to pass scalars to backward functions.

zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9763

Reviewed By: zou3519

Differential Revision: D8978457

Pulled By: apaszke

fbshipit-source-id: 570b4c3409322459cb0f2592069730a7d586ab20
---
 torch/csrc/jit/argument_spec.h           |  99 +++++----
 torch/csrc/jit/graph_executor.cpp        | 252 +++++++++++++----------
 torch/csrc/jit/graph_executor.h          |   4 +-
 torch/csrc/jit/init.cpp                  |  30 +--
 torch/csrc/jit/interpreter.cpp           |   9 +-
 torch/csrc/jit/ivalue.h                  |   2 +
 torch/csrc/jit/passes/shape_analysis.cpp |   4 +-
 torch/csrc/jit/pybind_utils.h            |  27 ++-
 torch/csrc/jit/python_ir.cpp             |   2 +-
 torch/csrc/jit/script/init.cpp           |  10 +-
 torch/csrc/jit/script/module.h           |  26 +--
 torch/csrc/jit/stack.h                   |   4 +-
 torch/csrc/jit/test_jit.cpp              |  30 +--
 torch/csrc/jit/tracer.cpp                |   8 +-
 14 files changed, 274 insertions(+), 233 deletions(-)

diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h
index a3e3c0f40f48c..69b5036766e99 100644
--- a/torch/csrc/jit/argument_spec.h
+++ b/torch/csrc/jit/argument_spec.h
@@ -4,6 +4,7 @@
 #include <vector>
 #include "torch/csrc/autograd/variable.h"
 #include "torch/csrc/utils/hash.h"
+#include "torch/csrc/jit/stack.h"
 #include "torch/csrc/jit/variable_tensor_list.h"
 
 namespace torch { namespace jit {
@@ -16,14 +17,15 @@ namespace torch { namespace jit {
 // since it is used along the hot-path of the JIT to check if the code
 // we have created is valid for the given inputs.
 
-// TensorInfoPOD is only used internally in ArgumentSpec
-// API users should use TensorInfo
-struct TensorInfoPOD {
+// ArgumentInfoPOD is only used internally in ArgumentSpec
+// API users should use ArgumentInfo
+struct ArgumentInfoPOD {
   // total size is 64-bit
-  unsigned type : 8;
+  unsigned is_tensor : 8; // all other fields are invalid if this is false
+  unsigned type : 8; // scalar type
   unsigned defined : 1;
   unsigned requires_grad : 1;
-  signed device : 22;
+  signed device : 14;
   uint32_t total_dims; // all TensorInfoPODs are in ArgumentSpec's tensor_info() array.
                        // total_dims is the total number of dimensions seen so far
                        // in all previous members of tensor_info(), including this tensor
@@ -32,34 +34,38 @@ struct TensorInfoPOD {
                        // for tensor 0, the offset is always 0
 };
 
-static_assert(sizeof(TensorInfoPOD) == sizeof(int64_t),
-  "TensorInfoPOD must be 64-bit struct for ArgumentSpec encoding to work");
+static_assert(sizeof(ArgumentInfoPOD) == sizeof(int64_t),
+  "ArgumentInfoPOD must be 64-bit struct for ArgumentSpec encoding to work");
 
-struct TensorInfo;
+struct ArgumentInfo;
 
 struct ArgumentSpec {
-  // note: tensors must always be variables
-  ArgumentSpec(bool with_grad, const variable_tensor_list & tensors)
-  :  hash_code(0), ntensors(tensors.size()) {
-    int all_dims = 0;
-    for(size_t i = 0; i < ntensors; i++) {
-      all_dims += tensors[i].defined() ? tensors[i].ndimension() : 0;
+  ArgumentSpec(bool with_grad, at::ArrayRef<IValue> inputs)
+  :  hash_code(0), ninputs(inputs.size()) {
+    int32_t all_dims = 0;
+    const int32_t num_inputs = inputs.size();
+    for (int32_t i = 0; i < num_inputs; i++) {
+      if (!inputs[i].isTensor()) continue;
+      auto tensor = inputs[i].toTensor();
+      all_dims += tensor.defined() ? tensor.ndimension() : 0;
     }
     // allocate enough room for all TensorPODs and dimensions
-    data.resize(ntensors + all_dims*2);
+    data.resize(ninputs + all_dims*2);
 
     // and reinterpret our data array as these structs
-    TensorInfoPOD * pods = reinterpret_cast<TensorInfoPOD*>(data.data());
+    ArgumentInfoPOD * pods = reinterpret_cast<ArgumentInfoPOD*>(data.data());
     int64_t * next_dim = sizes_strides();
-    int total_dims = 0;
-    for(size_t i = 0; i < ntensors; i++) {
-      const auto & t = tensors[i];
+    int32_t total_dims = 0;
+    for(int32_t i = 0; i < num_inputs; i++) {
       auto & pod = pods[i];
+      pod.is_tensor = static_cast<uint32_t>(inputs[i].isTensor());
+      if (!pod.is_tensor) continue;
+      at::Tensor t = inputs[i].toTensor();
       pod.defined = t.defined();
-      if(t.defined()) {
-        pod.type = static_cast<unsigned int>(t.type().scalarType());
+      if (pod.defined) {
+        pod.type = static_cast<int>(t.type().scalarType());
         pod.device = (!t.type().is_cuda()) ? -1 : t.get_device();
-        pod.requires_grad = with_grad && static_cast<const autograd::Variable&>(t).requires_grad();
+        pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad();
         total_dims += t.ndimension();
         auto sizes = t.sizes();
         std::copy(sizes.begin(),sizes.end(), next_dim);
@@ -73,51 +79,54 @@ struct ArgumentSpec {
     }
     // we precompute the hash_code to minimize the time inside of hash
     // table operations where we may need to hold a compiler cache lock.
-    hash_code = hash_combine(0, ntensors);
+    hash_code = hash_combine(0, ninputs);
     for(auto d : data) {
       hash_code = hash_combine(hash_code, d);
     }
   }
 
-  // equality is fast: check ntensors, and then check the raw array data,
+  // equality is fast: check ninputs, and then check the raw array data,
   // there are no size/stride indirections
   bool operator==(const ArgumentSpec & spec) const {
-    return ntensors == spec.ntensors && data == spec.data;
+    return ninputs == spec.ninputs && data == spec.data;
   }
   bool operator!=(const ArgumentSpec & spec) const {
     return !(*this == spec);
   }
-  friend struct TensorInfo;
-  TensorInfo tensorInfo(size_t i) const;
+  friend struct ArgumentInfo;
+  ArgumentInfo at(size_t i) const;
   size_t size() const {
-    return ntensors;
+    return ninputs;
   }
   size_t hashCode() const {
     return hash_code;
   }
 
 private:
-  ArrayRef<TensorInfoPOD> tensor_info() const {
-    return ArrayRef<TensorInfoPOD>(reinterpret_cast<const TensorInfoPOD*>(data.data()), ntensors);
+  ArrayRef<ArgumentInfoPOD> tensor_info() const {
+    return ArrayRef<ArgumentInfoPOD>(reinterpret_cast<const ArgumentInfoPOD*>(data.data()), ninputs);
   }
-  // the start of the sizes_strides information, which comes after the TensorInfoPOD list.
+  // the start of the sizes_strides information, which comes after the ArgumentInfoPOD list.
   const int64_t* sizes_strides() const {
-    return data.data() + ntensors;
+    return data.data() + ninputs;
   }
   int64_t* sizes_strides() {
-    return data.data() + ntensors;
+    return data.data() + ninputs;
   }
   size_t hash_code; // precomputed on construction
-  uint32_t ntensors;
-  // layout is ntensors of TensorPOD (each 64-bit) followed by their size and stride info
+  int32_t ninputs;
+  // layout is ninputs of TensorPOD (each 64-bit) followed by their size and stride info
   // for 3 tensors: [t0POD][t1POD][t2POD][t0 sizes][t0 strides][t1 sizes][t1 strides][t2 sizes][t2 strides]
   std::vector<int64_t> data;
 };
 
-// public view of compressed TensorInfo
-struct TensorInfo {
-  TensorInfo(const ArgumentSpec & spec, const int i)
+// public view of compressed ArgumentInfo
+struct ArgumentInfo {
+  ArgumentInfo(const ArgumentSpec & spec, const int i)
   : spec(spec), i(i) {}
+  bool isTensor() const {
+    return pod(i).is_tensor;
+  }
   at::ScalarType type() const {
     return at::ScalarType(pod(i).type);
   }
@@ -148,20 +157,20 @@ struct TensorInfo {
   }
 private:
   // offsetinto sizes_strides() array where the sizes start for tensor j
-  // [valid range] valid range is [0, ntensors]
-  // (i.e. you can ask for the offset at ntensors, which would be the offset of the next tensor if it existed)
+  // [valid range] valid range is [0, ninputs]
+  // (i.e. you can ask for the offset at ninputs, which would be the offset of the next tensor if it existed)
   int sizes_strides_offset(int j) const {
     if(j == 0) return 0;
     return 2*pod(j - 1).total_dims;
   }
-  const TensorInfoPOD & pod(int j) const {
+  const ArgumentInfoPOD & pod(int j) const {
     return spec.tensor_info().at(j);
   }
   const ArgumentSpec & spec;
   const int i;
 };
 
-inline std::ostream & operator<<(std::ostream & out, const TensorInfo & info) {
+inline std::ostream & operator<<(std::ostream & out, const ArgumentInfo & info) {
   if(!info.defined()) {
     return out << "<undefined>";
   }
@@ -178,14 +187,14 @@ inline std::ostream& operator<<(std::ostream & out, const ArgumentSpec & spec) {
   for(size_t i = 0; i < spec.size(); ++i) {
     if (i > 0)
       out << ", ";
-    out << spec.tensorInfo(i);
+    out << spec.at(i);
   }
   out << "}";
   return out;
 }
 
-inline TensorInfo ArgumentSpec::tensorInfo(size_t i) const {
-  return TensorInfo(*this, i);
+inline ArgumentInfo ArgumentSpec::at(size_t i) const {
+  return ArgumentInfo(*this, i);
 }
 
 }}
diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp
index 0324d1f3e44b8..2c595ffd679c2 100644
--- a/torch/csrc/jit/graph_executor.cpp
+++ b/torch/csrc/jit/graph_executor.cpp
@@ -34,6 +34,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+#include <iterator>
 
 namespace torch { namespace jit {
 
@@ -51,38 +52,51 @@ using autograd::variable_list;
 struct ExecutionPlanAutogradFunction : public autograd::Function {
   ExecutionPlanAutogradFunction(GraphExecutor graph, size_t capture_size)
   : graph(std::move(graph)) {
-    captures.reserve(capture_size);
+    is_var_capture.reserve(capture_size);
+    var_captures.reserve(capture_size);
+    ivalue_captures.reserve(capture_size);
   }
+
   virtual variable_list apply(variable_list&& inputs) override {
-    // TODO: expensive copies here to convert to/from tensor_list
-    // TODO: because inputs is passed by const reference there is no
-    // way to release tensors incrementally as this runs
-    variable_tensor_list all_inputs;
-    all_inputs.reserve(captures.size() + inputs.size());
-    all_inputs.insert(all_inputs.end(), inputs.begin(), inputs.end());
-    for(auto & sv : captures) {
-      all_inputs.push_back(sv.unpack(this->shared_from_this()));
+    Stack stack;
+    stack.reserve(is_var_capture.size() + inputs.size());
+    stack.insert(stack.end(), std::make_move_iterator(inputs.begin()),
+                              std::make_move_iterator(inputs.end()));
+    auto var_capture_it = var_captures.begin();
+    auto ivalue_capture_it = ivalue_captures.begin();
+    for (bool is_var : is_var_capture) {
+      if (is_var) {
+        stack.push_back(var_capture_it->unpack(this->shared_from_this()));
+        ++var_capture_it;
+      } else {
+        stack.push_back(*ivalue_capture_it);
+        ++ivalue_capture_it;
+      }
+    }
+    graph.run(stack);
+    return fmap(stack, [](IValue & val) {
+      return autograd::Variable(std::move(val).toTensor());
+    });
+  }
+
+  void capture(const IValue & val) {
+    const bool is_tensor = val.isTensor();
+    is_var_capture.push_back(is_tensor);
+    if (is_tensor) {
+      var_captures.emplace_back(Variable(val.toTensor()), false);
+    } else {
+      ivalue_captures.push_back(val);
     }
-    auto tensors = graph.run(std::move(all_inputs));
-    // TODO: another copy that needs to be removed
-    return autograd::variable_list(tensors.begin(), tensors.end());
   }
 private:
   friend struct ExecutionPlan;
   GraphExecutor graph;
-  std::vector<autograd::SavedVariable> captures;
-};
-
 
-// helper to run interpreter on variables until we switch
-// everything to IValue
-inline variable_tensor_list runOneStage(const Code & code, variable_tensor_list inputs) {
-  std::vector<IValue> stack(inputs.begin(), inputs.end());
-  InterpreterState(code).runOneStage(stack);
-  return variable_tensor_list(fmap(stack, [](IValue& v) {
-    return std::move(v).toTensor();
-  }));
-}
+  // INVARIANT: is_var_capture.size() == var_captures.size() + ivalue_captures.size()
+  std::vector<bool> is_var_capture;
+  std::vector<autograd::SavedVariable> var_captures;
+  std::vector<IValue> ivalue_captures;
+};
 
 // an optimized way of executing the subgraph computed directly on
 // tensors rather than Variables.
@@ -91,19 +105,25 @@ inline variable_tensor_list runOneStage(const Code & code, variable_tensor_list
 // to the output Variables if present.
 struct ExecutionPlan {
   ExecutionPlan(std::shared_ptr<Graph>& graph)
-      : f(graph), graph(graph) {}
+      : f(graph),
+        graph(graph),
+        num_inputs(graph->inputs().size()),
+        num_outputs(graph->outputs().size()) {}
   ExecutionPlan(std::shared_ptr<Graph>& graph, Gradient grad)
       : f(graph),
         graph(graph),
         grad(std::move(grad)),
-        grad_executor(this->grad.df) {}
+        grad_executor(this->grad.df),
+        num_inputs(graph->inputs().size()),
+        num_outputs(graph->outputs().size()) {}
 
-  variable_tensor_list run(variable_tensor_list&& stack) const {
-    if(grad) {
-      return runWithGrad(std::move(stack));
+  void run(Stack & stack) const {
+    if (grad) {
+      return runWithGrad(stack);
     }
-    return runOneStage(f, std::move(stack));
+    InterpreterState(f).runOneStage(stack);
   }
+
   std::shared_ptr<Graph> get_graph() const {
     return graph;
   }
@@ -124,70 +144,73 @@ struct ExecutionPlan {
   }
 
 private:
-  // note: should be inplace to avoid allocations, but we have to switch from
-  // a list of tensor to a list of ivalues
-  std::vector<IValue> unwrapVariables(variable_tensor_list && list) const {
-    return fmap(list, [](const Variable& v) -> IValue {
-      return v.defined() ? autograd::as_variable_ref(v).detach() : at::Tensor();
-    });
-  }
-  // note: should be inplace to avoid allocations, but we have to switch from
-  // a list of tensor to a list of ivalues
-  variable_tensor_list wrapTensors(tensor_list && list) const {
-    for(auto & v : list) {
-      v = autograd::make_variable(v, /*requires_grad=*/false);
+  void detachVariables(Stack & stack) const {
+    // It would be nice to use an ArrayRef here, but unfortunately those can only
+    // return const references, so we need to do a bunch of indexing ourselves.
+    const int64_t stack_size = stack.size();
+    const int64_t stack_offset = stack_size - num_inputs;
+    for (int64_t i = stack_offset; i < stack_size; ++i) {
+      auto & v = stack[i];
+      if (!v.isTensor()) continue;
+      auto t = std::move(v).toTensor();
+      v = IValue{t.defined() ? autograd::as_variable_ref(t).detach() : std::move(t)};
     }
-    return variable_tensor_list(std::move(list));
   }
   // Capture (save) inputs that would be required to subsequently run backwards
-  void captureInputs(ExecutionPlanAutogradFunction & grad_fn, variable_tensor_list & inputs) const {
-    for(auto offset : grad.df_input_captured_inputs) {
-      grad_fn.captures.emplace_back(autograd::as_variable_ref(inputs[offset]), false);
+  void captureInputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef<IValue> inputs) const {
+    for (size_t offset : grad.df_input_captured_inputs) {
+      grad_fn.capture(inputs[offset]);
     }
   }
-  void captureOutputs(ExecutionPlanAutogradFunction & grad_fn, variable_tensor_list & outputs) const {
-    for(auto offset : grad.df_input_captured_outputs) {
-      grad_fn.captures.emplace_back(autograd::as_variable_ref(outputs[offset]), true);
+  void captureOutputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef<IValue> outputs) const {
+    for (size_t offset : grad.df_input_captured_outputs) {
+      grad_fn.capture(outputs[offset]);
     }
   }
 
-  variable_tensor_list runWithGrad(variable_tensor_list&& inputs) const {
+  // XXX: keep in mind that stack can be larger than the inputs we need!
+  void runWithGrad(Stack & stack) const {
     auto grad_fn = std::make_shared<ExecutionPlanAutogradFunction>(grad_executor,
       grad.df_input_captured_inputs.size() + grad.df_input_captured_outputs.size());
-    // hook up the outputs of df to the gradient functions of the inputs that require
-    // gradients
-    for(auto idx : grad.df_output_vjps) {
-      auto & v = autograd::as_variable_ref(inputs[idx]);
-      grad_fn->add_next_edge(v.gradient_edge());
+
+    {
+      auto inputs = last(stack, num_inputs);
+      // hook up the outputs of df to the gradient functions of the inputs that require gradients
+      for(auto idx : grad.df_output_vjps) {
+        auto v = Variable(inputs[idx].toTensor());
+        grad_fn->add_next_edge(v.gradient_edge());
+      }
+      captureInputs(*grad_fn, inputs);
     }
-    captureInputs(*grad_fn, inputs);
 
-    auto stack = unwrapVariables(std::move(inputs));
+    detachVariables(stack);
     InterpreterState(f).runOneStage(stack);
-    variable_tensor_list outputs(
-        fmap(stack, [](IValue& v) { return std::move(v).toTensor(); }));
-
-    // hookup the gradients for the output tensors that require gradients
-    // to the inputs to our gradient function df
-    // TODO - XXX - if any output is the same tensor multiple times, views have to be
-    // setup here. We need to refactor autograd until it is safe for
-    // tensors to be constructed without all the viewing infrastructure.
-    // this is currently intentionally not done here so we can get an idea of our
-    // perf before introducing overhead for correctness
-    for(auto idx : grad.df_input_vjps) {
-      // Note: we have to set this up in place, or we have to throw away and
-      // reallocate variables that were already created in wrapTensors. We
-      // should add an API for this.
-      auto& output = autograd::as_variable_ref(outputs[idx]);
-      autograd::create_gradient_edge(output, grad_fn);
-      output.set_requires_grad(true);
+
+    {
+      auto outputs = last(stack, num_outputs);
+      // hookup the gradients for the output tensors that require gradients
+      // to the inputs to our gradient function df
+      // TODO - XXX - if any output is the same tensor multiple times, views have to be
+      // setup here. We need to refactor autograd until it is safe for
+      // tensors to be constructed without all the viewing infrastructure.
+      // this is currently intentionally not done here so we can get an idea of our
+      // perf before introducing overhead for correctness
+      for(auto idx : grad.df_input_vjps) {
+        // Note: we have to set this up in place, or we have to throw away and
+        // reallocate variables that were already created in wrapTensors. We
+        // should add an API for this.
+        Variable output = outputs[idx].toTensor();
+        autograd::create_gradient_edge(output, grad_fn);
+        output.set_requires_grad(true);
+      }
+      captureOutputs(*grad_fn, outputs);
+      // drop the temporary outputs so that we return the same number of
+      // outputs as if we were not also calculating gradient
+      const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs;
+      stack.erase(stack.end() - num_temporary_outputs, stack.end());
     }
-    captureOutputs(*grad_fn, outputs);
-    // drop the temporary outputs so that we return the same number of
-    // outputs as if we were not also calculating gradient
-    outputs.erase(outputs.begin() + grad.f_real_outputs, outputs.end());
-    return outputs;
   }
+
   Code f;
   // optimized graph for debugging and testing
   std::shared_ptr<Graph> graph;
@@ -195,6 +218,9 @@ struct ExecutionPlan {
   Gradient grad; // if(grad) is false when this is unused
   // executor for df, including code caches
   GraphExecutor grad_executor;
+
+  const size_t num_inputs;
+  const size_t num_outputs;
 };
 
 } // anonymous namespace
@@ -210,6 +236,7 @@ struct GraphExecutorImpl {
   : graph(std::move(graph))
   , optimize(optimize)
   , num_inputs(this->graph->inputs().size())
+  , num_outputs(this->graph->outputs().size())
   , symbolically_differentiable(symbolically_differentiable)
   , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {}
   GraphExecutorImpl(std::shared_ptr<Graph> graph, bool optimize)
@@ -223,34 +250,36 @@ struct GraphExecutorImpl {
   }
 
   // entry point where execution begins
-  variable_tensor_list run(variable_tensor_list inputs) {
-    if(inputs.size() != num_inputs) {
+  void run(Stack & stack) {
+    if(stack.size() < num_inputs) {
       std::stringstream ss;
-      ss << "expected " << num_inputs << " inputs but got " << inputs.size() << " inputs";
+      ss << "expected " << num_inputs << " inputs but got " << stack.size() << " inputs";
       throw std::runtime_error(ss.str());
     }
+    auto inputs = last(stack, num_inputs);
 
     // the tracer has called a graph executor
     // there is no need to optimize, but we do need to splice the graph of
     // this excutor into the trace. Otherwise we might unroll control-flow
     // operations.
     if(tracer::isTracing()) {
-      return runTraced(std::move(inputs));
+      return runTraced(stack);
     }
 
     // this is the fallback pathway, when we cannot differentiate
     if(!optimize || (!symbolically_differentiable && needsGradient(inputs))) {
-      return runFallback(std::move(inputs));
+      return runFallback(stack);
     }
 
     // either we can symbolically differentiate, or we do not need a gradient.
     // go down the route where we treat the inputs as tensors
     // and fully optimize
     auto & implementation = getOrCompile(inputs);
-    return implementation.run(std::move(inputs));
+    return implementation.run(stack);
   }
 
-  std::shared_ptr<Graph> graphFor(const variable_tensor_list& inputs) const {
+  std::shared_ptr<Graph> graphFor(const Stack& stack) const {
+    auto inputs = last(stack, num_inputs);
     ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs);
 
     if (!optimize || (!symbolically_differentiable && needsGradient(inputs))) {
@@ -282,12 +311,15 @@ struct GraphExecutorImpl {
 private:
   friend struct GraphExecutor;
 
-  variable_tensor_list runTraced(variable_tensor_list inputs) {
+  void runTraced(Stack & stack) {
     auto state = tracer::getTracingState();
-    auto input_values = fmap(inputs, tracer::getValueTrace);
+    auto inputs = last(stack, num_inputs);
+    auto input_values = fmap(inputs, [](const IValue & v) {
+      return tracer::getValueTrace(v.toTensor());
+    });
 
     ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs);
-    auto outputs = runFallback(std::move(inputs));
+    runFallback(stack);
 
     auto all_dynamic = [](const at::ArrayRef<Value*> xs) {
       for(Value* x : xs) {
@@ -308,15 +340,18 @@ struct GraphExecutorImpl {
     }
     auto output_values = script::inlineCallTo(*state->graph, *local_graph, input_values);
 
-    for(size_t i = 0; i < outputs.size(); ++i) {
-      tracer::setValueTrace(outputs[i], output_values[i]);
+    auto outputs = last(stack, num_outputs);
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      // We can't attach tracing states to scalars, so we have to skip them here
+      // TODO: Should we reinterpret them as scalar tensors instead?
+      if (!outputs[i].isTensor()) continue;
+      tracer::setValueTrace(outputs[i].toTensor(), output_values[i]);
     }
-    return outputs;
   }
 
-  variable_tensor_list runFallback(variable_tensor_list inputs) {
+  void runFallback(Stack & stack) {
     auto & fb = getOrCreateAutogradFallback();
-    return runOneStage(fb, std::move(inputs));
+    InterpreterState(fb).runOneStage(stack);
   }
 
   static bool calcMayIntroduceGradient(Block* b) {
@@ -330,14 +365,16 @@ struct GraphExecutorImpl {
     }
     return false;
   }
-  bool needsGradient(const variable_tensor_list & inputs) const {
+  bool needsGradient(at::ArrayRef<IValue> inputs) const {
     if (!autograd::GradMode::is_enabled()) {
       return false;
     }
-    if(may_introduce_gradient)
+    if (may_introduce_gradient)
       return true;
-    for (const auto & tensor : inputs) {
-      if(tensor.defined() && static_cast<const Variable&>(tensor).requires_grad())
+    for (const IValue & value : inputs) {
+      if (!value.isTensor()) continue;
+      auto t = value.toTensor();
+      if (t.defined() && autograd::as_variable_ref(t).requires_grad())
         return true;
     }
     return false;
@@ -359,7 +396,7 @@ struct GraphExecutorImpl {
     autograd_fallback = Code(graph_);
     return autograd_fallback;
   }
-  const ExecutionPlan & getOrCompile(const variable_tensor_list & inputs) {
+  const ExecutionPlan & getOrCompile(at::ArrayRef<IValue> inputs) {
     // outside lock guard, to minimize the time holding the lock on the fast path
     // ArgumentSpec even computes its hashCode here.
     ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs);
@@ -376,7 +413,7 @@ struct GraphExecutorImpl {
 
   bool argumentSpecRequiresGradient(const ArgumentSpec & spec) {
     for(size_t i = 0; i < spec.size(); ++i) {
-      if(spec.tensorInfo(i).requires_grad())
+      if(spec.at(i).requires_grad())
         return true;
     }
     return false;
@@ -396,7 +433,7 @@ struct GraphExecutorImpl {
     std::vector<bool> requires_grads;
     requires_grads.reserve(spec.size());
     for(size_t i = 0; i < spec.size(); i++)
-      requires_grads.push_back(spec.tensorInfo(i).requires_grad());
+      requires_grads.push_back(spec.at(i).requires_grad());
 
     Gradient gradient = differentiate(graph_, requires_grads);
     graph_ = gradient.f;
@@ -410,8 +447,9 @@ struct GraphExecutorImpl {
   // true - do everything we can to make this graph run fast
   // false - do not modifiy the graph at all and just use the interpreter
   // to run the graph. Useful for debugging correctness issues in the implementation
-  bool optimize;
-  size_t num_inputs;
+  const bool optimize;
+  const size_t num_inputs;
+  const size_t num_outputs;
 
   // GraphExecutor optimizes more aggresively when we _know_ the graph will be
   // symbolically differentiable.
@@ -450,15 +488,15 @@ GraphExecutor::GraphExecutor(std::shared_ptr<Graph> graph, bool optimize)
 GraphExecutor::GraphExecutor(std::shared_ptr<Graph> graph, bool optimize, bool symbolically_differentiable)
 : pImpl(new GraphExecutorImpl(std::move(graph), optimize, symbolically_differentiable)) {}
 
-variable_tensor_list GraphExecutor::run(variable_tensor_list && inputs) {
-  return pImpl->run(std::move(inputs));
+void GraphExecutor::run(Stack & inputs) {
+  return pImpl->run(inputs);
 }
 
 std::shared_ptr<Graph> GraphExecutor::graph() const {
   return pImpl->graph;
 }
 
-std::shared_ptr<Graph> GraphExecutor::graphFor(const variable_tensor_list& inputs) const {
+std::shared_ptr<Graph> GraphExecutor::graphFor(const Stack& inputs) const {
   return pImpl->graphFor(inputs);
 }
 
@@ -481,7 +519,7 @@ void specializeToSpec(const std::shared_ptr<Graph>& graph_, const ArgumentSpec&
   // this must be first because later passes do not know what GradOfs are
   std::vector<bool> defined;
   for(size_t i = 0; i < spec.size(); ++i) {
-    defined.push_back(spec.tensorInfo(i).defined());
+    defined.push_back(spec.at(i).defined());
   }
   specializeUndef(*graph_, defined);
 
diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h
index d78076ab6484f..4e862c9e0a1e4 100644
--- a/torch/csrc/jit/graph_executor.h
+++ b/torch/csrc/jit/graph_executor.h
@@ -38,12 +38,12 @@ struct TORCH_API GraphExecutor {
   GraphExecutor(std::shared_ptr<Graph> graph, bool optimize = true);
   // note: if not specified, symbolically_differentiable is computed from the graph.
   GraphExecutor(std::shared_ptr<Graph> graph, bool optimize, bool symbolically_differentiable);
-  variable_tensor_list run(variable_tensor_list && inputs);
+  void run(Stack & inputs);
   explicit operator bool() const {
     return pImpl != nullptr;
   }
   std::shared_ptr<Graph> graph() const;
-  std::shared_ptr<Graph> graphFor(const variable_tensor_list& inputs) const;
+  std::shared_ptr<Graph> graphFor(const Stack& inputs) const;
   GraphExecutorState getDebugState();
 private:
   std::shared_ptr<GraphExecutorImpl> pImpl;
diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp
index a4bfdc2a5b843..908404a43b649 100644
--- a/torch/csrc/jit/init.cpp
+++ b/torch/csrc/jit/init.cpp
@@ -57,18 +57,19 @@ void initJITBindings(PyObject *module) {
    .def("_jit_pass_onnx", ToONNX)
    .def("_jit_pass_onnx_peephole", PeepholeOptimizeONNX)
    .def("_jit_pass_fuse", FuseGraph)
-   .def("_jit_pass_dce", [](std::shared_ptr<Graph>& g){
+   .def("_jit_pass_dce", [](std::shared_ptr<Graph>& g) {
      return EliminateDeadCode(g); // overload resolution
    })
-   .def("_jit_pass_cse", EliminateCommonSubexpression)
+   .def("_jit_pass_cse", [](std::shared_ptr<Graph>& g) {
+     return EliminateCommonSubexpression(g); // overload resolution
+   })
    .def("_jit_pass_peephole", PeepholeOptimize)
    .def("_jit_pass_canonicalize", [](const std::shared_ptr<Graph>& g) {
      return Canonicalize(g);
    })
    .def("_jit_pass_lint", LintGraph)
    .def("_jit_pass_shape_analysis", [](Graph& graph, py::tuple inputs, bool with_grad) {
-     auto tensor_inputs = createVariableTensorList(inputs);
-     PropagateInputShapes(graph, ArgumentSpec(with_grad, tensor_inputs));
+     PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs)));
    })
    .def("_jit_pass_remove_expands", RemoveExpands)
    .def("_jit_pass_erase_number_types", EraseNumberTypes)
@@ -180,28 +181,15 @@ void initJITBindings(PyObject *module) {
         return ge.graph();
       })
       .def("graph_for", [](GraphExecutor& ge, py::args args) {
-        return ge.graphFor(createVariableTensorList(args));
+        return ge.graphFor(createStack(args));
       })
       .def("get_debug_state", [](GraphExecutor& ge) {
         return ge.getDebugState();
       })
       .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object {
-        auto inputs = createVariableTensorList(args);
-        auto outputs = ge.run(std::move(inputs));
-        // if we don't tell pybind these are variables it chokes on the
-        // conversion.
-        // TODO: fix conversions to be sane and make sure this works.
-        if (outputs.size() == 0) {
-          return py::none();
-        } else if (outputs.size() == 1) {
-          return py::cast(autograd::as_variable_ref(outputs[0]));
-        } else {
-          py::tuple tuple(outputs.size());
-          for(size_t i = 0; i < outputs.size(); i++) {
-            tuple[i] = py::cast(autograd::as_variable_ref(outputs[i]));
-          }
-          return tuple;
-        }
+        auto stack = createStack(args);
+        ge.run(stack);
+        return wrapStack(std::move(stack));
       });
 
   initPythonIRBindings(module);
diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp
index 04664f62885e8..cf7dda32413c2 100644
--- a/torch/csrc/jit/interpreter.cpp
+++ b/torch/csrc/jit/interpreter.cpp
@@ -617,16 +617,9 @@ struct CodeImpl {
 
     auto executor = std::make_shared<GraphExecutor>(node->g(attr::Subgraph));
     graph_executors.emplace_back(executor.get());
-    auto num_inputs = node->inputs().size();
     return [=](Stack& stack) mutable {
       autograd::profiler::RecordFunction record("GraphExecutor");
-      auto inputs = last(stack, num_inputs);
-      variable_tensor_list tinputs(
-          fmap(inputs, [](const IValue& v) { return v.toTensor(); }));
-      drop(stack, num_inputs);
-      //TODO: has graph executor work from a stack as well
-      variable_tensor_list toutputs = executor->run(variable_tensor_list(std::move(tinputs)));
-      stack.insert(stack.end(), toutputs.begin(), toutputs.end());
+      executor->run(stack);
       return 0;
     };
   }
diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h
index 81863baac9ce3..42a5be89e55e4 100644
--- a/torch/csrc/jit/ivalue.h
+++ b/torch/csrc/jit/ivalue.h
@@ -4,6 +4,8 @@
 
 #include <ATen/ATen.h>
 
+#include <type_traits>
+
 namespace torch { namespace jit {
 
 // smart pointer to hold onto at::Retainable objects in a generic way
diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp
index adcc566417930..3b18699f94ffc 100644
--- a/torch/csrc/jit/passes/shape_analysis.cpp
+++ b/torch/csrc/jit/passes/shape_analysis.cpp
@@ -455,7 +455,9 @@ void PropagateShapeOnBlock(Block * block, bool insert_expands) {
 void PropagateInputShapes(Graph & graph, const ArgumentSpec & spec) {
   JIT_ASSERT(graph.inputs().size() == spec.size());
   for(size_t i = 0; i < spec.size(); ++i) {
-    graph.inputs()[i]->setType(spec.tensorInfo(i));
+    auto argspec = spec.at(i);
+    if (!argspec.isTensor()) continue;
+    graph.inputs()[i]->setType(argspec);
   }
   PropagateShapeOnBlock(graph.block());
 }
diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h
index 8b7e78a4d5438..415fc311086ac 100644
--- a/torch/csrc/jit/pybind_utils.h
+++ b/torch/csrc/jit/pybind_utils.h
@@ -2,17 +2,10 @@
 
 #include "torch/csrc/utils/pybind.h"
 
-#include "torch/csrc/jit/variable_tensor_list.h"
-
 namespace torch { namespace jit {
 
-namespace {
-
-// we cannot use the default py:cast<autograd::Variable> because it currently
-// unwraps the data tensor in the conversion process
-// TODO: replace with bs type
-variable_tensor_list createVariableTensorList(py::tuple tuple, size_t reserve_extra_space = 0) {
-  variable_tensor_list result;
+inline Stack createStack(const py::tuple& tuple, size_t reserve_extra_space = 0) {
+  Stack result;
   result.reserve(tuple.size() + reserve_extra_space);
   for(auto e : tuple) {
     result.push_back(py::cast<autograd::Variable>(e));
@@ -20,6 +13,20 @@ variable_tensor_list createVariableTensorList(py::tuple tuple, size_t reserve_ex
   return result;
 }
 
-}  // namespace
+inline py::object wrapStack(Stack&& outputs) {
+  if (outputs.size() == 0) {
+    return py::none();
+  } else if (outputs.size() == 1) {
+    JIT_ASSERT(outputs[0].isTensor());
+    return py::cast(autograd::as_variable_ref(std::move(outputs[0]).toTensor()));
+  } else {
+    py::tuple tuple(outputs.size());
+    for(size_t i = 0; i < outputs.size(); i++) {
+      JIT_ASSERT(outputs[i].isTensor());
+      tuple[i] = py::cast(autograd::as_variable_ref(std::move(outputs[i]).toTensor()));
+    }
+    return tuple;
+  }
+}
 
 } }  // namespace torch::jit
diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp
index d4fdb529782a6..c9e41e8a7eee2 100644
--- a/torch/csrc/jit/python_ir.cpp
+++ b/torch/csrc/jit/python_ir.cpp
@@ -145,7 +145,7 @@ void initPythonIRBindings(PyObject * module_) {
       return ss.str();
     })
     .def("propagate_shapes", [](Graph& g, std::vector<at::Tensor> inputs, bool with_grad) {
-      PropagateInputShapes(g, ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs))));
+      PropagateInputShapes(g, ArgumentSpec(with_grad, fmap<IValue>(inputs)));
     })
     .def("export", [](const std::shared_ptr<Graph> g, const std::vector<at::Tensor>& initializers,
                       int64_t onnx_opset_version, bool defer_weight_export,
diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp
index 61100060f7f65..576344427c046 100644
--- a/torch/csrc/jit/script/init.cpp
+++ b/torch/csrc/jit/script/init.cpp
@@ -371,9 +371,9 @@ static void gatherParametersAndBuffers(std::vector<at::Tensor*> & values, const
 }
 
 py::object runMethodFromPython(Method& m, py::args args) {
-  auto inputs = createVariableTensorList(args);
-  auto outputs = m.run(std::move(inputs));
-  return unpackVariableTensorList(std::move(outputs));
+  auto stack = createStack(args);
+  m.run(stack);
+  return wrapStack(std::move(stack));
 }
 
 void initJitScriptBindings(PyObject* module) {
@@ -502,7 +502,7 @@ void initJitScriptBindings(PyObject* module) {
       })
       .def("graph_for", [](Module& self, py::args args) {
         if (self.find_method("forward")) {
-          return self.get_method("forward").graph_for(createVariableTensorList(args));
+          return self.get_method("forward").graph_for(createStack(args));
         }
         throw std::runtime_error("Attempted to call graph_for on a Module without a compiled forward()");
       })
@@ -530,7 +530,7 @@ void initJitScriptBindings(PyObject* module) {
     .def("propagate_and_assign_input_and_output_shapes", &Method::propagate_and_assign_input_and_output_shapes)
     .def("params", &Method::params)
     .def("graph_for", [](Method& self, py::args args) {
-      return self.graph_for(createVariableTensorList(args));
+      return self.graph_for(createStack(args));
     })
     .def("set_arg_and_return_types", [](Method &self, TypedDef &typed_def, bool method) {
       std::vector<Argument> arg_type_args, return_type_args;
diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h
index 90ad6f75d1b38..76518aaf1d26f 100644
--- a/torch/csrc/jit/script/module.h
+++ b/torch/csrc/jit/script/module.h
@@ -54,13 +54,13 @@ struct Method {
     }
   }
 
-  variable_tensor_list run(variable_tensor_list && inputs) {
-    for(auto tp : member_inputs) {
-      inputs.push_back(*tp);
+  void run(Stack & stack) {
+    for(at::Tensor* tp : member_inputs) {
+      stack.push_back(*tp);
     }
-    return get_executor().run(std::move(inputs));
+    get_executor().run(stack);
   }
-  std::shared_ptr<Graph> graph_for(const variable_tensor_list& inputs) {
+  std::shared_ptr<Graph> graph_for(const Stack& inputs) {
     return get_executor().graphFor(inputs);
   }
   std::shared_ptr<Graph> graph() const {
@@ -95,12 +95,15 @@ struct Method {
 
   std::shared_ptr<Graph> propagate_shapes(std::vector<at::Tensor> inputs, bool with_grad=false) {
     auto retval = graph_->copy();
-    for (auto inp : member_inputs) {
-      inputs.push_back(*inp);
+    Stack stack;
+    stack.reserve(inputs.size() + member_inputs.size());
+    for (at::Tensor & i : inputs) {
+      stack.emplace_back(std::move(i));
+    }
+    for (at::Tensor* inp : member_inputs) {
+      stack.push_back(*inp);
     }
-    PropagateInputShapes(
-      *retval,
-      ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs))));
+    PropagateInputShapes(*retval, ArgumentSpec(with_grad, std::move(stack)));
     return retval;
   }
 
@@ -110,8 +113,7 @@ struct Method {
       inputs.push_back(*inp);
     }
     if (propagate) {
-      auto inputs_copy = inputs;
-      PropagateInputShapes(*retval, ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs_copy))));
+      PropagateInputShapes(*retval, ArgumentSpec(with_grad, fmap<IValue>(inputs)));
     }
     JIT_ASSERT(retval->inputs().size() == inputs.size());
     for (size_t i=0; i < retval->inputs().size(); ++i) {
diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h
index 654c87088e02a..2c74ae7e0a4c7 100644
--- a/torch/csrc/jit/stack.h
+++ b/torch/csrc/jit/stack.h
@@ -27,10 +27,10 @@ static inline IValue & peek(Stack & stack, size_t i, size_t N) {
 }
 // treat the last N elements of the stack as a list, looking up the
 // slice starting at index i and having length len
-static inline at::ArrayRef<IValue> peekSlice(Stack & stack, size_t i, size_t len, size_t N) {
+static inline at::ArrayRef<IValue> peekSlice(const Stack & stack, size_t i, size_t len, size_t N) {
   return at::ArrayRef<IValue>(stack).slice(stack.size() - N + i, len);
 }
-static inline at::ArrayRef<IValue> last(Stack & stack, size_t N) {
+static inline at::ArrayRef<IValue> last(const Stack & stack, size_t N) {
   return peekSlice(stack, 0, N, N);
 }
 static inline void drop(Stack & stack, size_t n) {
diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp
index ec889612a1047..ecb8c9b377981 100644
--- a/torch/csrc/jit/test_jit.cpp
+++ b/torch/csrc/jit/test_jit.cpp
@@ -714,7 +714,8 @@ bool isEqual(at::IntList lhs, at::IntList rhs) {
   return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
 }
 
-bool isEqual(const TensorInfo & ti, const autograd::Variable & v) {
+bool isEqual(const ArgumentInfo & ti, const autograd::Variable & v) {
+  REQUIRE(ti.isTensor());
   if(!ti.defined())
     return ti.defined() == v.defined();
   return
@@ -728,8 +729,8 @@ bool isEqual(const TensorInfo & ti, const autograd::Variable & v) {
 // work around the fact that variable_tensor_list doesn't duplicate all
 // of std::vector's constructors.
 // most constructors are never used in the implementation, just in our tests.
-variable_tensor_list createVarList(std::vector<at::Tensor> && list) {
-  return variable_tensor_list(std::move(list));
+Stack createStack(std::vector<at::Tensor> && list) {
+  return Stack(std::make_move_iterator(list.begin()), std::make_move_iterator(list.end()));
 }
 
 void argumentSpecTest() {
@@ -738,14 +739,14 @@ void argumentSpecTest() {
   auto & GF = at::CUDA(at::kFloat);
   auto & GD = at::CUDA(at::kDouble);
 
-  auto list =  createVarList({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()});
+  auto list = createStack({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()});
 
   // make sure we have some non-standard strides
-  list[1].transpose_(0, 1);
+  list[1].toTensor().transpose_(0, 1);
 
   // same list but different backing values
-  auto list2 = createVarList({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()});
-  list2[1].transpose_(0, 1);
+  auto list2 = createStack({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()});
+  list2[1].toTensor().transpose_(0, 1);
 
 
   ArgumentSpec a(true, list);
@@ -758,7 +759,7 @@ void argumentSpecTest() {
   REQUIRE(d.hashCode() == a.hashCode());
 
   for(size_t i = 0; i < list.size(); ++i) {
-    REQUIRE(isEqual(a.tensorInfo(i), list[i]));
+    REQUIRE(isEqual(a.at(i), list[i].toTensor()));
   }
   ArgumentSpec no_grad(/*with_grad=*/false, list);
   REQUIRE(no_grad != a);
@@ -770,7 +771,7 @@ void argumentSpecTest() {
   spec.insert(std::move(no_grad));
   REQUIRE(spec.count(ArgumentSpec(true,list)) == 1);
 
-  list2[1].transpose_(0,1);
+  list2[1].toTensor().transpose_(0,1);
   ArgumentSpec c(true, list2); // same as list, except for one stride
   REQUIRE(!(c == a));
   REQUIRE(spec.count(c) == 0);
@@ -793,7 +794,7 @@ void shapeAnalysisTest() {
   auto w_hh  = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCUDA));
 
   auto g = build_lstm();
-  ArgumentSpec spec(false, createVarList({v(input), v(hx), v(cx), v(w_ih), v(w_hh) }));
+  ArgumentSpec spec(false, createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh) }));
   PropagateInputShapes(*g, spec);
   at::Tensor r0, r1;
   std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh);
@@ -818,14 +819,15 @@ void testGraphExecutor() {
   auto w_ih  = t_def(at::randn({4 * hidden_size, input_size}, at::kCUDA));
   auto w_hh  = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCUDA));
 
-  std::vector<at::Tensor> inputs = {v(input), v(hx), v(cx), v(w_ih), v(w_hh) };
   auto g = build_lstm();
   GraphExecutor executor(g);
-  auto outputs = executor.run(variable_tensor_list(std::move(inputs)));
+  auto stack = createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh)});
+  executor.run(stack);
+  REQUIRE(stack.size() == 2);
   at::Tensor r0, r1;
   std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh);
-  REQUIRE(almostEqual(Variable(outputs[0]).data(), r0));
-  REQUIRE(almostEqual(Variable(outputs[1]).data(), r1));
+  REQUIRE(almostEqual(Variable(stack[0].toTensor()).data(), r0));
+  REQUIRE(almostEqual(Variable(stack[1].toTensor()).data(), r1));
 }
 
 void testBlocks(std::ostream & out) {
diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp
index 0b5d41f4de297..5c998e3fc690b 100644
--- a/torch/csrc/jit/tracer.cpp
+++ b/torch/csrc/jit/tracer.cpp
@@ -45,15 +45,13 @@ PreTraceInfo preRecordTrace(Symbol op,
 
 void postRecordTrace(const PreTraceInfo& info,
                      at::ArrayRef<Variable> outputs) {
-  auto assignOutput = [&info](const Variable & output, Value * value) {
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto & output = outputs[i];
+    Value * value = info.n->addOutput();
     if (output.defined()) {
       value->inferTypeFrom(output.data());
       setValueTrace(output, value);
     }
-  };
-
-  for (size_t i = 0; i < outputs.size(); i++) {
-    assignOutput(outputs[i], info.n->addOutput());
   }
 }
 

From 0c84a5c27e37406190e447565c5b5f69aba3a228 Mon Sep 17 00:00:00 2001
From: Junjie Bai <bai@in.tum.de>
Date: Thu, 26 Jul 2018 11:50:21 -0700
Subject: [PATCH 07/10] Pass shape infos to ONNX -> Caffe2 C++ conversion
 backend (#9870)

Summary:
And let Gemm conversion to inspect the input `C` to try converting to FC.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9870

Reviewed By: houseroad

Differential Revision: D9013198

Pulled By: bddppq

fbshipit-source-id: b4c509cfccca238262e1c406b004e66cef256321
---
 caffe2/onnx/backend.cc                  | 163 ++++++++++++++++--------
 caffe2/onnx/backend.h                   |  79 ++++++++----
 caffe2/python/onnx/backend.py           | 124 +++++++-----------
 caffe2/python/onnx/tests/c2_ref_test.py |  63 +++++++--
 caffe2/python/pybind_state.cc           |  25 +++-
 5 files changed, 281 insertions(+), 173 deletions(-)

diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc
index 6f3986b837d30..f6041932518db 100644
--- a/caffe2/onnx/backend.cc
+++ b/caffe2/onnx/backend.cc
@@ -25,8 +25,6 @@ namespace onnx {
 
 namespace {
 
-constexpr static int kKnownOpsetVersion = 6;
-
 bool AlmostEqual(double a, double b) {
   constexpr static double kEps = 1e-15;
   return (fabs(a - b) < kEps);
@@ -367,17 +365,19 @@ Caffe2Backend::get_special_operators() const {
 
 Caffe2Ops Caffe2Backend::CreateArgMaxMin(
     OnnxNode* onnx_node,
-    int opset_version) {
+    const ConversionContext& ctx) {
   auto& attributes = onnx_node->attributes;
   if (!attributes.HasAttribute("axis")) {
     auto* attr = attributes.AddRewrittenAttribute("axis");
     attr->set_i(0);
   }
-  return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 }
 
-Caffe2Ops Caffe2Backend::CreateCast(OnnxNode* onnx_node, int opset_version) {
-  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+Caffe2Ops Caffe2Backend::CreateCast(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
+  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 
   auto onnx_dtype =
       onnx_node->attributes.get<int64_t>("to", TensorProto::UNDEFINED);
@@ -443,7 +443,7 @@ Caffe2Ops Caffe2Backend::CreateCast(OnnxNode* onnx_node, int opset_version) {
 
 Caffe2Ops Caffe2Backend::CreateConstant(
     OnnxNode* onnx_node,
-    int opset_version) {
+    const ConversionContext& ctx) {
   CAFFE_ENFORCE_EQ(onnx_node->node.output_size(), 1);
 
   Caffe2Ops ret;
@@ -486,7 +486,7 @@ Caffe2Ops Caffe2Backend::CreateConstant(
 //  differently.
 Caffe2Ops Caffe2Backend::CreateConvPoolOpBase(
     OnnxNode* onnx_node,
-    int opset_version) {
+    const ConversionContext& ctx) {
   const auto& node = onnx_node->node;
   auto& attributes = onnx_node->attributes;
   if (node.op_type().find("Global") == 0) {
@@ -512,16 +512,18 @@ Caffe2Ops Caffe2Backend::CreateConvPoolOpBase(
     }
   }
 
-  return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 }
 
-Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) {
+Caffe2Ops Caffe2Backend::CreatePadPool(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
   auto& node = onnx_node->node;
   auto& attributes = onnx_node->attributes;
   Caffe2Ops ret;
   // Pad
   bool padding = false;
-  const std::string pad_name = opset_version < 2 ? "paddings" : "pads";
+  const std::string pad_name = ctx.opset_version() < 2 ? "paddings" : "pads";
   const auto pad_input = dummy_->NewDummyName();
   if (attributes.HasAttribute("count_include_pad") &&
       attributes.HasAttribute(pad_name)) {
@@ -561,7 +563,7 @@ Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) {
     }
   }
   // Pool
-  auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, opset_version);
+  auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, ctx);
   auto* pool_op = c2_ops.ops.Mutable(0);
   if (padding) {
     pool_op->set_input(0, pad_input);
@@ -572,8 +574,10 @@ Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) {
   return ret;
 }
 
-Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) {
-  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+Caffe2Ops Caffe2Backend::CreateReshape(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
+  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
   CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1);
   auto* op = c2_op.ops.Mutable(0);
   op->add_output(dummy_->NewDummyName());
@@ -583,7 +587,7 @@ Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) {
 
 Caffe2Ops Caffe2Backend::CreateReciprocal(
     OnnxNode* onnx_node,
-    int /*opset_version*/) {
+    const ConversionContext& ctx) {
   const auto& node = onnx_node->node;
   if (node.input_size() != 1 || node.output_size() != 1) {
     CAFFE_THROW("Caffe2 Reciprocal should have 1 input and 1 output");
@@ -599,7 +603,9 @@ Caffe2Ops Caffe2Backend::CreateReciprocal(
   return ret;
 }
 
-Caffe2Ops Caffe2Backend::CreateGather(OnnxNode* onnx_node, int opset_version) {
+Caffe2Ops Caffe2Backend::CreateGather(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
   const auto& node = onnx_node->node;
   if (node.input_size() < 2 || node.output_size() < 1) {
     CAFFE_THROW("Caffe2 Gather should have 2 inputs and 1 output");
@@ -629,7 +635,9 @@ Caffe2Ops Caffe2Backend::CreateGather(OnnxNode* onnx_node, int opset_version) {
   return ret;
 }
 
-Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) {
+Caffe2Ops Caffe2Backend::CreateGemm(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
   const auto& node = onnx_node->node;
   if (node.input_size() < 3 || node.output_size() < 1) {
     CAFFE_THROW("Caffe2 Gemm should have 3 inputs and 1 output");
@@ -667,7 +675,22 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) {
   auto trans_a = onnx_node->attributes.get<int64_t>("transA", 0L);
   auto trans_b = onnx_node->attributes.get<int64_t>("transB", 0L);
   auto broadcast = onnx_node->attributes.get<int64_t>("broadcast", 0L);
-  if ((!trans_a) && trans_b && broadcast) {
+
+  bool use_fc = false;
+  if ((!trans_a) && trans_b) {
+    if (broadcast) {
+      use_fc = true;
+    } else {
+      const auto input_c_vi_iter = ctx.value_infos().find(node.input(2));
+      if (input_c_vi_iter != ctx.value_infos().end() &&
+          input_c_vi_iter->second.type().tensor_type().shape().dim_size() ==
+              1) {
+        use_fc = true;
+      }
+    }
+  }
+
+  if (use_fc) {
     auto* c2_op = ret.ops.Add();
     BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output});
   } else {
@@ -683,7 +706,7 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) {
     BuildOperator(
         c2_op, "MatMul", {input_a, input_b}, {ab}, {arg_trans_a, arg_trans_b});
     c2_op = ret.ops.Add();
-    if (opset_version >= 7) {
+    if (ctx.opset_version() >= 7) {
       BuildOperator(c2_op, "Add", {ab, input_c}, {output});
     } else {
       caffe2::Argument arg_broadcast;
@@ -696,10 +719,12 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) {
   return ret;
 }
 
-Caffe2Ops Caffe2Backend::CreatePad(OnnxNode* onnx_node, int opset_version) {
+Caffe2Ops Caffe2Backend::CreatePad(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
   auto& attributes = onnx_node->attributes;
   ::google::protobuf::RepeatedField<::google::protobuf::int64> pads;
-  std::string pad_name = opset_version < 2 ? "paddings" : "pads";
+  std::string pad_name = ctx.opset_version() < 2 ? "paddings" : "pads";
   pads = attributes
              .get<::google::protobuf::RepeatedField<::google::protobuf::int64>>(
                  pad_name);
@@ -734,14 +759,16 @@ Caffe2Ops Caffe2Backend::CreatePad(OnnxNode* onnx_node, int opset_version) {
   attr->add_ints(pads.Get(6));
   attr->add_ints(pads.Get(7));
 
-  return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 }
 
 // TODO: Caffe2 Concat has an extra output. It should be only
 // used when doing training, so we should change Caffe2 to allow
 // 1 output.
-Caffe2Ops Caffe2Backend::CreateConcat(OnnxNode* onnx_node, int opset_version) {
-  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+Caffe2Ops Caffe2Backend::CreateConcat(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
+  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
   CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1);
   auto* op = c2_op.ops.Mutable(0);
   op->add_output(dummy_->NewDummyName());
@@ -751,7 +778,7 @@ Caffe2Ops Caffe2Backend::CreateConcat(OnnxNode* onnx_node, int opset_version) {
 
 Caffe2Ops Caffe2Backend::CreateLogSoftmax(
     OnnxNode* onnx_node,
-    int opset_version) {
+    const ConversionContext& ctx) {
   const auto& node = onnx_node->node;
   if (node.input_size() < 1 || node.output_size() < 1) {
     CAFFE_THROW("LogSoftmax should have 1 input and 1 output");
@@ -771,8 +798,10 @@ Caffe2Ops Caffe2Backend::CreateLogSoftmax(
   return ret;
 }
 
-Caffe2Ops Caffe2Backend::CreateSlice(OnnxNode* onnx_node, int opset_version) {
-  auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+Caffe2Ops Caffe2Backend::CreateSlice(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
+  auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
   CAFFE_ENFORCE_EQ(op_tmp.ops.size(), 1);
   auto* op = op_tmp.ops.Mutable(0);
   std::unordered_map<std::string, caffe2::Argument*> args;
@@ -922,40 +951,42 @@ Caffe2Ops Caffe2Backend::CreateSlice(OnnxNode* onnx_node, int opset_version) {
 
 Caffe2Ops Caffe2Backend::CreateBatchNormalization(
     OnnxNode* onnx_node,
-    int opset_version) {
-  if (opset_version < 6) {
+    const ConversionContext& ctx) {
+  if (ctx.opset_version() < 6) {
     auto& attributes = onnx_node->attributes;
     attributes.remove("consumed_inputs");
   }
 
-  if (opset_version >= 7) {
+  if (ctx.opset_version() >= 7) {
     auto& attributes = onnx_node->attributes;
     auto* attr = attributes.AddRewrittenAttribute("is_test");
     attr->set_i(1);
   }
 
-  return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 }
 
 Caffe2Ops Caffe2Backend::CreateSplit(
     OnnxNode* onnx_node,
-    int opset_version) {
+    const ConversionContext& ctx) {
   auto& attributes = onnx_node->attributes;
   if (!attributes.HasAttribute("axis")) {
     auto* attr = attributes.AddRewrittenAttribute("axis");
     attr->set_i(0);
   }
 
-  return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 }
 
-Caffe2Ops Caffe2Backend::CreateMatMul(OnnxNode* onnx_node, int opset_version) {
+Caffe2Ops Caffe2Backend::CreateMatMul(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
   const auto& node = onnx_node->node;
   if (node.input_size() != 2) {
     CAFFE_THROW("MatMul should have 2 inputs");
   }
 
-  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
   CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1);
   auto* op = c2_op.ops.Mutable(0);
   auto* broadcast_arg = op->add_arg();
@@ -965,10 +996,12 @@ Caffe2Ops Caffe2Backend::CreateMatMul(OnnxNode* onnx_node, int opset_version) {
   return c2_op;
 }
 
-Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version) {
+Caffe2Ops Caffe2Backend::CreateUpsample(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
   auto& attributes = onnx_node->attributes;
   attributes.remove("mode");
-  if (opset_version >= 7) {
+  if (ctx.opset_version() >= 7) {
     const auto& scales = attributes.get<::google::protobuf::RepeatedField<float>>("scales");
     if (scales.size() != 4) {
       CAFFE_THROW("The scales argument should have size 4");
@@ -976,7 +1009,7 @@ Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version)
       CAFFE_THROW("The first two elements in the scales argument must be 1");
     }
     attributes.remove("scales");
-    auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+    auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
     auto* op = c2_op.ops.Mutable(0);
     auto* c2_height = op->add_arg();
     c2_height->set_name("height_scale");
@@ -986,21 +1019,25 @@ Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version)
     c2_width->set_f(scales.Get(3));
     return c2_op;
   }
-  return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 }
 
-Caffe2Ops Caffe2Backend::CreateDropout(OnnxNode* onnx_node, int opset_version) {
-  if (opset_version >= 7) {
+Caffe2Ops Caffe2Backend::CreateDropout(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
+  if (ctx.opset_version() >= 7) {
     auto& attributes = onnx_node->attributes;
     auto* attr = attributes.AddRewrittenAttribute("is_test");
     attr->set_i(1);
   }
 
-  return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+  return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
 }
 
-Caffe2Ops Caffe2Backend::CreateLRN(OnnxNode* onnx_node, int opset_version) {
-  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+Caffe2Ops Caffe2Backend::CreateLRN(
+    OnnxNode* onnx_node,
+    const ConversionContext& ctx) {
+  auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
   const auto& attributes = onnx_node->attributes;
   if (!attributes.HasAttribute("alpha")) {
       auto* arg = c2_op.ops.Mutable(0)->add_arg();
@@ -1052,7 +1089,7 @@ Caffe2Backend::AllNamesInGraph(const GraphProto &graph) {
 //  and then fixing things up further.
 Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops(
     OnnxNode* onnx_node,
-    int opset_version) {
+    const ConversionContext& ctx) {
   Caffe2Ops ret;
   auto* c2_op = ret.ops.Add();
 
@@ -1064,12 +1101,12 @@ Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops(
   const auto onnx_op_type = node.op_type();
   auto broken_version = caffe2::get_default(
       get_broken_operators(), onnx_op_type, std::numeric_limits<int>::max());
-  if (broken_version <= opset_version) {
+  if (broken_version <= ctx.opset_version()) {
     CAFFE_THROW(
         "Don't know how to translate op ",
         onnx_op_type,
         " in ONNX operator set v",
-        opset_version,
+        ctx.opset_version(),
         " (I only support prior to v",
         broken_version);
   }
@@ -1102,14 +1139,14 @@ Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops(
 
 Caffe2Ops Caffe2Backend::ConvertNode(
     const std::string& node_str,
-    int opset_version) {
+    const ConversionContext& ctx) {
   ::google::protobuf::RepeatedPtrField<NodeProto> nodes;
   auto* n = nodes.Add();
   ParseProtoFromLargeString(node_str, n);
   ModelProto init_model;
   ModelProto pred_model;
   OnnxNode onnx_node = OnnxNode(nodes.Get(0));
-  return OnnxNodeToCaffe2Ops(init_model, pred_model, &onnx_node, opset_version);
+  return OnnxNodeToCaffe2Ops(init_model, pred_model, ctx, &onnx_node);
 }
 
 void Caffe2Backend::CheckOpSchemaArguments(
@@ -1142,14 +1179,14 @@ void Caffe2Backend::CheckOpSchemaArguments(
 Caffe2Ops Caffe2Backend::OnnxNodeToCaffe2Ops(
     const ModelProto& init_model,
     const ModelProto& pred_model,
-    OnnxNode* onnx_node,
-    int opset_version) {
+    const ConversionContext& ctx,
+    OnnxNode* onnx_node) {
   Caffe2Ops res;
   if (get_special_operators().count(onnx_node->node.op_type())) {
     res = (this->*get_special_operators().at(onnx_node->node.op_type()))(
-        onnx_node, opset_version);
+        onnx_node, ctx);
   } else {
-    res = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version);
+    res = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
   }
 
   for (const auto& result_op: res.ops){
@@ -1198,6 +1235,17 @@ void Caffe2Backend::OnnxToCaffe2(
   name_set.insert(name_set_pred.begin(), name_set_pred.end());
   dummy_->Reset(name_set);
 
+  ValueInfoMap graph_value_infos{};
+  for (const auto& vi : pred_model.graph().input()) {
+    graph_value_infos[vi.name()].CopyFrom(vi);
+  }
+  for (const auto& vi : pred_model.graph().output()) {
+    graph_value_infos[vi.name()].CopyFrom(vi);
+  }
+  for (const auto& vi : pred_model.graph().value_info()) {
+    graph_value_infos[vi.name()].CopyFrom(vi);
+  }
+
   size_t idx_extra = 0;
   auto converter = [&](const ModelProto& model, caffe2::NetDef* net) mutable {
     net->mutable_device_option()->CopyFrom(device_option);
@@ -1230,9 +1278,16 @@ void Caffe2Backend::OnnxToCaffe2(
               " without enough extra preconverted string");
         }
       } else {
+        ValueInfoMap value_infos{};
+        for (const auto& name : node.input()) {
+          auto iter = graph_value_infos.find(name);
+          if (iter != graph_value_infos.end()) {
+            value_infos[name].CopyFrom(iter->second);
+          }
+        }
         auto onnx_node = OnnxNode(node);
         auto c2ops = OnnxNodeToCaffe2Ops(
-            init_model, pred_model, &onnx_node, opset_version);
+            init_model, pred_model, {value_infos, opset_version}, &onnx_node);
         init_net_tmp->mutable_op()->MergeFrom(c2ops.init_ops);
         net->mutable_op()->MergeFrom(c2ops.ops);
         net->mutable_external_input()->MergeFrom(c2ops.interface_blobs);
diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h
index 437e572b8528b..681ab5b30d10b 100644
--- a/caffe2/onnx/backend.h
+++ b/caffe2/onnx/backend.h
@@ -11,6 +11,8 @@
 #include <unordered_map>
 #include <unordered_set>
 
+constexpr int kKnownOpsetVersion = 6;
+
 namespace caffe2 {
 namespace onnx {
 
@@ -19,6 +21,25 @@ using ::ONNX_NAMESPACE::GraphProto;
 using ::ONNX_NAMESPACE::ModelProto;
 using ::ONNX_NAMESPACE::NodeProto;
 using ::ONNX_NAMESPACE::TensorProto;
+using ::ONNX_NAMESPACE::ValueInfoProto;
+
+using ValueInfoMap = std::unordered_map<std::string, ValueInfoProto>;
+
+class ConversionContext {
+ public:
+  ConversionContext(const ValueInfoMap& value_infos, int opset_version)
+      : value_infos_(value_infos), opset_version_(opset_version) {}
+  const ValueInfoMap& value_infos() const {
+    return value_infos_;
+  }
+  int opset_version() const {
+    return opset_version_;
+  }
+
+ private:
+  const ValueInfoMap& value_infos_;
+  const int opset_version_;
+};
 
 // \brief This struct holds the converted ops after the onnx->c2 conversion.
 // Notice that for RNN ops, it may create ops in init_net. Hence we have the
@@ -129,7 +150,9 @@ class Caffe2Backend {
 
   bool SupportOp(const std::string tyep) const;
 
-  Caffe2Ops ConvertNode(const std::string& node_str, int opset_version);
+  Caffe2Ops ConvertNode(
+      const std::string& node_str,
+      const ConversionContext& ctx);
 
   void BuildTensorFillingOp(
       caffe2::OperatorDef* c2_op,
@@ -137,7 +160,8 @@ class Caffe2Backend {
       const std::string& name = "");
 
  private:
-  using SpecialOpConverter = Caffe2Ops (Caffe2Backend::*)(OnnxNode*, int);
+  using SpecialOpConverter =
+      Caffe2Ops (Caffe2Backend::*)(OnnxNode*, const ConversionContext&);
 
   void OnnxToCaffe2(
       caffe2::NetDef* init_net,
@@ -153,51 +177,56 @@ class Caffe2Backend {
   Caffe2Ops OnnxNodeToCaffe2Ops(
       const ModelProto& init_model,
       const ModelProto& pred_model,
-      OnnxNode* onnx_node,
-      int opset_version);
+      const ConversionContext& ctx,
+      OnnxNode* onnx_node);
 
   std::unordered_set<std::string> AllNamesInGraph(const GraphProto& graph);
 
-  Caffe2Ops CommonOnnxNodeToCaffe2Ops(OnnxNode* onnx_node, int opset_version);
-
-  Caffe2Ops CreateArgMaxMin(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CommonOnnxNodeToCaffe2Ops(
+      OnnxNode* onnx_node,
+      const ConversionContext& ctx);
 
-  Caffe2Ops CreateCast(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateArgMaxMin(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateConstant(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateCast(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateConvPoolOpBase(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateConstant(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreatePadPool(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateConvPoolOpBase(
+      OnnxNode* onnx_node,
+      const ConversionContext& ctx);
 
-  Caffe2Ops CreateReshape(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreatePadPool(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateGather(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateReshape(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateGemm(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateGather(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreatePad(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateGemm(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateConcat(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreatePad(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateLogSoftmax(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateConcat(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateSlice(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateLogSoftmax(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateSplit(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateSlice(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateSplit(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateBatchNormalization(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateMatMul(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateBatchNormalization(
+      OnnxNode* onnx_node,
+      const ConversionContext& ctx);
 
-  Caffe2Ops CreateUpsample(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateMatMul(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateDropout(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateUpsample(OnnxNode* onnx_node, const ConversionContext& ctx);
 
-  Caffe2Ops CreateLRN(OnnxNode* onnx_node, int opset_version);
+  Caffe2Ops CreateDropout(OnnxNode* onnx_node, const ConversionContext& ctx);
 
+  Caffe2Ops CreateLRN(OnnxNode* onnx_node, const ConversionContext& ctx);
 
   // LUT related getters
   const std::unordered_map<std::string, std::string>& get_renamed_operators()
diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py
index 7a7d9440d1aa7..dab79b8b1fb0b 100644
--- a/caffe2/python/onnx/backend.py
+++ b/caffe2/python/onnx/backend.py
@@ -212,34 +212,35 @@ def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version
         super(Caffe2Backend, cls).run_node(node, inputs, device=device,
                                            outputs_info=outputs_info, opset_version=opset_version)
 
+        value_infos = []
         device_option = get_device_option(Device(device))
         ws = Workspace()
         with core.DeviceScope(device_option):  # temporary!
             if isinstance(inputs, dict):
                 for key, value in inputs.items():
                     ws.FeedBlob(key, value)
+                    value_infos.append(onnx.helper.make_tensor_value_info(
+                        name=key,
+                        elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype],
+                        shape=value.shape).SerializeToString())
             else:
                 assert len(node.input) == len(inputs), "{}: expected {} but got {}".format(
                     node.op_type, len(node.input), len(inputs))
                 for key, value in zip(node.input, inputs):
                     ws.FeedBlob(key, value)
+                    value_infos.append(onnx.helper.make_tensor_value_info(
+                        name=key,
+                        elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype],
+                        shape=value.shape).SerializeToString())
 
             ops = []
             cbackend = C.Caffe2Backend(cls._dummy_name)
-            ops_str = cbackend.convert_node(node.SerializeToString(), opset_version)
+            ops_str = cbackend.convert_node(node.SerializeToString(), value_infos, opset_version)
             for s in ops_str[0] + ops_str[1]:
                 op = caffe2_pb2.OperatorDef()
                 op.ParseFromString(s)
                 op.device_option.CopyFrom(device_option)
                 ops.append(op)
-            # For testing
-            if "ONNX_CAFFE2_DEBUG" in os.environ:
-                init_ops, ops2, _ = cls._onnx_node_to_caffe2_op(
-                    None, None, node, opset_version or cls._known_opset_version)
-                ops2 = init_ops + ops2
-                for op in ops2:
-                    op.device_option.CopyFrom(device_option)
-                print("\nC++:\n{}\nPython:\n{}".format(ops, ops2))
             ws.RunOperatorsOnce(ops)
             output_values = [ws.FetchBlob(name) for name in node.output]
             return namedtupledict('Outputs', node.output)(*output_values)
@@ -708,82 +709,34 @@ def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs):
 
         model = onnx.shape_inference.infer_shapes(model)
 
-        # Check whether we have RNN related ops
-        pred_model = cls.optimize_onnx(model, predict=True)
-        rnn_nodes = []
-        for node in pred_model.graph.node:
-            if node.op_type in {'LSTM', 'GRU', 'RNN'}:
-                rnn_nodes.append(node)
-
-        # Build the C++ backend
-        # TODO: build a predictor that supports GPU
-        #       And for RNN nets, we need to avoid adding init_net
-        use_cpp_backend = device == 'CPU' and not rnn_nodes
-        # use python backend for now
-        use_cpp_backend = False
-        if use_cpp_backend:
-            c2_rnn_ops = []
-            if rnn_nodes:
-                init_model = cls.optimize_onnx(model, init=True)
-                for node in rnn_nodes:
-                    c2ops = cls._onnx_node_to_caffe2_op(
-                        init_model, pred_model, node, opset_version)
-                    init_ops = [x.SerializeToString() for x in c2ops.init_ops]
-                    ops = [x.SerializeToString() for x in c2ops.ops]
-                    external_inputs = c2ops.interface_blobs
-                    c2_rnn_ops.append(C.Caffe2Ops(init_ops, ops, external_inputs))
-                del init_model
-
-            cbackend = C.Caffe2Backend(cls._dummy_name)
-            if raw_values_dict:
-                cls._external_value_resolution_pass(model, raw_values_dict)
-            rep = cbackend.prepare(model.SerializeToString(), device, c2_rnn_ops)
-            # For testing
-            # Dump the net descriptions to file for comparison with the Python ones
-            if "ONNX_CAFFE2_DEBUG" in os.environ:
-                pred_net_str = rep.pred_net()
-                pn = caffe2_pb2.NetDef()
-                pn.ParseFromString(pred_net_str)
-                init_net_str = rep.init_net()
-                inn = caffe2_pb2.NetDef()
-                inn.ParseFromString(init_net_str)
-                with open("cpp.txt", "w") as f:
-                    f.write("pred_net: \n{}".format(pn))
-
-            rep_wrapper = Caffe2CppRep(rep)
-            return rep_wrapper
-        else:
-            ws = Workspace()
-            device_option = get_device_option(Device(device))
+        ws = Workspace()
+        device_option = get_device_option(Device(device))
 
-            init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False)
+        init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False)
 
-            if raw_values_dict:
-                cls._external_value_resolution_pass(model, raw_values_dict)
+        if raw_values_dict:
+            cls._external_value_resolution_pass(model, raw_values_dict)
 
-            # Directly load initializer data into blobs in workspace
-            cls._direct_initialize_parameters(
-                model.graph.initializer,
-                ws,
-                device_option,
-            )
+        # Directly load initializer data into blobs in workspace
+        cls._direct_initialize_parameters(
+            model.graph.initializer,
+            ws,
+            device_option,
+        )
 
-            initialized = {init.name for init in model.graph.initializer}
+        initialized = {init.name for init in model.graph.initializer}
 
-            cls._direct_initialize_inputs(
-                model.graph.input,
-                initialized,
-                ws,
-                device_option,
-            )
+        cls._direct_initialize_inputs(
+            model.graph.input,
+            initialized,
+            ws,
+            device_option,
+        )
 
-            uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized]
+        uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized]
 
-            if "ONNX_CAFFE2_DEBUG" in os.environ:
-                with open("python.txt", "w") as f:
-                    f.write("pred_net: \n{}".format(predict_net))
-            retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
-            return retval
+        retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
+        return retval
 
 
     @classmethod
@@ -791,7 +744,20 @@ def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs):
     def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version):
         cbackend = C.Caffe2Backend(cls._dummy_name)
         if cbackend.support_onnx_import(node_def.op_type):
-            op_strs = cbackend.convert_node(node_def.SerializeToString(), opset_version)
+
+            # extract value infos from pred model (value infos of
+            # node's inputs that are in init model should be all
+            # available in pred model)
+            value_infos = []
+            for name in node_def.input:
+                if pred_model is not None:
+                    for vi in itertools.chain(pred_model.graph.input,
+                                              pred_model.graph.output,
+                                              pred_model.graph.value_info):
+                        if vi.name == name:
+                            value_infos.append(vi.SerializeToString())
+
+            op_strs = cbackend.convert_node(node_def.SerializeToString(), value_infos, opset_version)
             init_ops = []
             for s in op_strs[0]:
                 op = caffe2_pb2.OperatorDef()
diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py
index 97d824e05897a..e526d74f73921 100644
--- a/caffe2/python/onnx/tests/c2_ref_test.py
+++ b/caffe2/python/onnx/tests/c2_ref_test.py
@@ -1,4 +1,4 @@
-## @package onnx
+# @package onnx
 # Module caffe2.python.onnx.tests.c2_ref_test
 
 from __future__ import absolute_import
@@ -39,14 +39,14 @@ def test_dummy_name(self):
     def test_check_arguments(self):
         b2 = C.Caffe2Backend()
 
-        node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"])
-        b2.convert_node(node_def.SerializeToString(), 6)
+        node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"])
+        b2.convert_node(node_def.SerializeToString())
 
-        bad_node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"], foo = 42, bar = 56)
+        bad_node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"], foo=42, bar=56)
         with self.assertRaisesRegexp(
-            RuntimeError,
-            ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"):
-            b2.convert_node(bad_node_def.SerializeToString(), 6)
+                RuntimeError,
+                ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"):
+            b2.convert_node(bad_node_def.SerializeToString())
 
     def test_relu_graph(self):
         X = np.random.randn(3, 2).astype(np.float32)
@@ -199,6 +199,54 @@ def test_gemm(self):
             output["Y"],
             alpha * np.dot(A, B) + beta * C)
 
+    def test_gemm_conversion(self):
+        node_def = make_node(
+            'Gemm',
+            ['A', 'B', 'C'],
+            ["Y"],
+            alpha=2.,
+            beta=3.,
+            transB=True)
+
+        backend = C.Caffe2Backend()
+
+        # without broadcast and without shape info, gemm will be
+        # converted to matmul + add
+        _, op_strs = backend.convert_node(node_def.SerializeToString())
+        op_names = []
+        for s in op_strs:
+            op = caffe2_pb2.OperatorDef()
+            op.ParseFromString(s)
+            op_names.append(op.type)
+        self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add'])
+
+        # with shape info (that indicates C is 1D), gemm will be
+        # converted to FC
+        _, op_strs = backend.convert_node(node_def.SerializeToString(
+        ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()])
+        op_names = []
+        for s in op_strs:
+            op = caffe2_pb2.OperatorDef()
+            op.ParseFromString(s)
+            op_names.append(op.type)
+        self.assertEqual(op_names, ['Scale', 'Scale', 'FC'])
+
+        # or with broadcast, gemm will be converted to fc
+        node_def = make_node(
+            'Gemm',
+            ['A', 'B', 'C'],
+            ["Y"],
+            transB=True,
+            broadcast=1)
+
+        _, op_strs = backend.convert_node(node_def.SerializeToString())
+        op_names = []
+        for s in op_strs:
+            op = caffe2_pb2.OperatorDef()
+            op.ParseFromString(s)
+            op_names.append(op.type)
+        self.assertEqual(op_names, ['FC'])
+
     def test_tensor_filling_ops(self):
         for dtype in [
                 onnx.TensorProto.FLOAT,
@@ -267,7 +315,6 @@ def test_tensor_filling_ops_c_backend(self):
             np.testing.assert_almost_equal(output[0], vals)
             np.testing.assert_almost_equal(ws.FetchBlob(op.output[0]), vals)
 
-
     def test_slice(self):
         X = np.random.randn(1, 2, 3).astype(np.float32)
         starts = np.array([0, 1, 0], dtype=np.int32)
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index 3412e5c306fba..b7784fffa5059 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -795,13 +795,21 @@ void addObjectMethods(py::module& m) {
           "convert_node",
           [](caffe2::onnx::Caffe2Backend& instance,
              const py::bytes& node_str,
+             const std::vector<py::bytes>& value_infos_bytes,
              int opset_version) -> std::vector<std::vector<py::bytes>> {
             // Note that we return two lists of serialized ops. The first set is
             // init_ops and the second set is ops for pred net. When converting
             // RNN related op, it is possible that we will create ops in the
             // init_net. Hence the return structure here
+            caffe2::onnx::ValueInfoMap value_infos{};
+            for (const auto& vi_bytes : value_infos_bytes) {
+              ::ONNX_NAMESPACE::ValueInfoProto vi{};
+              vi.ParseFromString(vi_bytes);
+              auto name = vi.name();
+              value_infos.emplace(std::move(name), std::move(vi));
+            }
             auto c2ops = instance.ConvertNode(
-                node_str.cast<std::string>(), opset_version);
+                node_str.cast<std::string>(), {value_infos, opset_version});
             std::vector<std::vector<py::bytes>> vals;
             vals.emplace_back();
             auto& init_vals = vals.back();
@@ -818,12 +826,15 @@ void addObjectMethods(py::module& m) {
               normal_vals.emplace_back(py::bytes(out));
             }
             return vals;
-          })
+          },
+          py::arg("node_str"),
+          py::arg("value_infos_bytes") = std::vector<py::bytes>{},
+          py::arg("opset_version") = kKnownOpsetVersion)
       .def(
-        "_build_tensor_filling_op",
-        [](caffe2::onnx::Caffe2Backend& instance,
-           const py::bytes& tensor_proto_str,
-           const std::string& name="") -> py::bytes {
+          "_build_tensor_filling_op",
+          [](caffe2::onnx::Caffe2Backend& instance,
+             const py::bytes& tensor_proto_str,
+             const std::string& name = "") -> py::bytes {
             caffe2::OperatorDef op;
             ::ONNX_NAMESPACE::TensorProto tp;
             ParseProtoFromLargeString(tensor_proto_str, &tp);
@@ -831,7 +842,7 @@ void addObjectMethods(py::module& m) {
             std::string out;
             op.SerializeToString(&out);
             return py::bytes(out);
-        });
+          });
 
   py::class_<Predictor>(m, "Predictor")
       .def(

From 1dc708493e0b164d2fd7c577415e1cc5345d739b Mon Sep 17 00:00:00 2001
From: zou3519 <zou3519@gmail.com>
Date: Thu, 26 Jul 2018 12:01:30 -0700
Subject: [PATCH 08/10] Add html-stable target to docs Makefile (#9884)

Summary:
This lets one build docs for the release easier. All of the unstable
warnings are removed in `make html-stable`.

cc soumith SsnL

Sample build:
![image](https://user-images.githubusercontent.com/5652049/43277115-05e2f720-90d5-11e8-9977-b0b4a6ee4b8e.png)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9884

Reviewed By: SsnL

Differential Revision: D9016001

Pulled By: zou3519

fbshipit-source-id: 5cf2dfbf886de993242db28cdac5d0c5fadbdc4d
---
 docs/Makefile       | 7 +++++++
 docs/source/conf.py | 9 ++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/docs/Makefile b/docs/Makefile
index 2a63943f00f0a..4a56c12ca22d8 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -23,6 +23,13 @@ docset: html
 	cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png
 	convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png
 
+html-stable:
+	# stable differs from `make html` in two ways:
+	# 1) The stable logo is used instead of the unstable logo
+	# 2) There will not be a link to the stable docs.
+	# See conf.py for more details.
+	RELEASE=1 make html
+
 .PHONY: help Makefile docset
 
 # Catch-all target: route all unknown targets to Sphinx using the new
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b48a5ad27362a..1eaaa3b9086d9 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -17,7 +17,7 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
-# import os
+import os
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 import torch
@@ -28,6 +28,8 @@
     warnings.warn('unable to load "torchvision" package')
 import sphinx_rtd_theme
 
+RELEASE = os.environ.get('RELEASE', False)
+
 
 # -- General configuration ------------------------------------------------
 
@@ -54,6 +56,8 @@
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
+if RELEASE:
+    templates_path = ['_templates-stable']
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
@@ -122,6 +126,9 @@
 }
 
 html_logo = '_static/img/pytorch-logo-dark-unstable.png'
+if RELEASE:
+    html_logo = '_static/img/pytorch-logo-dark.svg'
+
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,

From 456f41301ccaa6240155a6ecd7f5df82dedad7a6 Mon Sep 17 00:00:00 2001
From: Junjie Bai <bai@in.tum.de>
Date: Thu, 26 Jul 2018 12:58:10 -0700
Subject: [PATCH 09/10] Disable unique ops test on rocm (#9892)

Summary:
Somehow we have Unique operator tests in two places test_unqiue_ops.py and hypothesis_test.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9892

Reviewed By: houseroad

Differential Revision: D9017631

Pulled By: bddppq

fbshipit-source-id: 1f9e40e4953afca26141ef4581202b9b9fce0ae9
---
 .jenkins/caffe2/test.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 60502343472ae..053a9be5e0548 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -101,6 +101,7 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/arg_ops_test.py")
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py")
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py")
+  rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py")
 
   # Need to go through roi ops to replace max(...) with fmaxf(...)
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/roi_align_rotated_op_test.py")

From 969b62f2763647fe32dbe76aa358f00873edbf4e Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Thu, 26 Jul 2018 13:54:32 -0700
Subject: [PATCH 10/10] Revert D8121878: Remove template parameter from Tensor

Differential Revision:
D8121878

Original commit changeset: 4a5e9a677ba4

fbshipit-source-id: d8e2c0bb145b52fbcca323b22d1d3346f0b3249e
---
 binaries/benchmark_helper.cc                  |   6 +-
 binaries/benchmark_helper.h                   |   2 +-
 binaries/core_overhead_benchmark.cc           |   6 +-
 binaries/print_core_object_sizes.cc           |   3 +-
 binaries/speed_benchmark.cc                   |   2 +-
 caffe2/contrib/aten/aten_op_template.h        |  14 +-
 caffe2/contrib/gloo/common.cc                 |   2 +-
 caffe2/contrib/nccl/cuda_nccl_op_gpu.cc       |  10 +-
 .../contrib/nervana/nervana_fc_op_gpu_test.cc |   6 +-
 caffe2/contrib/warpctc/ctc_op.h               |  14 +-
 caffe2/core/allocator.cc                      |   2 +-
 caffe2/core/blob.h                            |  29 +-
 caffe2/core/blob_gpu_test.cc                  |  34 +-
 caffe2/core/blob_serialization.cc             | 416 +-------
 caffe2/core/blob_serialization.h              | 445 +++++++-
 caffe2/core/blob_serialization_gpu.cc         |  15 +-
 caffe2/core/blob_test.cc                      | 105 +-
 caffe2/core/context.cc                        |  13 -
 caffe2/core/context.h                         | 150 +--
 caffe2/core/context_base.cc                   |   5 -
 caffe2/core/context_base.h                    | 187 ----
 caffe2/core/context_gpu.cu                    |  48 +-
 caffe2/core/context_gpu.h                     |  89 +-
 caffe2/core/context_test.cc                   |   2 +-
 caffe2/core/dispatch/CMakeLists.txt           |   1 -
 caffe2/core/dispatch/OpSchema.h               |  38 +-
 caffe2/core/dispatch/OpSchema_test.cpp        |  13 +-
 caffe2/core/hip/blob_serialization_hip.cc     |  12 +-
 caffe2/core/hip/context_hip.cc                |  79 +-
 caffe2/core/hip/context_hip.h                 |  88 +-
 caffe2/core/int8_serialization.cc             |   2 +-
 caffe2/core/operator.h                        |  75 +-
 caffe2/core/plan_executor.cc                  |   3 +-
 caffe2/core/predictor.cc                      |   8 +-
 caffe2/core/predictor_test.cc                 |  10 +-
 caffe2/core/tensor.cc                         |  64 +-
 caffe2/core/tensor.h                          | 226 ++--
 caffe2/core/tensor_int8.h                     |   3 +-
 caffe2/core/typeid.cc                         |   3 +-
 caffe2/core/typeid.h                          |  50 +-
 caffe2/core/workspace.h                       |   6 +-
 .../fully_connected_op_decomposition.h        |  12 +-
 .../operators/fully_connected_op_prune.h      |   8 +-
 .../operators/fully_connected_op_sparse.h     |   4 +-
 .../operators/sparse_matrix_reshape_op.h      |   1 +
 caffe2/ideep/operators/concat_split_op.cc     |   4 +-
 .../ideep/operators/operator_fallback_ideep.h |   9 +-
 caffe2/ideep/operators/utility_ops.cc         |   8 +-
 caffe2/ideep/utils/ideep_context.h            |  72 +-
 caffe2/ideep/utils/ideep_register.cc          |  12 +-
 caffe2/image/image_input_op.h                 |  45 +-
 caffe2/image/transform_gpu.cu                 |  33 +-
 caffe2/image/transform_gpu.h                  |   9 +-
 caffe2/mkl/mkl_utils_test.cc                  |  14 +-
 caffe2/mkl/operators/conv_op.cc               |   6 +-
 caffe2/mkl/operators/conv_op_mkldnn.cc        |   2 +-
 caffe2/mkl/operators/operator_fallback_mkl.h  |   4 +-
 caffe2/mkl/operators/packed_fc_op.cc          |   4 +-
 caffe2/mkl/operators/pool_op.cc               |   4 +-
 caffe2/mkl/operators/utility_ops.cc           |   4 +-
 caffe2/mkl/utils/mkl_context.cc               |   8 -
 caffe2/mkl/utils/mkl_context.h                |  93 +-
 caffe2/mobile/contrib/CMakeLists.txt          |   9 +-
 .../contrib/arm-compute/operators/copy_op.cc  |   4 +-
 .../arm-compute/test/gl_operator_test.h       |   2 +-
 caffe2/mobile/contrib/ios/ios_caffe.cc        |   2 +-
 .../mobile/contrib/ios/ios_caffe_predictor.cc |   2 +-
 caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm    |   6 +-
 caffe2/mobile/contrib/ios/pool_test.cc        |   2 +-
 caffe2/mobile/contrib/ios/resize_test.cc      |   2 +-
 caffe2/mobile/contrib/nnapi/nnapi.cc          |   2 +-
 .../mobile/contrib/nnapi/nnapi_benchmark.cc   |  22 +-
 caffe2/mobile/contrib/nnapi/nnapi_test.cc     |  28 +-
 caffe2/mobile/contrib/opengl/CMakeLists.txt   |  17 +-
 .../mobile/contrib/opengl/test/opengl_test.cc |  94 +-
 caffe2/mobile/contrib/snpe/snpe_op.cc         |   3 +-
 .../mobile/contrib/snpe/snpe_op_benchmark.cc  |  34 +-
 caffe2/mobile/contrib/ulp2/ulp.cc             |  16 +-
 caffe2/mobile/contrib/ulp2/ulp_neon.cc        |   2 +-
 caffe2/mobile/contrib/ulp2/ulp_test.cc        |  28 +-
 caffe2/mpi/mpi_gpu_test.cc                    |  17 +-
 caffe2/mpi/mpi_ops.h                          |  15 +-
 caffe2/observers/profile_observer_gpu.cc      |  23 +-
 caffe2/operators/accuracy_op.cc               |  37 +-
 caffe2/operators/accuracy_op.cu               |   2 +-
 caffe2/operators/affine_channel_op.cc         |  17 +-
 caffe2/operators/affine_channel_op.cu         |  17 +-
 caffe2/operators/apmeter_op.cc                |   4 +-
 caffe2/operators/assert_op.h                  |   2 +-
 caffe2/operators/atomic_ops.cc                |   6 +-
 caffe2/operators/batch_gather_ops.cu          |   6 +-
 caffe2/operators/batch_gather_ops.h           |   7 +-
 caffe2/operators/batch_matmul_op.h            |   4 +-
 caffe2/operators/batch_matmul_op_gpu_test.cc  |   8 +-
 caffe2/operators/batch_matmul_op_test.cc      |   4 +-
 caffe2/operators/bbox_transform_op.cc         |   6 +-
 caffe2/operators/boolean_mask_ops.cc          |  23 +-
 caffe2/operators/boolean_mask_ops.cu          |  29 +-
 caffe2/operators/boolean_unmask_ops.cu        |  22 +-
 caffe2/operators/boolean_unmask_ops_test.cc   |   6 +-
 caffe2/operators/box_with_nms_limit_op.cc     |  18 +-
 caffe2/operators/ceil_op.cu                   |   2 +-
 caffe2/operators/channel_backprop_stats_op.cc |   6 +-
 caffe2/operators/channel_backprop_stats_op.cu |   4 +-
 caffe2/operators/channel_backprop_stats_op.h  |   4 +-
 caffe2/operators/channel_shuffle_op_gpu.cu    |  14 +-
 caffe2/operators/channel_stats_op.cc          |   6 +-
 caffe2/operators/channel_stats_op.cu          |   4 +-
 caffe2/operators/channel_stats_op.h           |   4 +-
 caffe2/operators/clip_op.cc                   |   4 +-
 caffe2/operators/clip_op.cu                   |  24 +-
 ...ect_and_distribute_fpn_rpn_proposals_op.cc |  16 +-
 caffe2/operators/concat_split_op.h            |  10 +-
 caffe2/operators/conditional_op.cc            |   4 +-
 caffe2/operators/conv_op.h                    |  16 +-
 caffe2/operators/conv_op_impl.h               |  12 +-
 caffe2/operators/conv_op_shared.cc            |   6 +-
 caffe2/operators/conv_op_shared.h             |   4 +-
 caffe2/operators/conv_op_shared_gpu.cc        |   8 +-
 caffe2/operators/conv_pool_op_base.h          |  20 +-
 caffe2/operators/conv_transpose_op.h          |   8 +-
 caffe2/operators/conv_transpose_op_impl.h     |  12 +-
 caffe2/operators/conv_transpose_op_mobile.h   |   2 +-
 .../operators/conv_transpose_op_mobile_impl.h |   6 +-
 .../conv_transpose_op_mobile_test.cc          |  18 +-
 .../operators/conv_transpose_unpool_op_base.h |   5 +-
 .../cosine_embedding_criterion_op.cc          |   4 +-
 .../cosine_embedding_criterion_op.cu          |   6 +-
 caffe2/operators/counter_ops.h                |  10 +-
 caffe2/operators/cross_entropy_op.cc          |  26 +-
 caffe2/operators/cross_entropy_op.cu          |  59 +-
 .../operators/ctc_beam_search_decoder_op.cc   |   3 +-
 caffe2/operators/ctc_greedy_decoder_op.cc     |   5 +-
 caffe2/operators/dataset_ops.cc               |  62 +-
 caffe2/operators/dataset_ops.h                |   3 +-
 caffe2/operators/deform_conv_op.h             |  16 +-
 caffe2/operators/deform_conv_op_impl.h        |   8 +-
 .../operators/depthwise_3x3_conv_op_cudnn.cu  |   4 +-
 caffe2/operators/distance_op.cc               |  24 +-
 caffe2/operators/distance_op.cu               |  60 +-
 caffe2/operators/distance_op.h                |   4 +-
 caffe2/operators/dropout_op.cc                |  20 +-
 caffe2/operators/dropout_op.cu                |  18 +-
 caffe2/operators/dropout_op_cudnn.cc          |  11 +-
 caffe2/operators/elementwise_linear_op.cc     |  10 +-
 caffe2/operators/elementwise_linear_op.cu     |  23 +-
 caffe2/operators/elementwise_logical_ops.cc   |   4 +-
 caffe2/operators/elementwise_logical_ops.h    |   4 +-
 caffe2/operators/elementwise_op_test.h        |  32 +-
 caffe2/operators/elementwise_ops.cu           |   6 +-
 caffe2/operators/elementwise_ops.h            |   4 +-
 caffe2/operators/elementwise_ops_utils.cc     |  39 -
 caffe2/operators/elementwise_ops_utils.h      |  44 +-
 caffe2/operators/enforce_finite_op.cu         |   2 +-
 caffe2/operators/enforce_finite_op.h          |   4 +-
 caffe2/operators/ensure_cpu_output_op.h       |  10 +-
 caffe2/operators/expand_op.h                  |   2 +-
 caffe2/operators/feature_maps_ops.h           |  18 +-
 caffe2/operators/filler_op.cc                 |   7 +-
 caffe2/operators/filler_op.cu                 |   6 +-
 caffe2/operators/filler_op.h                  |  38 +-
 caffe2/operators/find_op.cu                   |   2 +-
 caffe2/operators/flatten_op.h                 |   2 +-
 caffe2/operators/floor_op.cu                  |   2 +-
 caffe2/operators/fully_connected_op.h         |   5 +-
 .../operators/gather_fused_8bit_rowwise_op.h  |   2 +-
 caffe2/operators/gather_ranges_to_dense_op.h  |   4 +-
 caffe2/operators/generate_proposals_op.cc     |   4 +-
 .../operators/generate_proposals_op_test.cc   |  16 +-
 caffe2/operators/given_tensor_fill_op.h       |  12 +-
 caffe2/operators/group_norm_op.h              |   4 +-
 caffe2/operators/gru_unit_op.h                |  10 +-
 caffe2/operators/h_softmax_op.cc              |  72 +-
 caffe2/operators/h_softmax_op.h               |   6 +-
 caffe2/operators/half_float_ops.cu            |   4 +-
 caffe2/operators/if_op.h                      |   2 +-
 caffe2/operators/index_ops.cc                 |  24 +-
 caffe2/operators/instance_norm_op.cu          |  24 +-
 caffe2/operators/instance_norm_op.h           |   8 +-
 caffe2/operators/integral_image_op.cu         |  14 +-
 caffe2/operators/integral_image_op.h          |   2 +-
 caffe2/operators/jsd_op.cc                    |   4 +-
 caffe2/operators/last_n_window_collector.cc   |   6 +-
 caffe2/operators/layer_norm_op.cu             |  14 +-
 caffe2/operators/layer_norm_op.h              |  14 +-
 caffe2/operators/leaky_relu_op.cc             |   2 +-
 caffe2/operators/leaky_relu_op.cu             |   4 +-
 caffe2/operators/lengths_pad_op.h             |  11 +-
 caffe2/operators/lengths_tile_op.h            |   9 +-
 caffe2/operators/listwise_l2r_op.cc           |   6 +-
 caffe2/operators/listwise_l2r_op.h            |  18 +-
 caffe2/operators/load_save_op.h               |   2 +-
 .../local_response_normalization_op.cc        |  43 +-
 .../local_response_normalization_op.cu        |  39 +-
 .../local_response_normalization_op.h         |   8 +-
 caffe2/operators/locally_connected_op.h       |  40 +-
 caffe2/operators/locally_connected_op_impl.h  |  24 +-
 caffe2/operators/logit_op.cu                  |   6 +-
 caffe2/operators/lp_pool_op.cc                |  12 +-
 caffe2/operators/lp_pool_op.cu                | 176 ++--
 caffe2/operators/lpnorm_op.cc                 |  14 +-
 caffe2/operators/lstm_unit_op.h               |   4 +-
 caffe2/operators/map_ops.h                    |  10 +-
 .../operators/margin_ranking_criterion_op.cc  |  12 +-
 .../operators/margin_ranking_criterion_op.cu  |   6 +-
 caffe2/operators/max_pool_with_index.cu       |  42 +-
 caffe2/operators/mem_query_op.cu              |  10 +-
 caffe2/operators/multi_class_accuracy_op.cc   |   4 +-
 caffe2/operators/multi_class_accuracy_op.cu   |   4 +-
 caffe2/operators/norm_planar_yuv_op.cc        |   2 +-
 caffe2/operators/normalize_ops.cu             |   7 +-
 caffe2/operators/numpy_tile_op.h              |   7 +-
 caffe2/operators/one_hot_ops.cc               |   6 +-
 caffe2/operators/one_hot_ops.cu               |   6 +-
 caffe2/operators/one_hot_ops.h                |   6 +-
 caffe2/operators/onnx_while_op.h              |  32 +-
 caffe2/operators/onnxifi_op.cc                |   2 +-
 caffe2/operators/operator_fallback_gpu.h      |   9 +-
 .../operators/operator_fallback_gpu_test.cc   |  10 +-
 caffe2/operators/order_switch_ops.cc          |  22 +-
 caffe2/operators/order_switch_ops.cu          |  18 +-
 caffe2/operators/pack_rnn_sequence_op.h       |   2 +-
 caffe2/operators/pack_segments.cc             |   6 +-
 caffe2/operators/pack_segments.cu             |  20 +-
 caffe2/operators/pack_segments.h              |  20 +-
 caffe2/operators/pad_op.cc                    |   8 +-
 caffe2/operators/pad_op_gpu.cu                |   8 +-
 caffe2/operators/partition_ops.h              |   4 +-
 caffe2/operators/percentile_op.h              |   4 +-
 caffe2/operators/perplexity_op.cc             |   2 +-
 caffe2/operators/perplexity_op.cu             |   2 +-
 .../piecewise_linear_transform_op.cc          |   6 +-
 .../piecewise_linear_transform_op.cu          |  24 +-
 .../operators/piecewise_linear_transform_op.h |   6 +-
 caffe2/operators/pool_op.cu                   | 972 +++++++++---------
 caffe2/operators/pool_op_cudnn.cu             |  14 +-
 caffe2/operators/prelu_op.cc                  |   4 +-
 caffe2/operators/prelu_op.cu                  |   6 +-
 caffe2/operators/prepend_dim_op.h             |   4 +-
 caffe2/operators/quant_decode_op.h            |  38 +-
 caffe2/operators/reducer_functors.h           |  26 +-
 caffe2/operators/reduction_front_back_ops.h   |   2 +-
 caffe2/operators/reduction_ops.cc             |   4 +-
 caffe2/operators/reduction_ops.cu             |  15 +-
 caffe2/operators/reduction_ops.h              |   6 +-
 caffe2/operators/relu_n_op.cc                 |   4 +-
 caffe2/operators/remove_data_blocks_op.h      |   2 +-
 caffe2/operators/reservoir_sampling.cc        |   2 +-
 caffe2/operators/reshape_op.h                 |   9 +-
 caffe2/operators/reshape_op_gpu_test.cc       |   6 +-
 caffe2/operators/resize_op.cc                 |  10 +-
 caffe2/operators/resize_op.cu                 |   6 +-
 caffe2/operators/reverse_packed_segs_op.h     |   6 +-
 caffe2/operators/rmac_regions_op.cc           |   4 +-
 caffe2/operators/rmac_regions_op.cu           |   5 +-
 caffe2/operators/rmac_regions_op.h            |   2 +-
 .../rnn/recurrent_network_blob_fetcher_op.h   |  11 +-
 .../rnn/recurrent_network_executor.h          |   4 +-
 caffe2/operators/rnn/recurrent_network_op.h   |  36 +-
 caffe2/operators/rnn/recurrent_op_cudnn.cc    |  18 +-
 caffe2/operators/rnn/recurrent_op_cudnn.h     |  10 +-
 caffe2/operators/roi_align_gradient_op.cc     |   4 +-
 caffe2/operators/roi_align_gradient_op.cu     |   8 +-
 caffe2/operators/roi_align_op.cc              |   6 +-
 caffe2/operators/roi_align_op.cu              |   2 +-
 caffe2/operators/roi_align_op_gpu_test.cc     |  18 +-
 caffe2/operators/roi_pool_op.cc               |   4 +-
 caffe2/operators/roi_pool_op.cu               |  74 +-
 caffe2/operators/scale_op.cc                  |  16 +-
 caffe2/operators/segment_reduction_op.h       |   4 +-
 caffe2/operators/segment_reduction_op_gpu.cu  |  67 +-
 caffe2/operators/selu_op.cc                   |   5 +-
 caffe2/operators/selu_op.cu                   |   8 +-
 caffe2/operators/sequence_ops.cc              |  10 +-
 caffe2/operators/sequence_ops.cu              |  18 +-
 caffe2/operators/sequence_ops.h               |  12 +-
 caffe2/operators/shape_op.h                   |   6 +-
 .../operators/sinusoid_position_encoding_op.h |   2 +-
 caffe2/operators/slice_op.cu                  |  16 +-
 caffe2/operators/slice_op.h                   |  34 +-
 caffe2/operators/softmax_op.cc                |   4 +-
 caffe2/operators/softmax_op.h                 |  10 +-
 caffe2/operators/softmax_ops.cu               |  38 +-
 caffe2/operators/softmax_shared.cc            |   2 +-
 caffe2/operators/softmax_with_loss_op.cc      |   8 +-
 caffe2/operators/softmax_with_loss_op.h       |  21 +-
 caffe2/operators/softplus_op.cc               |   4 +-
 caffe2/operators/softplus_op.cu               |   7 +-
 caffe2/operators/space_batch_op.h             |   8 +-
 caffe2/operators/space_batch_op_gpu.cu        |  14 +-
 caffe2/operators/sparse_to_dense_mask_op.h    |   8 +-
 caffe2/operators/sparse_to_dense_op.h         |   6 +-
 .../spatial_batch_norm_gradient_op.cc         |  10 +-
 caffe2/operators/spatial_batch_norm_op.cc     |   4 +-
 .../operators/spatial_softmax_with_loss_op.cc |  48 +-
 .../operators/spatial_softmax_with_loss_op.h  |  21 +-
 caffe2/operators/stats_ops.cc                 |  10 +-
 caffe2/operators/string_ops.cc                |   4 +-
 caffe2/operators/string_ops_test.cc           |  24 +-
 caffe2/operators/stump_func_op.cu             |   2 +-
 caffe2/operators/stylizer_ops.cc              |  12 +-
 caffe2/operators/summarize_op.cc              |   8 +-
 caffe2/operators/summarize_op.cu              |   6 +-
 caffe2/operators/swish_op.cc                  |   4 +-
 caffe2/operators/tensor_protos_db_input.h     |  21 +-
 caffe2/operators/thresholded_relu_op.cc       |   6 +-
 caffe2/operators/thresholded_relu_op.cu       |   7 +-
 caffe2/operators/tile_op.h                    |  14 +-
 caffe2/operators/top_k.cu                     |  18 +-
 caffe2/operators/tt_linear_op.h               |   8 +-
 caffe2/operators/unique_ops.cu                |   6 +-
 caffe2/operators/unique_ops.h                 |   6 +-
 caffe2/operators/utility_ops.cc               |   2 +-
 caffe2/operators/utility_ops.cu               |  17 +-
 caffe2/operators/utility_ops.h                |  60 +-
 caffe2/operators/utility_ops_gpu_test.cc      |   6 +-
 caffe2/operators/utility_ops_test.cc          |   6 +-
 .../operators/weighted_multi_sampling_op.cc   |   6 +-
 caffe2/operators/weighted_sample_op.cc        |  12 +-
 caffe2/operators/weighted_sample_op.cu        |   8 +-
 caffe2/operators/weighted_sample_op.h         |   2 +-
 caffe2/operators/while_op.h                   |   2 +-
 caffe2/operators/workspace_ops.cc             |   3 +-
 caffe2/opt/fusion.cc                          |   8 +-
 caffe2/opt/onnxifi_transformer.cc             |   4 +-
 caffe2/python/pybind_state.cc                 |  44 +-
 caffe2/python/pybind_state.h                  |  58 +-
 caffe2/python/pybind_state_dlpack.h           |   4 +-
 caffe2/python/pybind_state_gpu.cc             |   1 +
 caffe2/python/pybind_state_hip.cc             |   1 +
 caffe2/python/pybind_state_int8.cc            |   3 +-
 caffe2/queue/blobs_queue_db.h                 |   4 +-
 caffe2/queue/queue_ops.h                      |   2 +-
 caffe2/queue/rebatching_queue.cc              |   8 +-
 caffe2/queue/rebatching_queue_ops.h           |   4 +-
 caffe2/sgd/adam_op.h                          |   8 +-
 caffe2/sgd/adam_op_gpu.cu                     |   2 +-
 caffe2/sgd/fp16_momentum_sgd_op.h             |   5 +-
 caffe2/sgd/fp32_momentum_sgd_op.h             |   5 +-
 caffe2/sgd/iter_op.h                          |   9 +-
 caffe2/sgd/learning_rate_op.h                 |   4 +-
 caffe2/sgd/momentum_sgd_op.h                  |  10 +-
 caffe2/sgd/yellowfin_op.h                     |  32 +-
 .../contrib/depthwise/depthwise3x3_conv_op.cc |   6 +-
 .../depthwise/depthwise3x3_conv_op_test.cc    |   2 +-
 caffe2/share/contrib/nnpack/conv_op.cc        |  12 +-
 caffe2/share/contrib/nnpack/nnpack_test.cc    |   2 +-
 caffe2/utils/filler.h                         |   4 +-
 caffe2/utils/hip/math_blas_hip_test.cc        |  56 +-
 caffe2/utils/hip/math_hip.cc                  |  16 +-
 caffe2/utils/math.h                           |   9 +-
 caffe2/utils/math_cpu.cc                      |  40 +-
 caffe2/utils/math_gpu.cu                      |  16 +-
 caffe2/utils/math_gpu_test.cc                 | 105 +-
 caffe2/utils/math_test.cc                     |  56 +-
 caffe2/utils/smart_tensor_printer.cc          |   6 +-
 caffe2/utils/smart_tensor_printer.h           |  10 +-
 caffe2/utils/smart_tensor_printer_test.cc     |   2 +-
 caffe2/video/video_input_op.h                 |  28 +-
 modules/detectron/group_spatial_softmax_op.h  |   2 +-
 modules/detectron/select_smooth_l1_loss_op.h  |   4 +-
 .../detectron/sigmoid_cross_entropy_loss_op.h |  10 +-
 modules/detectron/sigmoid_focal_loss_op.h     |   8 +-
 modules/detectron/smooth_l1_loss_op.h         |   4 +-
 modules/detectron/softmax_focal_loss_op.h     |   4 +-
 365 files changed, 3491 insertions(+), 4167 deletions(-)
 delete mode 100644 caffe2/core/context_base.cc
 delete mode 100644 caffe2/core/context_base.h

diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc
index f240ea45f26f7..52b51174cf34d 100644
--- a/binaries/benchmark_helper.cc
+++ b/binaries/benchmark_helper.cc
@@ -160,7 +160,7 @@ void loadInput(
           CAFFE_THROW("Not support GPU on mobile.");
 #endif
         } else {
-          caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
+          caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
           CHECK_NOTNULL(tensor);
           tensor->Resize(input_dims);
           if (input_type_list[i] == "uint8_t") {
@@ -197,7 +197,7 @@ void fillInputBlob(
     int protos_size = tensor_kv.second.protos_size();
     caffe2::TensorProto* tensor_proto =
         tensor_kv.second.mutable_protos(iteration % protos_size);
-    caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
+    caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
     tensor->Resize(std::vector<caffe2::TIndex>());
     if (tensor_proto->data_type() == caffe2::TensorProto::STRING) {
       (tensor->mutable_data<std::string>())[0] = tensor_proto->string_data(0);
@@ -286,7 +286,7 @@ void writeOutput(
 #endif
         } else {
           writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
-              workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU),
+              workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>(),
               output_prefix,
               name);
         }
diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h
index 7e75f557f88ad..0a52e16a50079 100644
--- a/binaries/benchmark_helper.h
+++ b/binaries/benchmark_helper.h
@@ -35,7 +35,7 @@ void writeTextOutput(
     const string& output_prefix,
     const string& name) {
   string output_name = output_prefix + "/" + name + ".txt";
-  caffe2::TensorSerializer ser;
+  caffe2::TensorSerializer<ContextType> ser;
   caffe2::BlobProto blob_proto;
   ser.Serialize(
       *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());
diff --git a/binaries/core_overhead_benchmark.cc b/binaries/core_overhead_benchmark.cc
index 5cb0a62797553..74f19d58e32cd 100644
--- a/binaries/core_overhead_benchmark.cc
+++ b/binaries/core_overhead_benchmark.cc
@@ -139,7 +139,7 @@ BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
 
 static void BM_CudaPointerAffinity(benchmark::State& state) {
   CAFFE2_SKIP_IF_NO_GPU;
-  Tensor tensor(vector<TIndex>{1, 2, 3, 4}, CUDA);
+  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
   float* ptr = tensor.mutable_data<float>();
   while (state.KeepRunning()) {
     volatile int id = GetGPUIDForPointer(ptr);
@@ -198,7 +198,7 @@ static void BM_RawAllocDeallocCPU(benchmark::State& state) {
 BENCHMARK(BM_RawAllocDeallocCPU);
 
 static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
-  Tensor tensor(CPU);
+  Tensor<CPUContext> tensor;
   // small allocation
   tensor.Resize(32, 32);
   while (state.KeepRunning()) {
@@ -210,7 +210,7 @@ BENCHMARK(BM_TensorAllocDeallocCPU);
 
 static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
   CAFFE2_SKIP_IF_NO_GPU;
-  Tensor tensor(CUDA);
+  Tensor<CUDAContext> tensor;
   // small allocation
   tensor.Resize(32, 32);
   while (state.KeepRunning()) {
diff --git a/binaries/print_core_object_sizes.cc b/binaries/print_core_object_sizes.cc
index f99ef09ca4e97..2000c349fb8b9 100644
--- a/binaries/print_core_object_sizes.cc
+++ b/binaries/print_core_object_sizes.cc
@@ -28,7 +28,8 @@
 
 int main(int /* unused */, char** /* unused */) {
   PRINT_SIZE(caffe2::Blob);
-  PRINT_SIZE(caffe2::Tensor);
+  PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
+  PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
   PRINT_SIZE(caffe2::CPUContext);
   PRINT_SIZE(caffe2::CUDAContext);
   PRINT_SIZE(caffe2::OperatorBase);
diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc
index cf6d400fe1e00..196be4a77946c 100644
--- a/binaries/speed_benchmark.cc
+++ b/binaries/speed_benchmark.cc
@@ -136,7 +136,7 @@ int main(int argc, char** argv) {
         if (blob == nullptr) {
           blob = workspace->CreateBlob(input_names[i]);
         }
-        caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
+        caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
         CHECK_NOTNULL(tensor);
         tensor->Resize(input_dims);
         if (input_type_list[i] == "uint8_t") {
diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h
index 9d646d04bf71b..feccafd514cbb 100644
--- a/caffe2/contrib/aten/aten_op_template.h
+++ b/caffe2/contrib/aten/aten_op_template.h
@@ -54,11 +54,11 @@ class ATenOp : public Operator<Context> {
     #undef DEFINE_CASE
   }
 
-  at::Type& typeFor(const Tensor& ten) {
+  at::Type & typeFor(const Tensor<Context> & ten) {
     return at::getType(backend(), atScalarTypeFor(ten.meta()));
   }
-  at::Tensor tensorWrapping(const Tensor& ten_) {
-    auto& ten = const_cast<Tensor&>(ten_);
+  at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
+    auto& ten = const_cast<Tensor<Context>&>(ten_);
     return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
   }
 
@@ -88,7 +88,7 @@ class ATenOp : public Operator<Context> {
     }
     CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
   }
-  void assignTo(Tensor* dst, const at::Tensor& src_) {
+  void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
     at::Tensor src = src_.contiguous();
     auto at_sizes = src.sizes();
     std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
@@ -121,7 +121,7 @@ class ATenOp : public Operator<Context> {
     return s.toLong();
   }
 
-  void assignTo(Tensor* dst, at::Type& inferred_type, at::Scalar scalar) {
+  void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
     switch(inferred_type.scalarType()) {
       #define DEFINE_CASE(ctype,aten_name,native) \
         case at::k##aten_name: { \
@@ -134,8 +134,8 @@ class ATenOp : public Operator<Context> {
         CAFFE_THROW("Unknown ATen Type");
     }
   }
-  template <typename T>
-  void assignToValue(Tensor* dst, T v) {
+  template<typename T>
+  void assignToValue(Tensor<Context> * dst, T v) {
     dst->Resize(std::vector<TIndex>());
     math::Set(1, v, dst->template mutable_data<T>(), &context_);
   }
diff --git a/caffe2/contrib/gloo/common.cc b/caffe2/contrib/gloo/common.cc
index 21ce0343d8181..a3f20b301c0d3 100644
--- a/caffe2/contrib/gloo/common.cc
+++ b/caffe2/contrib/gloo/common.cc
@@ -12,7 +12,7 @@ namespace caffe2 {
 namespace gloo {
 
 void signalFailure(Blob* status_blob, std::exception& /* unused */) {
-  auto* res = status_blob->GetMutableTensor(CPU);
+  auto* res = status_blob->GetMutable<TensorCPU>();
   res->Resize(1);
   res->template mutable_data<int32_t>()[0] = 1;
 }
diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
index 9722d5891334d..102c854736815 100644
--- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
+++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
@@ -17,17 +17,17 @@ nccl::NCCLExecution getNCCLElements(
   ex.elements.resize(op->InputSize());
   for (auto i = 0; i < op->InputSize(); ++i) {
     auto& el = ex.elements[i];
-    el.src = &(op->Input<Tensor>(i, CUDA));
+    el.src = &(op->Input<TensorCUDA>(i));
     if (op->OutputSize() == 1) {
       // Reduce op
       if (i == ex.root) {
-        el.dst = op->Output<Tensor>(0, CUDA);
+        el.dst = op->Output<TensorCUDA>(0);
       }
     } else if (i < op->OutputSize()) {
-      el.dst = op->Output<Tensor>(i, CUDA);
+      el.dst = op->Output<TensorCUDA>(i);
     }
     // TODO - expensive (>1ms) - cache these.
-    el.device = GetGPUIDForPointer(op->Input<Tensor>(i, CUDA).raw_data());
+    el.device = GetGPUIDForPointer(op->Input<TensorCUDA>(i).raw_data());
   }
 
   return ex;
@@ -38,7 +38,7 @@ namespace {
 template <typename T>
 bool AllInputsAre(OperatorBase* op) {
   for (auto i = 0; i < op->InputSize(); ++i) {
-    if (op->Input<Tensor>(i, CUDA).IsType<T>()) {
+    if (op->Input<TensorCUDA>(i).IsType<T>()) {
       continue;
     } else {
       return false;
diff --git a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
index 012eea69c9dc6..3eb0fc3ace3f4 100644
--- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
+++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
@@ -22,7 +22,7 @@ static void AddConstInput(const std::vector<int>& shape, const float value,
   option.set_device_type(CUDA);
   CUDAContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
   tensor->Resize(shape);
   math::Set<float, CUDAContext>(tensor->size(), value,
                                 tensor->mutable_data<float>(),
@@ -54,8 +54,8 @@ TEST(NervanaFullyConnectedTest, Test) {
   EXPECT_TRUE(op->Run());
   Blob* Yblob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, Yblob);
-  auto& Y = Yblob->Get<Tensor>();
-  Tensor Y_cpu(Y, CPU);
+  auto& Y = Yblob->Get<Tensor<CUDAContext>>();
+  TensorCPU Y_cpu(Y);
   EXPECT_EQ(Y.size(), 5 * 6);
   for (int i = 0; i < Y.size(); ++i) {
     CHECK_LT(Y_cpu.data<float>()[i], 10.11);
diff --git a/caffe2/contrib/warpctc/ctc_op.h b/caffe2/contrib/warpctc/ctc_op.h
index 6c27c907726b8..748e3a595206f 100644
--- a/caffe2/contrib/warpctc/ctc_op.h
+++ b/caffe2/contrib/warpctc/ctc_op.h
@@ -47,26 +47,26 @@ class CTCOp final : public Operator<Context> {
     const auto& inputs = Input(INPUTS);
     const auto minibatchSize = inputs.dim(1);
     const auto alphabetSize = inputs.dim(2);
-    const auto& labels = OperatorBase::template Input<Tensor>(LABELS, CPU);
+    const auto& labels = OperatorBase::template Input<TensorCPU>(LABELS);
     const auto& labelLengths =
-        OperatorBase::template Input<Tensor>(LABEL_LENGTHS, CPU);
+        OperatorBase::template Input<TensorCPU>(LABEL_LENGTHS);
     const auto& inputLengths =
-        OperatorBase::template Input<Tensor>(INPUT_LENGTHS, CPU);
+        OperatorBase::template Input<TensorCPU>(INPUT_LENGTHS);
 
     // outputs
-    Tensor* gradients = nullptr;
+    Tensor<Context>* gradients = nullptr;
     TensorCPU* costs;
-    Tensor* workspace;
+    Tensor<Context>* workspace;
     if (!is_test_) {
       // [grads, costs, workspace] to maintain backward compatibility
       gradients = Output(0);
       gradients->ResizeLike(inputs);
-      costs = OperatorBase::template Output<Tensor>(1, CPU);
+      costs = OperatorBase::template Output<TensorCPU>(1);
       costs->ResizeLike(labelLengths);
       workspace = Output(2);
     } else {
       // [costs, workspace]
-      costs = OperatorBase::template Output<Tensor>(0, CPU);
+      costs = OperatorBase::template Output<TensorCPU>(0);
       costs->ResizeLike(labelLengths);
       workspace = Output(1);
     }
diff --git a/caffe2/core/allocator.cc b/caffe2/core/allocator.cc
index 10fa078cf4b82..4edc4915ea69b 100644
--- a/caffe2/core/allocator.cc
+++ b/caffe2/core/allocator.cc
@@ -26,7 +26,7 @@ void SetCPUAllocator(CPUAllocator* alloc) {
   g_cpu_allocator.reset(alloc);
 }
 
-MemoryAllocationReporter CPUStaticContext::reporter_;
+MemoryAllocationReporter CPUContext::reporter_;
 
 void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
   std::lock_guard<std::mutex> guard(mutex_);
diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h
index 93659de70c9c1..c7c020e7a7cc0 100644
--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@@ -9,9 +9,8 @@
 
 #include "caffe2/core/blob_serializer_base.h"
 #include "caffe2/core/common.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/tensor.h"
 #include "caffe2/core/typeid.h"
+#include "caffe2/core/logging.h"
 #include "caffe2/proto/caffe2.pb.h"
 
 namespace caffe2 {
@@ -61,20 +60,6 @@ class Blob {
   template <class T>
   bool IsType() const { return meta_.Match<T>(); }
 
-  // TODO(jerryzh): Remove template
-  template <class T>
-  bool IsType(DeviceType device_type) const {
-    static_assert(
-        std::is_same<T, Tensor>::value,
-        "IsType(DeviceType) only available on "
-        "Tensor types.");
-    auto* tensor = static_cast<Tensor*>(pointer_);
-    if (tensor && tensor->GetDeviceType() == device_type) {
-      return true;
-    }
-    return false;
-  }
-
   /**
    * Returns the meta info of the blob.
    */
@@ -89,7 +74,6 @@ class Blob {
    * @brief Gets the const reference of the stored object. The code checks if
    * the stored object is of the desired type.
    */
-  // TODO(jerryzh): add a Get(DeviceType) function?
   template <class T>
   const T& Get() const {
     CAFFE_ENFORCE(
@@ -139,17 +123,6 @@ class Blob {
     }
   }
 
-  inline Tensor* GetMutableTensor(DeviceType device_type) {
-    if (IsType<Tensor>() &&
-        static_cast<Tensor*>(pointer_)->GetDeviceType() == device_type) {
-      return static_cast<Tensor*>(pointer_);
-    } else {
-      VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
-              << " DeviceType:" << device_type;
-      return Reset<Tensor>(new Tensor(device_type));
-    }
-  }
-
   /**
    * Sets the underlying object to the allocated one. The Blob then takes over
    * the ownership of the passed in pointer. If there is already an object in
diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc
index 536ad02f4ea0c..498f0b5deb55a 100644
--- a/caffe2/core/blob_gpu_test.cc
+++ b/caffe2/core/blob_gpu_test.cc
@@ -17,7 +17,7 @@ TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);
 
 TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
   if (!caffe2::HasCudaGPU()) return;
-  Tensor tensor(CUDA);
+  TensorCUDA tensor;
   EXPECT_EQ(tensor.ndim(), 0);
   vector<int> dims(3);
   dims[0] = 2;
@@ -38,7 +38,7 @@ TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CUDA);
+  TensorCUDA tensor(dims);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 3);
@@ -65,8 +65,8 @@ TYPED_TEST(TensorGPUTest, TensorShareData) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CUDA);
-  Tensor other_tensor(dims, CUDA);
+  TensorCUDA tensor(dims);
+  TensorCUDA other_tensor(dims);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@@ -82,8 +82,8 @@ TYPED_TEST(TensorGPUTest, TensorShareDataCanUseDifferentShapes) {
   dims[2] = 5;
   vector<int> alternate_dims(1);
   alternate_dims[0] = 2 * 3 * 5;
-  Tensor tensor(dims, CUDA);
-  Tensor other_tensor(alternate_dims, CUDA);
+  TensorCUDA tensor(dims);
+  TensorCUDA other_tensor(alternate_dims);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(other_tensor.ndim(), 1);
@@ -99,8 +99,8 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CUDA);
-  Tensor other_tensor(dims, CUDA);
+  TensorCUDA tensor(dims);
+  TensorCUDA other_tensor(dims);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@@ -115,7 +115,7 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
 TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
   if (!HasCudaGPU()) return;
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
-  Tensor tensor(CUDA);
+  TensorCUDA tensor;
   EXPECT_EQ(tensor.ndim(), 0);
   EXPECT_THROW(tensor.data<TypeParam>(), EnforceNotMet);
 }
@@ -126,12 +126,12 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
       return;                                                              \
     }                                                                      \
     Blob blob;                                                             \
-    Tensor cpu_tensor(CPU);                                                \
+    TensorCPU cpu_tensor;                                                  \
     cpu_tensor.Resize(2, 3);                                               \
     for (int i = 0; i < 6; ++i) {                                          \
       cpu_tensor.mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
     }                                                                      \
-    blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor);                     \
+    blob.GetMutable<TensorCUDA>()->CopyFrom(cpu_tensor);                   \
     string serialized = blob.Serialize("test");                            \
     BlobProto proto;                                                       \
     CAFFE_ENFORCE(proto.ParseFromString(serialized));                      \
@@ -148,8 +148,8 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
     }                                                                      \
     Blob new_blob;                                                         \
     EXPECT_NO_THROW(new_blob.Deserialize(serialized));                     \
-    EXPECT_TRUE(new_blob.IsType<Tensor>(CUDA));                            \
-    Tensor new_cpu_tensor(blob.Get<Tensor>(), CPU);                        \
+    EXPECT_TRUE(new_blob.IsType<TensorCUDA>());                            \
+    TensorCPU new_cpu_tensor(blob.Get<TensorCUDA>());                      \
     EXPECT_EQ(new_cpu_tensor.ndim(), 2);                                   \
     EXPECT_EQ(new_cpu_tensor.dim(0), 2);                                   \
     EXPECT_EQ(new_cpu_tensor.dim(1), 3);                                   \
@@ -172,7 +172,7 @@ TEST_SERIALIZATION_GPU_WITH_TYPE(int64_t, int64_data)
 
 TEST(TensorTest, TensorSerializationMultiDevices) {
   Blob blob;
-  Tensor tensor(CPU);
+  TensorCPU tensor;
   tensor.Resize(2, 3);
   for (int i = 0; i < 6; ++i) {
     tensor.mutable_data<float>()[i] = i;
@@ -180,7 +180,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
   for (int gpu_id = 0; gpu_id < NumCudaDevices(); ++gpu_id) {
     DeviceGuard guard(gpu_id);
     CUDAContext context(gpu_id);
-    blob.Reset(new Tensor(tensor, &context, CUDA));
+    blob.Reset(new TensorCUDA(tensor, &context));
     string serialized = blob.Serialize("test");
     BlobProto proto;
     CAFFE_ENFORCE(proto.ParseFromString(serialized));
@@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
     // Test if the restored blob is still of the same device.
     blob.Reset();
     EXPECT_NO_THROW(blob.Deserialize(serialized));
-    EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
+    EXPECT_TRUE(blob.IsType<TensorCUDA>());
     EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()),
               gpu_id);
     // Test if we force the restored blob on a different device, we
@@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
     blob.Reset();
     proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0);
     EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString()));
-    EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
+    EXPECT_TRUE(blob.IsType<TensorCUDA>());
     EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()), 0);
   }
 }
diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index b870aa39067e6..a7cbb4186a68c 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -33,7 +33,7 @@ class StringSerializer : public BlobSerializerBase {
   StringSerializer() {}
   ~StringSerializer() {}
   /**
-   * Serializes a Blob. Note that this blob has to contain Tensor,
+   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
    * otherwise this function produces a fatal error.
    */
   void Serialize(
@@ -83,242 +83,12 @@ std::string Blob::Serialize(const string& name) const {
   return data;
 }
 
-void TensorSerializer::Serialize(
-    const Blob& blob,
-    const string& name,
-    BlobSerializerBase::SerializationAcceptor acceptor) {
-  this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
-}
-
-void TensorSerializer::SerializeWithChunkSize(
-    const Blob& blob,
-    const string& name,
-    BlobSerializerBase::SerializationAcceptor acceptor,
-    int chunk_size) {
-  CAFFE_ENFORCE(blob.IsType<Tensor>());
-  const auto& tensor = blob.template Get<Tensor>();
-  if (chunk_size == kNoChunking) {
-    chunk_size = tensor.size() + 1; // to account for empty tensors
-  } else if (chunk_size == kDefaultChunkSize) {
-    chunk_size = FLAGS_caffe2_tensor_chunk_size;
-  }
-
-  auto processChunk = [&](int64_t chunkStart) {
-    BlobProto blob_proto;
-    blob_proto.set_name(name);
-    blob_proto.set_type(kTensorBlobType);
-    TensorProto& proto = *blob_proto.mutable_tensor();
-    proto.set_name(name);
-    this->Serialize(
-        tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
-    acceptor(
-        MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
-        blob_proto.SerializeAsString());
-  };
-
-#ifndef __ANDROID__
-  std::vector<std::future<void>> futures;
-  // Poorman's IOBound ThreadPool
-  SimpleQueue<size_t> chunkQueue;
-  auto task = [&]() {
-    size_t chunkStart;
-    while (chunkQueue.Pop(&chunkStart)) {
-      processChunk(chunkStart);
-    }
-  };
-  if (tensor.size() > chunk_size) {
-    for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
-      futures.emplace_back(std::async(std::launch::async, task));
-    }
-  }
-#endif
-
-  VLOG(1) << "Serializing blob " << name;
-  // Serialize whole vector. If vector is empty, it's shape still needs to be
-  // serialized in empty proto
-  for (size_t chunkBegin = 0;
-       chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
-       chunkBegin += chunk_size) {
-    VLOG(2) << "Starting a chunk at " << chunkBegin;
-#ifndef __ANDROID__
-    if (tensor.size() > chunk_size) {
-      chunkQueue.Push(chunkBegin);
-    } else {
-      // Sync mode for small tensors
-      processChunk(chunkBegin);
-    }
-#else
-    // Since Android does not have std::future, we will always do sync mode
-    processChunk(chunkBegin);
-#endif
-  }
-
-#ifndef __ANDROID__
-  chunkQueue.NoMoreJobs();
-  for (auto& fut : futures) {
-    fut.get();
-  }
-#endif
-}
-
-void TensorSerializer::Serialize(
-    const Tensor& input,
-    const string& /*name*/,
-    TensorProto* proto_ptr,
-    size_t chunkBegin,
-    int32_t chunkSize) {
-  CAFFE_ENFORCE(
-      chunkBegin <= input.size(),
-      "Chunk begin is out of tensor: ",
-      chunkBegin,
-      ' ',
-      input.size());
-  if (chunkBegin + chunkSize > input.size()) {
-    chunkSize = input.size() - chunkBegin;
-  }
-
-  CAFFE_ENFORCE(
-      input.raw_data() || chunkSize == 0,
-      "The input does not have data input yet. This is probably because you "
-      "created a tensor of non-zero shape but never filled its data via "
-      "mutable_data() calls. This means that it makes no sense to serialize "
-      "the tensor content.");
-
-  TensorProto& proto = *proto_ptr;
-  proto.mutable_segment()->set_begin(chunkBegin);
-  proto.mutable_segment()->set_end(chunkBegin + chunkSize);
-
-  for (int i = 0; i < input.ndim(); ++i) {
-    proto.add_dims(input.dim(i));
-  }
-  const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
-  proto.set_data_type(data_type);
-  StoreDeviceDetail(input, &proto);
-  auto uniq_ptr = input.GetStaticContext()->CreateContext();
-  // A lot of copypaste is error prone. Should we create a macro for this?
-  switch (data_type) {
-    case TensorProto_DataType_FLOAT:
-      detail::CopyToProtoAsIs(
-          chunkSize,
-          input.template data<float>() + chunkBegin,
-          proto.mutable_float_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_INT32:
-      detail::CopyToProtoAsIs(
-          chunkSize,
-          input.template data<int>() + chunkBegin,
-          proto.mutable_int32_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_BYTE:
-      LOG(FATAL) << "This should not happen. When serializing, "
-                    "BYTE is deprecated and moved to UINT8.";
-      break;
-    case TensorProto_DataType_STRING: {
-      proto.mutable_string_data()->Reserve(chunkSize);
-      const string* content = input.template data<string>();
-      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
-        proto.add_string_data(content[i]);
-      }
-      break;
-    }
-    case TensorProto_DataType_BOOL:
-      detail::CopyToProtoWithCast(
-          chunkSize,
-          input.template data<bool>() + chunkBegin,
-          proto.mutable_int32_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_UINT8:
-      detail::CopyToProtoWithCast(
-          chunkSize,
-          input.template data<uint8_t>() + chunkBegin,
-          proto.mutable_int32_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_INT8:
-      detail::CopyToProtoWithCast(
-          chunkSize,
-          input.template data<int8_t>() + chunkBegin,
-          proto.mutable_int32_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_UINT16:
-      detail::CopyToProtoWithCast(
-          chunkSize,
-          input.template data<uint16_t>() + chunkBegin,
-          proto.mutable_int32_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_INT16:
-      detail::CopyToProtoWithCast(
-          chunkSize,
-          input.template data<int16_t>() + chunkBegin,
-          proto.mutable_int32_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_INT64:
-      detail::CopyToProtoAsIs(
-          chunkSize,
-          input.template data<int64_t>() + chunkBegin,
-          proto.mutable_int64_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_FLOAT16: {
-      if (FLAGS_caffe2_serialize_fp16_as_bytes) {
-        const int kValue = 1;
-        CAFFE_ENFORCE_EQ(
-            reinterpret_cast<const char*>(&kValue)[0],
-            1,
-            "Serialization of FLOAT16 on big endian platform "
-            "is not written yet.");
-        unique_ptr<char[]> buffer(new char[2 * chunkSize]);
-        this->context_->template CopyToCPU<char>(
-            2 * chunkSize,
-            reinterpret_cast<const char*>(
-                input.template data<float16>() + chunkBegin),
-            buffer.get());
-        this->context_->FinishDeviceComputation();
-        proto.set_byte_data(buffer.release(), 2 * chunkSize);
-      } else {
-        detail::CopyToProtoWithCast(
-            chunkSize,
-            reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
-                chunkBegin,
-            proto.mutable_int32_data(),
-            uniq_ptr.get());
-      }
-    } break;
-    case TensorProto_DataType_DOUBLE:
-      detail::CopyToProtoAsIs(
-          chunkSize,
-          input.template data<double>() + chunkBegin,
-          proto.mutable_double_data(),
-          uniq_ptr.get());
-      break;
-    case TensorProto_DataType_UNDEFINED: {
-      proto.mutable_string_data()->Reserve(chunkSize);
-      Blob temp_blob;
-      const char* raw_data = static_cast<const char*>(input.raw_data());
-      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
-        temp_blob.ShareExternal(
-            const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
-        proto.add_string_data(temp_blob.Serialize(""));
-      }
-    } break;
-      // Note: we intentially do not provide "default:" so if any new data types
-      // are added, the compiler should warn the user to add the case here.
-  }
-}
+// Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
+template <>
+void TensorSerializer<CPUContext>::StoreDeviceDetail(
+    const Tensor<CPUContext>& /*input*/,
+    TensorProto* /*proto*/) {}
 
-int GetGPUIDForPointer(const void* ptr);
-
-void TensorSerializer::StoreDeviceDetail(
-    const Tensor& input,
-    TensorProto* proto) {
-  input.ExtractDeviceOption(proto->mutable_device_detail());
-}
 // The actual serialization registry objects.
 CAFFE_DEFINE_TYPED_REGISTRY(
     BlobSerializerRegistry,
@@ -357,176 +127,12 @@ void Blob::Deserialize(const BlobProto& blob_proto) {
   }
 }
 
-void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) {
-  auto tensor_proto = blob_proto.tensor();
-  Deserialize(
-      tensor_proto,
-      blob->GetMutableTensor(
-          static_cast<DeviceType>(tensor_proto.device_detail().device_type())));
-}
-
-void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) {
-  // We create a local context for deserializing. Since Caffe2 contexts are
-  // usually lightweight, this should not involve too much overhead.
-  auto uniq_ptr =
-      tensor->GetStaticContext()->CreateContext(proto.device_detail());
-  auto context = uniq_ptr.get();
-  context->SwitchToDevice(0);
-  vector<TIndex> dims;
-  for (const TIndex d : proto.dims()) {
-    dims.push_back(d);
-  }
-  tensor->Resize(dims);
-
-  int64_t chunkBegin = 0;
-  auto chunkEnd = tensor->size();
-  if (proto.has_segment()) {
-    chunkBegin = proto.segment().begin();
-    chunkEnd = proto.segment().end();
-  }
-  CAFFE_ENFORCE(
-      0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
-      "Invalid chunk ",
-      chunkBegin,
-      ' ',
-      chunkEnd,
-      " with total tensor size ",
-      tensor->size());
-  auto chunkSize = chunkEnd - chunkBegin;
-
-  switch (proto.data_type()) {
-    case TensorProto_DataType_FLOAT:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.float_data(),
-          tensor->template mutable_data<float>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_INT32:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_BYTE:
-      // Since BYTE stores the data in a string field instead of a repreated
-      // field we will have it special cased.
-      CAFFE_ENFORCE_EQ(
-          chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
-      context->template CopyToCPU<uint8_t>(
-          chunkSize,
-          reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
-          tensor->template mutable_data<uint8_t>() + chunkBegin);
-      break;
-    case TensorProto_DataType_STRING:
-      // Special handing of string because it is a non-fundamental type.
-      {
-        string* content = tensor->template mutable_data<string>();
-        for (int i = 0; i < chunkSize; ++i) {
-          content[i + chunkBegin] = proto.string_data(i);
-        }
-      }
-      break;
-    case TensorProto_DataType_BOOL:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<bool>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_UINT8:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<uint8_t>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_INT8:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int8_t>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_UINT16:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<uint16_t>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_INT16:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int16_t>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_INT64:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.int64_data(),
-          tensor->template mutable_data<int64_t>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_FLOAT16:
-      if (proto.has_byte_data()) {
-        const int kValue = 1;
-        CAFFE_ENFORCE_EQ(
-            reinterpret_cast<const char*>(&kValue)[0],
-            1,
-            "Serialization of FLOAT16 on big endian platform "
-            "is not written yet.");
-        CAFFE_ENFORCE_EQ(
-            2 * chunkSize,
-            proto.byte_data().size(),
-            "Incorrect proto field size.");
-        context->template CopyToCPU<float16>(
-            chunkSize,
-            reinterpret_cast<const float16*>(proto.byte_data().data()),
-            tensor->template mutable_data<float16>() + chunkBegin);
-      } else {
-        // Backward compatibility with models which used int32_data field
-        detail::CopyFromProtoWithCast(
-            chunkSize,
-            proto.int32_data(),
-            reinterpret_cast<uint16_t*>(
-                tensor->template mutable_data<float16>()) +
-                chunkBegin,
-            context);
-      }
-      break;
-    case TensorProto_DataType_DOUBLE:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.double_data(),
-          tensor->template mutable_data<double>() + chunkBegin,
-          context);
-      break;
-    case TensorProto_DataType_UNDEFINED: {
-      Blob temp_blob;
-      void* raw_ptr = nullptr;
-      for (int i = 0; i < chunkSize; ++i) {
-        temp_blob.Deserialize(proto.string_data(i));
-        if (i == 0) {
-          raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
-        }
-        temp_blob.meta().copy()(
-            temp_blob.GetRaw(),
-            static_cast<char*>(raw_ptr) +
-                (i + chunkBegin) * temp_blob.meta().itemsize(),
-            1);
-      }
-    }
-  }
-  context->FinishDeviceComputation();
-}
-
 namespace {
-// Serialize Tensor
-REGISTER_BLOB_SERIALIZER((TypeMeta::Id<Tensor>()), TensorSerializer);
-REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer);
+// Serialize TensorCPU.
+REGISTER_BLOB_SERIALIZER(
+    (TypeMeta::Id<TensorCPU>()),
+    TensorSerializer<CPUContext>);
+REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer<CPUContext>);
 // Serialize std::string
 REGISTER_BLOB_SERIALIZER((TypeMeta::Id<std::string>()), StringSerializer);
 REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer);
diff --git a/caffe2/core/blob_serialization.h b/caffe2/core/blob_serialization.h
index 18cb95d541b4b..94af8a9fcacac 100644
--- a/caffe2/core/blob_serialization.h
+++ b/caffe2/core/blob_serialization.h
@@ -42,12 +42,13 @@ inline unique_ptr<BlobSerializerBase> CreateSerializer(CaffeTypeId id) {
  * TensorSerializer takes in a blob that contains a Tensor, and serializes it
  * into a TensorProto protocol buffer.
  */
+template <class Context>
 class TensorSerializer : public BlobSerializerBase {
  public:
-  TensorSerializer() {}
+  TensorSerializer() : context_() {}
   ~TensorSerializer() override {}
   /**
-   * Serializes a Blob. Note that this blob has to contain Tensor,
+   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
    * otherwise this function produces a fatal error.
    */
   void Serialize(
@@ -60,17 +61,13 @@ class TensorSerializer : public BlobSerializerBase {
       SerializationAcceptor acceptor,
       int chunk_size) override;
 
-  void Serialize(
-      const Tensor& tensor,
-      const string& name,
-      TensorProto* proto,
-      size_t chunkBegin,
-      int32_t chunkSize);
+  void Serialize(const Tensor<Context>& tensor, const string& name,
+                 TensorProto* proto, size_t chunkBegin, int32_t chunkSize);
 
  private:
   // A utility function to store the device context detauls.
-  void StoreDeviceDetail(const Tensor& input, TensorProto* proto);
-  unique_ptr<BaseContext> context_;
+  void StoreDeviceDetail(const Tensor<Context>& input, TensorProto* proto);
+  Context context_;
 };
 
 /**
@@ -101,10 +98,11 @@ inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
  * tensor, change the TensorProto's corresponding fields before calling
  * Deserialize.
  */
+template <class Context>
 class TensorDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override;
-  void Deserialize(const TensorProto& proto, Tensor* tensor);
+  void Deserialize(const TensorProto& proto, Tensor<Context>* tensor);
 };
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -112,12 +110,12 @@ class TensorDeserializer : public BlobDeserializerBase {
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace detail {
-template <typename SrcType, typename DstType>
+template <typename SrcType, typename DstType, class Context>
 inline void CopyToProtoAsIs(
     const size_t size,
     const SrcType* src,
     google::protobuf::RepeatedField<DstType>* field,
-    BaseContext* context) {
+    Context* context) {
   static_assert(
       sizeof(SrcType) == sizeof(DstType),
       "The source type and dest type cannot be copied as-is. Did "
@@ -126,22 +124,23 @@ inline void CopyToProtoAsIs(
   for (int i = 0; i < size; ++i) {
     field->Add(0);
   }
-  context->template CopyToCPU<SrcType>(
+  context->template Copy<SrcType, Context, CPUContext>(
       size, src, reinterpret_cast<SrcType*>(field->mutable_data()));
   // Make sure that we finish the copy into the protobuf.
   context->FinishDeviceComputation();
 }
 
-template <typename SrcType, typename DstType>
+template <typename SrcType, typename DstType, class Context>
 inline void CopyToProtoWithCast(
     const size_t size,
     const SrcType* src,
     google::protobuf::RepeatedField<DstType>* field,
-    BaseContext* context) {
+    Context* context) {
   // TODO: we are having one unnecessary copy here if the context is already
   // CPUContext. Remove it if it is performance critical.
   unique_ptr<SrcType[]> buffer(new SrcType[size]);
-  context->template CopyToCPU<SrcType>(size, src, buffer.get());
+  context->template Copy<SrcType, Context, CPUContext>(
+      size, src, buffer.get());
   context->FinishDeviceComputation();
   field->Reserve(size);
   for (int i = 0; i < size; ++i) {
@@ -149,27 +148,27 @@ inline void CopyToProtoWithCast(
   }
 }
 
-template <typename SrcType, typename DstType>
+template <typename SrcType, typename DstType, class Context>
 inline void CopyFromProtoAsIs(
     const size_t size,
     const google::protobuf::RepeatedField<SrcType>& field,
     DstType* dst,
-    BaseContext* context) {
+    Context* context) {
   static_assert(
       sizeof(SrcType) == sizeof(DstType),
       "The source type and dest type cannot be copied as-is. Did "
       "you mean CopyFromProtoWithCast?");
   CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
-  context->template CopyFromCPU<DstType>(
+  context->template Copy<DstType, CPUContext, Context>(
       size, reinterpret_cast<const DstType*>(field.data()), dst);
 }
 
-template <typename SrcType, typename DstType>
+template <typename SrcType, typename DstType, class Context>
 inline void CopyFromProtoWithCast(
     const size_t size,
     const google::protobuf::RepeatedField<SrcType>& field,
     DstType* dst,
-    BaseContext* context) {
+    Context* context) {
   CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
   // TODO: we are having one unnecessary copy here if the context is already
   // CPUContext. Remove it if it is performance critical.
@@ -178,10 +177,410 @@ inline void CopyFromProtoWithCast(
   for (int i = 0; i < size; ++i) {
     buffer[i] = static_cast<DstType>(src[i]);
   }
-  context->template CopyFromCPU<DstType>(size, buffer.get(), dst);
+  context->template Copy<DstType, CPUContext, Context>(size, buffer.get(), dst);
 }
 
 }  // namespace detail
+
+template <class Context>
+void TensorSerializer<Context>::Serialize(
+    const Blob& blob,
+    const string& name,
+    BlobSerializerBase::SerializationAcceptor acceptor) {
+  this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
+}
+
+template <class Context>
+void TensorSerializer<Context>::SerializeWithChunkSize(
+    const Blob& blob,
+    const string& name,
+    BlobSerializerBase::SerializationAcceptor acceptor,
+    int chunk_size) {
+  CAFFE_ENFORCE(blob.IsType<Tensor<Context>>());
+  const auto& tensor = blob.template Get<Tensor<Context>>();
+  if (chunk_size == kNoChunking) {
+    chunk_size = tensor.size() + 1; // to account for empty tensors
+  } else if (chunk_size == kDefaultChunkSize) {
+    chunk_size = FLAGS_caffe2_tensor_chunk_size;
+  }
+
+  auto processChunk = [&](int64_t chunkStart) {
+    BlobProto blob_proto;
+    blob_proto.set_name(name);
+    blob_proto.set_type(kTensorBlobType);
+    TensorProto& proto = *blob_proto.mutable_tensor();
+    proto.set_name(name);
+    this->Serialize(
+        tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
+    acceptor(
+        MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
+        blob_proto.SerializeAsString());
+  };
+
+#ifndef __ANDROID__
+  std::vector<std::future<void>> futures;
+  // Poorman's IOBound ThreadPool
+  SimpleQueue<size_t> chunkQueue;
+  auto task = [&]() {
+    size_t chunkStart;
+    while (chunkQueue.Pop(&chunkStart)) {
+      processChunk(chunkStart);
+    }
+  };
+  if (tensor.size() > chunk_size) {
+    for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
+      futures.emplace_back(std::async(std::launch::async, task));
+    }
+  }
+#endif
+
+  VLOG(1) << "Serializing blob " << name;
+  // Serialize whole vector. If vector is empty, it's shape still needs to be
+  // serialized in empty proto
+  for (size_t chunkBegin = 0;
+       chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
+       chunkBegin += chunk_size) {
+    VLOG(2) << "Starting a chunk at " << chunkBegin;
+#ifndef __ANDROID__
+    if (tensor.size() > chunk_size) {
+      chunkQueue.Push(chunkBegin);
+    } else {
+      // Sync mode for small tensors
+      processChunk(chunkBegin);
+    }
+#else
+    // Since Android does not have std::future, we will always do sync mode
+    processChunk(chunkBegin);
+#endif
+  }
+
+#ifndef __ANDROID__
+  chunkQueue.NoMoreJobs();
+  for (auto& fut : futures) {
+    fut.get();
+  }
+#endif
+}
+
+template <class Context>
+void TensorSerializer<Context>::Serialize(
+    const Tensor<Context>& input,
+    const string& /*name*/,
+    TensorProto* proto_ptr,
+    size_t chunkBegin,
+    int32_t chunkSize) {
+  CAFFE_ENFORCE(
+      chunkBegin <= input.size(),
+      "Chunk begin is out of tensor: ",
+      chunkBegin,
+      ' ',
+      input.size());
+  if (chunkBegin + chunkSize > input.size()) {
+    chunkSize = input.size() - chunkBegin;
+  }
+
+  CAFFE_ENFORCE(
+      input.raw_data() || chunkSize == 0,
+      "The input does not have data input yet. This is probably because you "
+      "created a tensor of non-zero shape but never filled its data via "
+      "mutable_data() calls. This means that it makes no sense to serialize "
+      "the tensor content.");
+
+  TensorProto& proto = *proto_ptr;
+  proto.mutable_segment()->set_begin(chunkBegin);
+  proto.mutable_segment()->set_end(chunkBegin + chunkSize);
+
+  for (int i = 0; i < input.ndim(); ++i) {
+    proto.add_dims(input.dim(i));
+  }
+  const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
+  proto.set_data_type(data_type);
+  StoreDeviceDetail(input, &proto);
+
+  // A lot of copypaste is error prone. Should we create a macro for this?
+  switch (data_type) {
+  case TensorProto_DataType_FLOAT:
+    detail::CopyToProtoAsIs(
+        chunkSize,
+        input.template data<float>() + chunkBegin,
+        proto.mutable_float_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_INT32:
+    detail::CopyToProtoAsIs(
+        chunkSize,
+        input.template data<int>() + chunkBegin,
+        proto.mutable_int32_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_BYTE:
+    LOG(FATAL) << "This should not happen. When serializing, "
+                  "BYTE is deprecated and moved to UINT8.";
+    break;
+  case TensorProto_DataType_STRING:
+    {
+      proto.mutable_string_data()->Reserve(chunkSize);
+      const string* content = input.template data<string>();
+      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
+        proto.add_string_data(content[i]);
+      }
+      break;
+    }
+  case TensorProto_DataType_BOOL:
+    detail::CopyToProtoWithCast(
+        chunkSize,
+        input.template data<bool>() + chunkBegin,
+        proto.mutable_int32_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_UINT8:
+    detail::CopyToProtoWithCast(
+        chunkSize,
+        input.template data<uint8_t>() + chunkBegin,
+        proto.mutable_int32_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_INT8:
+    detail::CopyToProtoWithCast(
+        chunkSize,
+        input.template data<int8_t>() + chunkBegin,
+        proto.mutable_int32_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_UINT16:
+    detail::CopyToProtoWithCast(
+        chunkSize,
+        input.template data<uint16_t>() + chunkBegin,
+        proto.mutable_int32_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_INT16:
+    detail::CopyToProtoWithCast(
+        chunkSize,
+        input.template data<int16_t>() + chunkBegin,
+        proto.mutable_int32_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_INT64:
+    detail::CopyToProtoAsIs(
+        chunkSize,
+        input.template data<int64_t>() + chunkBegin,
+        proto.mutable_int64_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_FLOAT16: {
+    if (FLAGS_caffe2_serialize_fp16_as_bytes) {
+      const int kValue = 1;
+      CAFFE_ENFORCE_EQ(
+          reinterpret_cast<const char*>(&kValue)[0],
+          1,
+          "Serialization of FLOAT16 on big endian platform "
+          "is not written yet.");
+      unique_ptr<char[]> buffer(new char[2 * chunkSize]);
+      this->context_.template Copy<char, Context, CPUContext>(
+          2 * chunkSize,
+          reinterpret_cast<const char*>(
+              input.template data<float16>() + chunkBegin),
+          buffer.get());
+      this->context_.FinishDeviceComputation();
+      proto.set_byte_data(buffer.release(), 2 * chunkSize);
+    } else {
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
+              chunkBegin,
+          proto.mutable_int32_data(),
+          &this->context_);
+    }
+  } break;
+  case TensorProto_DataType_DOUBLE:
+    detail::CopyToProtoAsIs(
+        chunkSize,
+        input.template data<double>() + chunkBegin,
+        proto.mutable_double_data(),
+        &this->context_);
+    break;
+  case TensorProto_DataType_UNDEFINED: {
+    proto.mutable_string_data()->Reserve(chunkSize);
+    Blob temp_blob;
+    const char* raw_data = static_cast<const char*>(input.raw_data());
+    for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
+      temp_blob.ShareExternal(
+          const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
+      proto.add_string_data(temp_blob.Serialize(""));
+    }
+  } break;
+    // Note: we intentially do not provide "default:" so if any new data types
+    // are added, the compiler should warn the user to add the case here.
+  }
+}
+
+template <class Context>
+void TensorDeserializer<Context>::Deserialize(
+    const BlobProto& blob_proto,
+    Blob* blob) {
+  Deserialize(blob_proto.tensor(), blob->GetMutable<Tensor<Context>>());
+}
+
+template <class Context>
+void TensorDeserializer<Context>::Deserialize(
+    const TensorProto& proto,
+    Tensor<Context>* tensor) {
+  // We create a local context for deserializing. Since Caffe2 contexts are
+  // usually lightweighted, this should not involve too much overhead.
+  Context context(proto.device_detail());
+  context.SwitchToDevice(0);
+  vector<TIndex> dims;
+  for (const TIndex d : proto.dims()) {
+    dims.push_back(d);
+  }
+  tensor->Resize(dims);
+
+  int64_t chunkBegin = 0;
+  auto chunkEnd = tensor->size();
+  if (proto.has_segment()) {
+    chunkBegin = proto.segment().begin();
+    chunkEnd = proto.segment().end();
+  }
+  CAFFE_ENFORCE(
+      0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
+      "Invalid chunk ",
+      chunkBegin,
+      ' ',
+      chunkEnd,
+      " with total tensor size ",
+      tensor->size());
+  auto chunkSize = chunkEnd - chunkBegin;
+
+  switch (proto.data_type()) {
+    case TensorProto_DataType_FLOAT:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.float_data(),
+          tensor->template mutable_data<float>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_INT32:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_BYTE:
+      // Since BYTE stores the data in a string field instead of a repreated
+      // field we will have it special cased.
+      CAFFE_ENFORCE_EQ(
+          chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
+      context.template Copy<uint8_t, Context, CPUContext>(
+          chunkSize,
+          reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
+          tensor->template mutable_data<uint8_t>() + chunkBegin);
+      break;
+    case TensorProto_DataType_STRING:
+      // Special handing of string because it is a non-fundamental type.
+      {
+        string* content = tensor->template mutable_data<string>();
+        for (int i = 0; i < chunkSize; ++i) {
+          content[i + chunkBegin] = proto.string_data(i);
+        }
+      }
+      break;
+    case TensorProto_DataType_BOOL:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<bool>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_UINT8:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<uint8_t>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_INT8:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int8_t>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_UINT16:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<uint16_t>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_INT16:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int16_t>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_INT64:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.int64_data(),
+          tensor->template mutable_data<int64_t>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_FLOAT16:
+      if (proto.has_byte_data()) {
+        const int kValue = 1;
+        CAFFE_ENFORCE_EQ(
+            reinterpret_cast<const char*>(&kValue)[0],
+            1,
+            "Serialization of FLOAT16 on big endian platform "
+            "is not written yet.");
+        CAFFE_ENFORCE_EQ(
+            2 * chunkSize,
+            proto.byte_data().size(),
+            "Incorrect proto field size.");
+        context.template Copy<float16, Context, CPUContext>(
+            chunkSize,
+            reinterpret_cast<const float16*>(proto.byte_data().data()),
+            tensor->template mutable_data<float16>() + chunkBegin);
+      } else {
+        // Backward compatibility with models which used int32_data field
+        detail::CopyFromProtoWithCast(
+            chunkSize,
+            proto.int32_data(),
+            reinterpret_cast<uint16_t*>(
+                tensor->template mutable_data<float16>()) +
+                chunkBegin,
+            &context);
+      }
+      break;
+    case TensorProto_DataType_DOUBLE:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.double_data(),
+          tensor->template mutable_data<double>() + chunkBegin,
+          &context);
+      break;
+    case TensorProto_DataType_UNDEFINED: {
+      Blob temp_blob;
+      void* raw_ptr = nullptr;
+      for (int i = 0; i < chunkSize; ++i) {
+        temp_blob.Deserialize(proto.string_data(i));
+        if (i == 0) {
+          raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
+        }
+        temp_blob.meta().copy()(
+            temp_blob.GetRaw(),
+            static_cast<char*>(raw_ptr) +
+                (i + chunkBegin) * temp_blob.meta().itemsize(),
+            1);
+      }
+    }
+  }
+  context.FinishDeviceComputation();
+}
+
 }  // namespace caffe2
 
 #endif  // CAFFE2_CORE_BLOB_SERIALIZATION_H_
diff --git a/caffe2/core/blob_serialization_gpu.cc b/caffe2/core/blob_serialization_gpu.cc
index 4d675354531c8..76349f3173dbe 100644
--- a/caffe2/core/blob_serialization_gpu.cc
+++ b/caffe2/core/blob_serialization_gpu.cc
@@ -4,7 +4,20 @@
 
 namespace caffe2 {
 
+template <>
+void TensorSerializer<CUDAContext>::StoreDeviceDetail(
+    const Tensor<CUDAContext>& input, TensorProto* proto) {
+  auto* device_detail = proto->mutable_device_detail();
+  device_detail->set_device_type(CUDA);
+  device_detail->set_cuda_gpu_id(
+      GetGPUIDForPointer(input.raw_data()));
+}
+
 namespace {
-REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer);
+REGISTER_BLOB_SERIALIZER(
+    (TypeMeta::Id<TensorCUDA>()),
+    TensorSerializer<CUDAContext>);
+REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer<CUDAContext>);
 }
 }  // namespace caffe2
+
diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc
index 40e53a2840ae8..3fafbf2fc5028 100644
--- a/caffe2/core/blob_test.cc
+++ b/caffe2/core/blob_test.cc
@@ -47,7 +47,7 @@ class BlobTestFooSerializer : public BlobSerializerBase {
   BlobTestFooSerializer() {}
   ~BlobTestFooSerializer() {}
   /**
-   * Serializes a Blob. Note that this blob has to contain Tensor,
+   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
    * otherwise this function produces a fatal error.
    */
   void Serialize(
@@ -181,7 +181,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
 
   auto* ptr = tensor.mutable_data<int>();
   EXPECT_TRUE(ptr != nullptr);
@@ -200,7 +200,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {
 
   // share the data with other tensor so that the pointer won't be reused
   // when we reallocate
-  Tensor other_tensor(dims, CPU);
+  TensorCPU other_tensor(dims);
   other_tensor.ShareData(tensor);
   // but double is bigger, so it should allocate a new one
   auto* doubleptr = tensor.mutable_data<double>();
@@ -215,7 +215,7 @@ TEST(TensorNonTypedTest, NonDefaultConstructible) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
 
   // this doesn't compile - good!
   // auto* ptr = tensor.mutable_data<BlobTestNonDefaultConstructible>();
@@ -232,7 +232,7 @@ TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
 TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);
 
 TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
-  Tensor tensor(CPU);
+  TensorCPU tensor;
   EXPECT_EQ(tensor.ndim(), 0);
   vector<int> dims(3);
   dims[0] = 2;
@@ -253,7 +253,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 3);
@@ -279,7 +279,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedZeroDim) {
   dims[0] = 2;
   dims[1] = 0;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 0);
@@ -293,7 +293,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   EXPECT_EQ(tensor.ndim(), 3);
   EXPECT_EQ(tensor.dim32(0), 2);
   EXPECT_EQ(tensor.dim32(1), 3);
@@ -317,7 +317,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {
 
 TYPED_TEST(TensorCPUTest, TensorInitializedScalar) {
   vector<int> dims;
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   EXPECT_EQ(tensor.ndim(), 0);
   EXPECT_EQ(tensor.size(), 1);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
@@ -329,8 +329,8 @@ TYPED_TEST(TensorCPUTest, TensorShareData) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
-  Tensor other_tensor(dims, CPU);
+  TensorCPU tensor(dims);
+  TensorCPU other_tensor(dims);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@@ -349,7 +349,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointer) {
   dims[1] = 3;
   dims[2] = 5;
   std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2*3*5]);
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   tensor.ShareExternalPointer(raw_buffer.get());
   EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
   EXPECT_EQ(tensor.data<TypeParam>(), raw_buffer.get());
@@ -366,7 +366,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {
   dims[1] = 3;
   dims[2] = 5;
   std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2 * 3 * 5]);
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   TypeMeta meta = TypeMeta::Make<TypeParam>();
   tensor.ShareExternalPointer(raw_buffer.get(), meta);
   EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
@@ -380,7 +380,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {
 
 TYPED_TEST(TensorCPUTest, CannotShareDataWhenShapeNotSet) {
   std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[10]);
-  Tensor tensor(CPU);
+  TensorCPU tensor;
   ASSERT_THROW(tensor.ShareExternalPointer(raw_buffer.get()), EnforceNotMet);
 }
 
@@ -391,8 +391,8 @@ TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
   dims[2] = 5;
   vector<int> alternate_dims(1);
   alternate_dims[0] = 2 * 3 * 5;
-  Tensor tensor(dims, CPU);
-  Tensor other_tensor(alternate_dims, CPU);
+  TensorCPU tensor(dims);
+  TensorCPU other_tensor(alternate_dims);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(other_tensor.ndim(), 1);
@@ -413,8 +413,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterResize) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
-  Tensor other_tensor(dims, CPU);
+  TensorCPU tensor(dims);
+  TensorCPU other_tensor(dims);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@@ -431,8 +431,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterFreeMemory) {
   dims[0] = 2;
   dims[1] = 3;
   dims[2] = 5;
-  Tensor tensor(dims, CPU);
-  Tensor other_tensor(dims, CPU);
+  TensorCPU tensor(dims);
+  TensorCPU other_tensor(dims);
   EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
   other_tensor.ShareData(tensor);
   EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@@ -449,7 +449,7 @@ TYPED_TEST(TensorCPUTest, KeepOnShrink) {
   FLAGS_caffe2_max_keep_on_shrink_memory = LLONG_MAX;
 
   vector<int> dims{2, 3, 5};
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   TypeParam* ptr = tensor.mutable_data<TypeParam>();
   EXPECT_TRUE(ptr != nullptr);
   // Expanding - will reallocate
@@ -480,7 +480,7 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
   FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 4 * sizeof(TypeParam);
 
   vector<int> dims{1, 8, 8};
-  Tensor tensor(dims, CPU);
+  TensorCPU tensor(dims);
   TypeParam* ptr = tensor.mutable_data<TypeParam>();
   EXPECT_TRUE(ptr != nullptr);
   // Shrinking - will not reallocate
@@ -501,19 +501,19 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
 }
 
 TYPED_TEST(TensorCPUDeathTest, CannotAccessRawDataWhenEmpty) {
-  Tensor tensor(CPU);
+  TensorCPU tensor;
   EXPECT_EQ(tensor.ndim(), 0);
   ASSERT_ANY_THROW(tensor.raw_data());
 }
 
 TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
-  Tensor tensor(CPU);
+  TensorCPU tensor;
   EXPECT_EQ(tensor.ndim(), 0);
   ASSERT_ANY_THROW(tensor.data<TypeParam>());
 }
 
 TEST(TensorTest, TensorNonFundamentalType) {
-  Tensor tensor(vector<int>{2, 3, 4}, CPU);
+  TensorCPU tensor(vector<int>{2, 3, 4});
   EXPECT_TRUE(tensor.mutable_data<std::string>() != nullptr);
   const std::string* ptr = tensor.data<std::string>();
   for (int i = 0; i < tensor.size(); ++i) {
@@ -522,14 +522,14 @@ TEST(TensorTest, TensorNonFundamentalType) {
 }
 
 TEST(TensorTest, TensorNonFundamentalTypeClone) {
-  Tensor tensor(vector<int>{2, 3, 4}, CPU);
+  TensorCPU tensor(vector<int>{2, 3, 4});
   std::string* ptr = tensor.mutable_data<std::string>();
   EXPECT_TRUE(ptr != nullptr);
   for (int i = 0; i < tensor.size(); ++i) {
     EXPECT_TRUE(ptr[i] == "");
     ptr[i] = "filled";
   }
-  Tensor dst_tensor = tensor.Clone();
+  TensorCPU dst_tensor = tensor.Clone();
   const std::string* dst_ptr = dst_tensor.data<std::string>();
   for (int i = 0; i < dst_tensor.size(); ++i) {
     EXPECT_TRUE(dst_ptr[i] == "filled");
@@ -549,7 +549,7 @@ TEST(TensorTest, Tensor64BitDimension) {
   // Initialize a large tensor.
   TIndex large_number =
       static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-  Tensor tensor(vector<TIndex>{large_number}, CPU);
+  TensorCPU tensor(vector<TIndex>{large_number});
   EXPECT_EQ(tensor.ndim(), 1);
   EXPECT_EQ(tensor.dim(0), large_number);
   EXPECT_EQ(tensor.size(), large_number);
@@ -581,7 +581,7 @@ TEST(TensorTest, Tensor64BitDimension) {
 TEST(TensorDeathTest, CannotCastDownLargeDims) {
   TIndex large_number =
       static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-  Tensor tensor(vector<TIndex>{large_number}, CPU);
+  TensorCPU tensor(vector<TIndex>{large_number});
   EXPECT_EQ(tensor.ndim(), 1);
   EXPECT_EQ(tensor.dim(0), large_number);
   ASSERT_THROW(tensor.dim32(0), EnforceNotMet);
@@ -590,7 +590,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
 #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name)               \
   TEST(TensorTest, TensorSerialization_##TypeParam) {                     \
     Blob blob;                                                            \
-    Tensor* tensor = blob.GetMutableTensor(CPU);                          \
+    TensorCPU* tensor = blob.GetMutable<TensorCPU>();                     \
     tensor->Resize(2, 3);                                                 \
     for (int i = 0; i < 6; ++i) {                                         \
       tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i);   \
@@ -611,7 +611,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
     }                                                                     \
     Blob new_blob;                                                        \
     EXPECT_NO_THROW(new_blob.Deserialize(serialized));                    \
-    EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));                            \
+    EXPECT_TRUE(new_blob.IsType<TensorCPU>());                            \
     const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
     EXPECT_EQ(new_tensor.ndim(), 2);                                      \
     EXPECT_EQ(new_tensor.dim(0), 2);                                      \
@@ -624,7 +624,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
                                                                           \
   TEST(EmptyTensorTest, TensorSerialization_##TypeParam) {                \
     Blob blob;                                                            \
-    TensorCPU* tensor = blob.GetMutableTensor(CPU);                       \
+    TensorCPU* tensor = blob.GetMutable<TensorCPU>();                     \
     tensor->Resize(0, 3);                                                 \
     tensor->mutable_data<TypeParam>();                                    \
     string serialized = blob.Serialize("test");                           \
@@ -640,7 +640,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
     EXPECT_EQ(tensor_proto.field_name##_size(), 0);                       \
     Blob new_blob;                                                        \
     EXPECT_NO_THROW(new_blob.Deserialize(serialized));                    \
-    EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));                            \
+    EXPECT_TRUE(new_blob.IsType<TensorCPU>());                            \
     const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
     EXPECT_EQ(new_tensor.ndim(), 2);                                      \
     EXPECT_EQ(new_tensor.dim(0), 0);                                      \
@@ -659,7 +659,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data)
 
 TEST(TensorTest, TensorSerialization_CustomType) {
   Blob blob;
-  TensorCPU* tensor = blob.GetMutableTensor(CPU);
+  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
   tensor->Resize(2, 3);
   for (int i = 0; i < 6; ++i) {
     tensor->mutable_data<BlobTestFoo>()[i].val = i;
@@ -671,7 +671,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
   EXPECT_EQ(proto.type(), "Tensor");
   Blob new_blob;
   EXPECT_NO_THROW(new_blob.Deserialize(serialized));
-  EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
+  EXPECT_TRUE(new_blob.IsType<TensorCPU>());
   const TensorCPU& new_tensor = blob.Get<TensorCPU>();
   EXPECT_EQ(new_tensor.ndim(), 2);
   EXPECT_EQ(new_tensor.dim(0), 2);
@@ -686,7 +686,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
 TEST(TensorTest, float16) {
   const TIndex kSize = 3000000;
   Blob blob;
-  TensorCPU* tensor = blob.GetMutableTensor(CPU);
+  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
   tensor->Resize(kSize);
   for (int i = 0; i < tensor->size(); ++i) {
     tensor->mutable_data<float16>()[i].x = i % 10000;
@@ -714,7 +714,7 @@ TEST(TensorTest, float16) {
   }
   Blob new_blob;
   EXPECT_NO_THROW(new_blob.Deserialize(serialized));
-  EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
+  EXPECT_TRUE(new_blob.IsType<TensorCPU>());
   const TensorCPU& new_tensor = blob.Get<TensorCPU>();
   EXPECT_EQ(new_tensor.ndim(), 1);
   EXPECT_EQ(new_tensor.dim(0), kSize);
@@ -850,7 +850,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
   {
     VLOG(1) << "Test begin";
     Blob blob;
-    Tensor* tensor = blob.GetMutableTensor(CPU);
+    TensorCPU* tensor = blob.GetMutable<TensorCPU>();
     VLOG(1) << "Allocating blob";
     tensor->Resize(d1, d2);
     auto mutableData = tensor->mutable_data<TypeParam>();
@@ -893,7 +893,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
     load_op->Run();
     VLOG(1) << "Reading blob from workspace";
     auto new_blob = ws.GetBlob("test");
-    EXPECT_TRUE(new_blob->IsType<Tensor>(CPU));
+    EXPECT_TRUE(new_blob->IsType<TensorCPU>());
     const auto& new_tensor = new_blob->Get<TensorCPU>();
 
     EXPECT_EQ(new_tensor.ndim(), d1);
@@ -1020,7 +1020,7 @@ TEST(CustomChunkSize, BigTensorSerialization) {
   int64_t size = d1 * d2;
 
   Blob blob;
-  TensorCPU* tensor = blob.GetMutableTensor(CPU);
+  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
   tensor->Resize(d1, d2);
   tensor->mutable_data<float>();
   std::mutex mutex;
@@ -1070,9 +1070,10 @@ TEST(BlobTest, CastingMessage) {
 }
 
 TEST(TensorConstruction, UnitializedCopyTest) {
-  Tensor x(CPU);
-  Tensor y(x, CPU);
-  Tensor z = x.Clone();
+  CPUContext context;
+  TensorCPU x;
+  TensorCPU y(x, &context);
+  TensorCPU z = x.Clone();
   // should be uninitialized
   EXPECT_EQ(x.size(), -1);
   EXPECT_EQ(y.size(), -1);
@@ -1081,11 +1082,14 @@ TEST(TensorConstruction, UnitializedCopyTest) {
 }
 
 TEST(TensorConstruction, CopyConstructorTest) {
-  Tensor x(CPU);
+  CPUContext context;
+
+  TensorCPU x;
   x.Resize(5);
   x.mutable_data<float>()[0] = 1;
-  Tensor y = x.Clone();
-  Tensor z(x, CPU);
+  TensorCPU y = x.Clone();
+  TensorCPU z(x, &context);
+  TensorCPU w;
 
   EXPECT_EQ(*x.data<float>(), 1);
   EXPECT_EQ(*y.data<float>(), 1);
@@ -1096,12 +1100,13 @@ TEST(TensorConstruction, CopyConstructorTest) {
   EXPECT_EQ(*z.data<float>(), 1);
 }
 
-TEST(TensorConstruction, MoveAssignmentOpTest) {
-  Tensor x(CPU);
+TEST(TensorConstruction, MoveConstructorTest) {
+  CPUContext context;
+
+  TensorCPU x;
   x.Resize(5);
   x.mutable_data<float>()[0] = 1;
-  Tensor y(CPU);
-  y = std::move(x);
+  TensorCPU y = std::move(x);
 
   EXPECT_EQ(*y.data<float>(), 1);
 }
diff --git a/caffe2/core/context.cc b/caffe2/core/context.cc
index 05af9a8cdb12c..427e8bb60aa31 100644
--- a/caffe2/core/context.cc
+++ b/caffe2/core/context.cc
@@ -7,12 +7,6 @@
 
 namespace caffe2 {
 
-// We put this here because context.h rather than context_base.h is included in
-// user code
-// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
-CAFFE2_API BaseStaticContext*
-    BaseContext::static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
-
 uint32_t RandomNumberSeed() {
   // Originally copied from folly::randomNumberSeed (at 418ad4)
   // modified to use chrono instead of sys/time.h
@@ -30,11 +24,4 @@ uint32_t RandomNumberSeed() {
       kPrime2 * tv_sec + kPrime3 * tv_usec;
 }
 
-BaseStaticContext* GetCPUStaticContext() {
-  static CPUStaticContext context;
-  return &context;
-}
-
-REGISTER_STATIC_CONTEXT(CPU, GetCPUStaticContext());
-
 } // namespace caffe2
diff --git a/caffe2/core/context.h b/caffe2/core/context.h
index 017bc51744800..df3b0f20772d6 100644
--- a/caffe2/core/context.h
+++ b/caffe2/core/context.h
@@ -7,7 +7,6 @@
 #include <unordered_map>
 
 #include "caffe2/core/allocator.h"
-#include "caffe2/core/context_base.h"
 #include "caffe2/core/event.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/typeid.h"
@@ -17,8 +16,6 @@ CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage);
 
 namespace caffe2 {
 
-BaseStaticContext* GetCPUStaticContext();
-
 /**
  * A function to generate a random number seed that is unique in a best-effort
  * basis, using an ever-incrementing seed and the current time.
@@ -29,15 +26,44 @@ uint32_t RandomNumberSeed();
  * The CPU Context, representing the bare minimum of what a Context class in
  * Caffe2 should implement.
  *
- * // TODO modify docs
  * See operator.h, especially Operator<Context>, for how Context are used in
  * actual operator implementations that are associated with specific devices.
  * In general, the Context class is passed in as a template argument, and
  * the operator can use the functions defined in the context to execute whatever
  * computation it has.
  *
+ * A Context defines all the necessities to run an operator on a specific
+ * device. Specific Context classes have the freedom to choose what functions it
+ * implements, but there are a few functions that you should consider
+ * implementing if you want to write your own context class:
+ * - void SwitchToDevice(): any necessary code to switch to the device before
+ *     running anything.
+ * - void WaitEvent(const Event& ev): make the current context to wait on
+ *     an event. For example, for cuda, this is the equivalent of
+ *     cudaStreamWaitEvent. For CPU context, it essentially synchronizes the
+ *     event.
+ * - void Record(Event* ev): record the async activities on the current context
+ *     to the event. For example, for cuda, this is the equivalent of
+ *     cudaEventRecord on the current stream. For CPU context, it is always
+ *     synchronous.
+ * - void FinishDeviceComputation(): any wrapping-up work after all the
+ *     computation of the operator is done. If there are errors during the
+ *     execution, throw exception. For example, in a CUDAContext, this function
+ *     carries out a stream synchronization and spots potential errors for
+ *     the cuda kernel calls.
+ * - static std::pair<void*, MemoryDeleter> New(size_t nbytes): allocates
+       memory and returns a deleter.
+ * - template <class SrcContext, class DstContext> void CopyBytes(...): does
+ *     cross context memory copy.
+ * - template <typename T, class SrcContext, class DstContext> void Copy(...):
+ *     usually a simple wrapper around the above CopyBytes function.
+ *
+ * We intentionally did not create a base class for the various possible Context
+ * classes there might be, since they are intended to be specified during
+ * compile time using templates rather than via polymorphism. You should also
+ * not have classes derived from existing context classes.
  */
-class CPUContext final : public BaseContext {
+class CPUContext final {
  public:
   typedef std::mt19937 rand_gen_type;
   CPUContext() : random_seed_(RandomNumberSeed()) {}
@@ -48,30 +74,23 @@ class CPUContext final : public BaseContext {
     CAFFE_ENFORCE_EQ(option.device_type(), CPU);
   }
 
-  ~CPUContext() noexcept override {}
-
-  BaseStaticContext* GetStaticContext() const override {
-    return GetCPUStaticContext();
-  }
+  ~CPUContext() noexcept {}
 
-  static BaseStaticContext* StaticContext() {
-    return GetCPUStaticContext();
+  inline void SwitchToDevice(int /*stream_id*/) {}
+  inline void SwitchToDevice() {
+    SwitchToDevice(0);
   }
 
-  inline void SwitchToDevice(int /*stream_id*/) override {}
-
-  using BaseContext::SwitchToDevice;
-
-  inline void WaitEvent(const Event& ev) override {
+  inline void WaitEvent(const Event& ev) {
     ev.Wait(CPU, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(CPU, this, err_msg);
   }
 
-  inline void FinishDeviceComputation() override {}
+  inline void FinishDeviceComputation() {}
 
   inline rand_gen_type& RandGenerator() {
     if (!random_generator_.get()) {
@@ -80,35 +99,16 @@ class CPUContext final : public BaseContext {
     return *random_generator_.get();
   }
 
-  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return StaticContext()->New(nbytes);
-  }
-
-  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
-      override {
-    if (nbytes == 0) {
-      return;
+  static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
+    if (FLAGS_caffe2_report_cpu_memory_usage) {
+      reporter_.New(data_and_deleter.first, nbytes);
+      data_and_deleter.second = ReportAndDelete;
     }
-    CAFFE_ENFORCE(src);
-    CAFFE_ENFORCE(dst);
-    memcpy(dst, src, nbytes);
-  }
-
-  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  bool SupportsNonFundamentalTypes() const override {
-    // CPU non fumdamental type copy OK
-    return true;
+    return data_and_deleter;
   }
 
+  // Two copy functions that deals with cross-device copies.
   template <class SrcContext, class DstContext>
   inline void CopyBytes(size_t nbytes, const void* src, void* dst);
 
@@ -147,31 +147,26 @@ class CPUContext final : public BaseContext {
 
   // CPU streams are not implemented and are silently ignored by CPU ops,
   // return true to signal executor to schedule a CPU op
-  static bool IsStreamFree(
-      const DeviceOption& /* option */,
-      int /* stream_id */) {
+  static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
     return true;
   }
 
-  DeviceType GetDevicetype() const override {
-    return CPU;
-  }
-
-  static constexpr DeviceType GetDeviceType() {
-    return CPU;
-  }
-
  protected:
   // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
   int random_seed_{1701};
   std::unique_ptr<rand_gen_type> random_generator_;
+  CAFFE2_API static MemoryAllocationReporter reporter_;
+
+ private:
+  static void ReportAndDelete(void* ptr) {
+    reporter_.Delete(ptr);
+    GetCPUAllocator()->GetDeleter()(ptr);
+  }
 };
 
-template <>
+template<>
 inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
+    size_t nbytes, const void* src, void* dst) {
   if (nbytes == 0) {
     return;
   }
@@ -180,41 +175,6 @@ inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
   memcpy(dst, src, nbytes);
 }
 
-// TODO(jerryzh): merge CPUStaticContext with Allocator
-class CPUStaticContext : public BaseStaticContext {
- public:
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
-    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
-    if (FLAGS_caffe2_report_cpu_memory_usage) {
-      reporter_.New(data_and_deleter.first, nbytes);
-      data_and_deleter.second = ReportAndDelete;
-    }
-    return data_and_deleter;
-  }
-
-  std::unique_ptr<BaseContext> CreateContext() override {
-    return caffe2::make_unique<CPUContext>();
-  }
-
-  std::unique_ptr<BaseContext> CreateContext(
-      const DeviceOption& option) override {
-    return caffe2::make_unique<CPUContext>(option);
-  }
-
-  DeviceType GetDeviceType() override {
-    return CPU;
-  }
-
- protected:
-  CAFFE2_API static MemoryAllocationReporter reporter_;
-
- private:
-  static void ReportAndDelete(void* ptr) {
-    reporter_.Delete(ptr);
-    GetCPUAllocator()->GetDeleter()(ptr);
-  }
-};
-
 }  // namespace caffe2
 
 #endif  // CAFFE2_CORE_CONTEXT_H_
diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc
deleted file mode 100644
index 08ff7755121cd..0000000000000
--- a/caffe2/core/context_base.cc
+++ /dev/null
@@ -1,5 +0,0 @@
-#include "context_base.h"
-
-namespace caffe2 {
-
-}
diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h
deleted file mode 100644
index c3b3b4958910a..0000000000000
--- a/caffe2/core/context_base.h
+++ /dev/null
@@ -1,187 +0,0 @@
-#pragma once
-
-#include <cstdlib>
-#include <ctime>
-#include <memory>
-#include <unordered_map>
-
-#include "caffe2/core/allocator.h"
-#include "caffe2/core/event.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/typeid.h"
-#include "caffe2/proto/caffe2.pb.h"
-
-namespace caffe2 {
-
-class BaseContext;
-
-/* BaseStaticContext defines the interface for static context, which contains
-   functions that are invoked statically before in Tensor class, e.g. New,
-   We will merge this with Allocator later.
- */
-class BaseStaticContext {
- public:
-  virtual ~BaseStaticContext() noexcept {}
-
-  virtual std::pair<void*, MemoryDeleter> New(size_t nbytes) const = 0;
-
-  virtual std::unique_ptr<BaseContext> CreateContext() = 0;
-
-  virtual std::unique_ptr<BaseContext> CreateContext(const DeviceOption&) = 0;
-
-  virtual DeviceType GetDeviceType() = 0;
-
-  /*
-   * @brief: Sets the DeviceOption for argument `device` based on the
-   * current context and the a data pointer
-   */
-  virtual void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) {
-    device->set_device_type(GetDeviceType());
-  }
-};
-
-/**
- * Virtual interface for the Context class in Caffe2.
- *
- * A Context defines all the necessities to run an operator on a specific
- * device. Specific Context classes needs to implement all the pure virtual
- * functions in the BaseContext class.
- * TODO: add docs after this is finalized.
- */
-class BaseContext {
- public:
-  virtual ~BaseContext() noexcept {}
-
-  virtual BaseStaticContext* GetStaticContext() const = 0;
-
-  /* Sorry for the naming, will get rid of this in future diff */
-  virtual DeviceType GetDevicetype() const = 0;
-
-  virtual void SwitchToDevice(int /*stream_id*/) = 0;
-
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }
-
-  virtual void WaitEvent(const Event& ev) = 0;
-
-  virtual void Record(Event* ev, const char* err_msg = nullptr) const = 0;
-
-  virtual void FinishDeviceComputation() = 0;
-
-  // This used to be arbitrary cross-device copy, but it turns out everyone
-  // did direct CPU-X copy, so we just make three functions for it (to avoid
-  // double dispatch).  This will get obsoleted by C10. where copies
-  // will be proper operators (and get to rely on multiple dispatch there.)
-  virtual void
-  CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) = 0;
-
-  virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0;
-
-  virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0;
-
-  virtual void CopyBytesToDevice(
-      size_t nbytes,
-      const void* src,
-      void* dst,
-      DeviceType type) {
-    if (type == CPU) {
-      CopyBytesToCPU(nbytes, src, dst);
-    } else if (type == GetDevicetype()) {
-      CopyBytesSameDevice(nbytes, src, dst);
-    } else {
-      CAFFE_THROW("CopyBytesToDevice can only copy to CPU or between same "
-                  "device. Can't copy from: ", GetDevicetype(), " to", type);
-    }
-  }
-
-  template <typename T>
-  inline void CopySameDevice(size_t n, const T* src, T* dst) {
-    static_assert(
-        std::is_fundamental<T>::value,
-        "CopySameDevice requires fundamental types");
-    CopyBytesSameDevice(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  template <typename T>
-  inline void CopyFromCPU(size_t n, const T* src, T* dst) {
-    static_assert(
-        std::is_fundamental<T>::value,
-        "CopyFromCPU requires fundamental types");
-    CopyBytesFromCPU(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  template <typename T>
-  inline void CopyToCPU(size_t n, const T* src, T* dst) {
-    static_assert(
-        std::is_fundamental<T>::value, "CopyToCPU requires fundamental types");
-    CopyBytesToCPU(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  virtual bool SupportsNonFundamentalTypes() const {
-    return false;
-  }
-
-  inline void EnforceMetaCopyOK() {
-    CAFFE_ENFORCE(
-        SupportsNonFundamentalTypes(), "Context requires fundamental types");
-  }
-
-  inline void CopyItemsSameDevice(
-      const TypeMeta& meta,
-      size_t n,
-      const void* src,
-      void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesSameDevice(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  inline void
-  CopyItemsFromCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesFromCPU(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  inline void
-  CopyItemsToCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesToCPU(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  CAFFE2_API static BaseStaticContext*
-      static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
-
-  template <int d>
-  friend struct StaticContextFunctionRegisterer;
-};
-
-template <int d>
-struct StaticContextFunctionRegisterer {
-  explicit StaticContextFunctionRegisterer(BaseStaticContext* ptr) {
-    static_assert(d < COMPILE_TIME_MAX_DEVICE_TYPES, "");
-    BaseContext::static_context_[d] = ptr;
-  }
-};
-
-#define REGISTER_STATIC_CONTEXT(d, f)                                \
-  namespace {                                                        \
-  static StaticContextFunctionRegisterer<d> g_static_context_##d(f); \
-  }
-
-#define GET_STATIC_CONTEXT(d) BaseContext::static_context_[d]
-} // namespace caffe2
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index ad0d80774450a..b8f14d80b54e8 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -59,6 +59,7 @@ CAFFE2_DEFINE_int(
 
 namespace caffe2 {
 
+
 thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
 
 // TODO(jiayq): these variables shouldn't be currently accessed during static
@@ -99,6 +100,19 @@ CudaMemoryPoolType GetCudaMemoryPoolType() {
   return g_cuda_memory_pool_type;
 }
 
+vector<TIndex> GetCUDATensorInfo(
+    const void* c,
+    bool* shares_data,
+    size_t* capacity,
+    DeviceOption* device) {
+  vector<TIndex> dims =
+      GetTensorInfo<CUDAContext>(c, shares_data, capacity, device);
+  const Tensor<CUDAContext>* tc = static_cast<const Tensor<CUDAContext>*>(c);
+  device->set_device_type(CUDA);
+  device->set_cuda_gpu_id(GetGPUIDForPointer(tc->raw_data()));
+  return dims;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // A wrapper to allow us to lazily initialize all cuda environments that Caffe
 // uses. This gets done the first time a caffe2::CUDAContext::New() gets called
@@ -149,6 +163,14 @@ static void Caffe2InitializeCuda() {
     }
   }
 
+  RegisterTypeCallFunction(
+    TypeMeta::Id<Tensor<CUDAContext>>(),
+    GetTensorType<CUDAContext>
+  );
+
+  RegisterTensorInfoFunction(
+      TypeMeta::Id<Tensor<CUDAContext>>(), GetCUDATensorInfo);
+
 #ifdef CAFFE2_USE_CUDNN
   // Check the versions of cuDNN that were compiled and linked with are compatible
   CheckCuDNNVersions();
@@ -230,6 +252,21 @@ struct Caffe2CudaInitializerHelper {
     }
   }
 };
+
+struct TensorCUDAStatGetter : BlobStatGetter {
+  size_t sizeBytes(const Blob& blob) const override {
+    const auto& tensor = blob.Get<TensorCUDA>();
+    auto nbytes = tensor.nbytes();
+    if (nbytes > 0 && tensor.IsType<std::string>()) {
+      const auto* data = tensor.data<std::string>();
+      for (int i = 0; i < tensor.size(); ++i) {
+        nbytes += data[i].size();
+      }
+    }
+    return nbytes;
+  }
+};
+REGISTER_BLOB_STAT_GETTER(TensorCUDA, TensorCUDAStatGetter);
 } // namespace
 
 /**
@@ -306,7 +343,7 @@ void TrackMemoryAlloc(size_t nbytes) {
 }
 }
 
-std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
+std::pair<void*, MemoryDeleter> CUDAContext::New(size_t nbytes) {
   // Lock the mutex
   std::lock_guard<std::mutex> lock(CUDAContext::mutex());
   // A one-time caffe2 cuda initializer.
@@ -344,7 +381,7 @@ std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
   return {nullptr, Delete};
 }
 
-void CUDAStaticContext::Delete(void* ptr) {
+void CUDAContext::Delete(void* ptr) {
   // lock the mutex
   std::lock_guard<std::mutex> lock(CUDAContext::mutex());
 
@@ -396,11 +433,4 @@ void CUDAStaticContext::Delete(void* ptr) {
   }
 }
 
-BaseStaticContext* GetCUDAStaticContext() {
-  static CUDAStaticContext context;
-  return &context;
-}
-
-REGISTER_STATIC_CONTEXT(CUDA, GetCUDAStaticContext());
-
 }  // namespace caffe2
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
index a76fcd6a16505..1668d4f2ab7e4 100644
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -7,7 +7,6 @@
 #include "caffe2/core/common.h"
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context.h"
-#include "caffe2/core/context_base.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/numa.h"
 #include "caffe2/core/tensor.h"
@@ -135,46 +134,37 @@ class ThreadLocalCUDAObjects {
 #endif // CAFFE2_USE_CUDNN
 };
 
-BaseStaticContext* GetCUDAStaticContext();
-
-class CUDAContext final : public BaseContext {
+class CUDAContext final {
  public:
   // The default cuda context constructor.
   explicit CUDAContext(const int gpu_id = -1);
   explicit CUDAContext(const DeviceOption& option);
 
-  ~CUDAContext() override {
+  ~CUDAContext() {
     if (curand_generator_) {
       CURAND_CHECK(curandDestroyGenerator(curand_generator_));
     }
     FinishDeviceComputation();
   }
 
-  BaseStaticContext* GetStaticContext() const override {
-    return GetCUDAStaticContext();
-  }
-
-  static BaseStaticContext* StaticContext() {
-    return GetCUDAStaticContext();
-  }
-
-  inline void SwitchToDevice(int stream_id) override {
+  inline void SwitchToDevice(int stream_id) {
     set_stream_id(stream_id);
     CaffeCudaSetDevice(gpu_id_);
   }
+  inline void SwitchToDevice() {
+    SwitchToDevice(0);
+  }
 
-  using BaseContext::SwitchToDevice;
-
-  inline void WaitEvent(const Event& ev) override {
+  inline void WaitEvent(const Event& ev) {
     ev.Wait(CUDA, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(CUDA, this, err_msg);
   }
 
-  void FinishDeviceComputation() override {
+  void FinishDeviceComputation() {
     cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
     cudaError_t error = cudaGetLastError();
     if (error != cudaSuccess) {
@@ -221,9 +211,7 @@ class CUDAContext final : public BaseContext {
     return curand_generator_;
   }
 
-  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return StaticContext()->New(nbytes);
-  }
+  static std::pair<void*, MemoryDeleter> New(size_t nbytes);
 
   // Get a mutex to lock out cudaMalloc / cudaFree calls when
   // NCCL kernels are being launched. Should remove threat of
@@ -245,21 +233,6 @@ class CUDAContext final : public BaseContext {
         cuda_objects_.GetStream(gpu_id_, stream_id_)));
   }
 
-  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
-  }
-
-  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
-  }
-
-  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
-  }
-
   template <typename T, class SrcContext, class DstContext>
   inline void Copy(int n, const T* src, T* dst) {
     CopyBytes<SrcContext, DstContext>(n * sizeof(T),
@@ -288,15 +261,8 @@ class CUDAContext final : public BaseContext {
     return cudaStreamQuery(stream) == cudaSuccess;
   }
 
-  DeviceType GetDevicetype() const override {
-    return CUDA;
-  }
-
-  static constexpr DeviceType GetDeviceType() {
-    return CUDA;
-  }
-
  protected:
+  static void Delete(void* data);
   void set_stream_id(int stream_id) {
     stream_id_ = stream_id;
   }
@@ -384,37 +350,8 @@ struct PinnedCPUAllocator final : CPUAllocator {
   DefaultCPUAllocator baseAllocator_;
 };
 
-class CUDAStaticContext final : public BaseStaticContext {
- public:
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
-
-  std::unique_ptr<BaseContext> CreateContext() override {
-    return caffe2::make_unique<CUDAContext>();
-  }
-
-  std::unique_ptr<BaseContext> CreateContext(
-      const DeviceOption& option) override {
-    return caffe2::make_unique<CUDAContext>(option);
-  }
-
-  std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
-    return caffe2::make_unique<CUDAContext>(gpu_id);
-  }
-
-  DeviceType GetDeviceType() override {
-    return CUDA;
-  }
-
-  void ExtractDeviceOption(DeviceOption* device, const void* data) override {
-    device->set_device_type(GetDeviceType());
-    device->set_cuda_gpu_id(GetGPUIDForPointer(data));
-  }
-
- protected:
-  static void Delete(void* data);
-};
-
-using TensorCUDA = Tensor;
+// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
+typedef Tensor<CUDAContext> TensorCUDA;
 
 }  // namespace caffe2
 
diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc
index a6e44846e9e0b..e2338d7f70481 100644
--- a/caffe2/core/context_test.cc
+++ b/caffe2/core/context_test.cc
@@ -26,7 +26,7 @@ TEST(CPUContextTest, TestAllocDealloc) {
   }
   DeviceOption option;
   CPUContext context(option);
-  context.CopyToCPU<float>(10, data, dst_data);
+  context.Copy<float, CPUContext, CPUContext>(10, data, dst_data);
   for (int i = 0; i < 10; ++i) {
     EXPECT_FLOAT_EQ(dst_data[i], i);
   }
diff --git a/caffe2/core/dispatch/CMakeLists.txt b/caffe2/core/dispatch/CMakeLists.txt
index c028bfa2b9307..841bfca164684 100644
--- a/caffe2/core/dispatch/CMakeLists.txt
+++ b/caffe2/core/dispatch/CMakeLists.txt
@@ -18,7 +18,6 @@ set(TEST_SOURCES
 
 add_library(dispatch OBJECT ${LIB_SOURCES})
 target_enable_style_warnings(dispatch)
-add_dependencies(dispatch Caffe2_PROTO)
 
 if(BUILD_TEST)
     add_executable(dispatch_test ${TEST_SOURCES} $<TARGET_OBJECTS:dispatch>)
diff --git a/caffe2/core/dispatch/OpSchema.h b/caffe2/core/dispatch/OpSchema.h
index 6a7da5a8ea310..bdfd14ed42396 100644
--- a/caffe2/core/dispatch/OpSchema.h
+++ b/caffe2/core/dispatch/OpSchema.h
@@ -1,12 +1,13 @@
 #pragma once
 
 #include "caffe2/core/dispatch/DispatchKey.h"
-#include "caffe2/proto/caffe2.pb.h"
 #include "caffe2/utils/Metaprogramming.h"
 #include "caffe2/utils/Array.h"
 
 namespace caffe2 {
-class Tensor;
+template<class Context> class Tensor;
+class CPUContext;
+class CUDAContext;
 }  // namespace caffe2
 
 namespace c10 {
@@ -17,29 +18,26 @@ namespace details {
  * If Arg is a Tensor or reference to a Tensor, provide the member constant value equal to true.  Otherwise
  * return false.
  */
-template<class Arg>
-using is_tensor_arg = std::
-  is_same<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
-
-inline DeviceTypeId to_device_type_id(caffe2::DeviceType device_type) {
-  switch (device_type) {
-    case caffe2::CPU:
-      return DeviceTypeId::CPU;
-    case caffe2::CUDA:
-      return DeviceTypeId::CUDA;
-    default:
-      return DeviceTypeId::UNDEFINED;
-  }
-}
+template<class Arg> using is_tensor_arg = guts::is_instantiation_of<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
 
 // TODO get rid of tensor_to_dispatch_key once c2::Tensor is de-templatized. This then fits into a template lambda instead of a functor.
+template<class TensorType, class Enable = void> struct tensor_to_dispatch_key_ final {};
+template<class TensorType>
+struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CPUContext>>::value>> final {
+    static TensorParameterDispatchKey call(const TensorType& tensor) {
+      return TensorParameterDispatchKey{DeviceTypeId::CPU, LayoutId(0), tensor.meta().id()};
+    }
+};
+template<class TensorType>
+struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CUDAContext>>::value>> final {
+    static TensorParameterDispatchKey call(const TensorType& tensor) {
+      return TensorParameterDispatchKey{DeviceTypeId::CUDA, LayoutId(0), tensor.meta().id()};
+    }
+};
 struct tensor_to_dispatch_key final {
     template<class TensorType>
     TensorParameterDispatchKey operator()(const TensorType& tensor) const {
-      return TensorParameterDispatchKey{
-          to_device_type_id(tensor.GetDeviceType()),
-          LayoutId(0),
-          tensor.meta().id()};
+      return tensor_to_dispatch_key_<TensorType, void>::call(tensor);
     }
 };
 
diff --git a/caffe2/core/dispatch/OpSchema_test.cpp b/caffe2/core/dispatch/OpSchema_test.cpp
index 3c079886c989e..77936a0347a04 100644
--- a/caffe2/core/dispatch/OpSchema_test.cpp
+++ b/caffe2/core/dispatch/OpSchema_test.cpp
@@ -4,13 +4,16 @@
 using namespace c10;
 using namespace caffe2;
 
-static_assert(details::is_tensor_arg<Tensor>::value, "");
-static_assert(details::is_tensor_arg<const Tensor &>::value, "");
-static_assert(details::is_tensor_arg<Tensor &&>::value, "");
+static_assert(details::is_tensor_arg<Tensor<CPUContext>>::value, "");
+static_assert(details::is_tensor_arg<const Tensor<CPUContext> &>::value, "");
+static_assert(details::is_tensor_arg<Tensor<CPUContext> &&>::value, "");
+static_assert(details::is_tensor_arg<Tensor<CUDAContext>>::value, "");
+static_assert(details::is_tensor_arg<const Tensor<CUDAContext> &>::value, "");
+static_assert(details::is_tensor_arg<Tensor<CUDAContext> &&>::value, "");
 static_assert(!details::is_tensor_arg<int>::value, "");
 
 struct SchemaDef final {
-  using Signature = bool (int, Tensor, float, Tensor, Tensor, unsigned int);
+  using Signature = bool (int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int);
   static constexpr guts::array<const char*, 6> parameter_names = {{
       "1", "2", "3", "4", "5", "6"
   }};
@@ -18,4 +21,4 @@ struct SchemaDef final {
 static_assert(6 == OpSchema<SchemaDef>::signature::num_args, "test num_dispatch_args");
 static_assert(3 == OpSchema<SchemaDef>::signature::num_tensor_args, "test num_dispatch_args");
 static_assert(std::is_same<bool, typename OpSchema<SchemaDef>::signature::return_type>::value, "test num_dispatch_args");
-static_assert(std::is_same<guts::typelist::typelist<int, Tensor, float, Tensor, Tensor, unsigned int>, typename OpSchema<SchemaDef>::signature::parameter_types>::value, "test num_dispatch_args");
+static_assert(std::is_same<guts::typelist::typelist<int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int>, typename OpSchema<SchemaDef>::signature::parameter_types>::value, "test num_dispatch_args");
diff --git a/caffe2/core/hip/blob_serialization_hip.cc b/caffe2/core/hip/blob_serialization_hip.cc
index 144bc3ce5257f..d472456b98ccf 100644
--- a/caffe2/core/hip/blob_serialization_hip.cc
+++ b/caffe2/core/hip/blob_serialization_hip.cc
@@ -4,7 +4,17 @@
 
 namespace caffe2 {
 
+template <>
+void TensorSerializer<HIPContext>::StoreDeviceDetail(const Tensor<HIPContext>& input,
+                                                     TensorProto* proto)
+{
+    auto* device_detail = proto->mutable_device_detail();
+    device_detail->set_device_type(HIP);
+    device_detail->set_hip_gpu_id(GetGPUIDForPointer(input.raw_data()));
+}
+
 namespace {
-REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer);
+REGISTER_BLOB_SERIALIZER((TypeMeta::Id<TensorHIP>()), TensorSerializer<HIPContext>);
+REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer<HIPContext>);
 }
 } // namespace caffe2
diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc
index 889553650a149..86a5fe6a376c4 100644
--- a/caffe2/core/hip/context_hip.cc
+++ b/caffe2/core/hip/context_hip.cc
@@ -50,6 +50,8 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb,
 
 namespace caffe2 {
 
+CAFFE_KNOWN_TYPE(Tensor<HIPContext>);
+
 thread_local ThreadLocalHIPObjects HIPContext::hip_objects_;
 
 // TODO(jiayq): these variables shouldn't be currently accessed during static
@@ -86,6 +88,16 @@ static long g_last_rep  = 0;
 
 HipMemoryPoolType GetHipMemoryPoolType() { return g_hip_memory_pool_type; }
 
+vector<TIndex>
+GetHipTensorInfo(const void* c, bool* shares_data, size_t* capacity, DeviceOption* device)
+{
+    vector<TIndex> dims          = GetTensorInfo<HIPContext>(c, shares_data, capacity, device);
+    const Tensor<HIPContext>* tc = static_cast<const Tensor<HIPContext>*>(c);
+    device->set_device_type(HIP);
+    device->set_hip_gpu_id(GetGPUIDForPointer(tc->raw_data()));
+    return dims;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // A wrapper to allow us to lazily initialize all HIP environments that Caffe
 // uses. This gets done the first time a caffe2::HIPContext::New() gets called
@@ -139,6 +151,10 @@ static void Caffe2InitializeHip()
         }
     }
 
+    RegisterTypeCallFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetTensorType<HIPContext>);
+
+    RegisterTensorInfoFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetHipTensorInfo);
+
     // CheckMiOpenVersions();
 }
 
@@ -311,17 +327,20 @@ void TrackMemoryAlloc(size_t nbytes)
 }
 }
 
-std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
-  // Lock the mutex
-  std::lock_guard<std::mutex> lock(HIPContext::mutex());
-  // A one-time caffe2 cuda initializer.
-  static Caffe2HipInitializerHelper g_hip_initializer_;
-  void* ptr = nullptr;
+std::pair<void*, MemoryDeleter> HIPContext::New(size_t nbytes)
+{
+    // Lock the mutex
+    std::lock_guard<std::mutex> lock(HIPContext::mutex());
+    // A one-time caffe2 cuda initializer.
+    static Caffe2HipInitializerHelper g_hip_initializer_;
+    void* ptr = nullptr;
 
-  if (FLAGS_caffe2_gpu_memory_tracking) {
-    TrackMemoryAlloc(nbytes);
-  }
-  switch (g_hip_memory_pool_type) {
+    if(FLAGS_caffe2_gpu_memory_tracking)
+    {
+        TrackMemoryAlloc(nbytes);
+    }
+    switch(g_hip_memory_pool_type)
+    {
     case HipMemoryPoolType::NONE:
         HIP_ENFORCE(hipMalloc(&ptr, nbytes));
         if(FLAGS_caffe2_gpu_memory_tracking)
@@ -343,21 +362,24 @@ std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
     return {nullptr, Delete};
 }
 
-void HIPStaticContext::Delete(void* ptr) {
-  // lock the mutex
-  std::lock_guard<std::mutex> lock(HIPContext::mutex());
-
-  if (FLAGS_caffe2_gpu_memory_tracking) {
-    auto sz_it = g_size_map.find(ptr);
-    DCHECK(sz_it != g_size_map.end());
-    auto aff_it = g_hip_device_affiliation.find(ptr);
-    DCHECK(aff_it != g_hip_device_affiliation.end());
-    g_total_mem -= sz_it->second;
-    g_total_by_gpu_map[aff_it->second] -= sz_it->second;
-    g_size_map.erase(sz_it);
-  }
-
-  switch (g_hip_memory_pool_type) {
+void HIPContext::Delete(void* ptr)
+{
+    // lock the mutex
+    std::lock_guard<std::mutex> lock(HIPContext::mutex());
+
+    if(FLAGS_caffe2_gpu_memory_tracking)
+    {
+        auto sz_it = g_size_map.find(ptr);
+        DCHECK(sz_it != g_size_map.end());
+        auto aff_it = g_hip_device_affiliation.find(ptr);
+        DCHECK(aff_it != g_hip_device_affiliation.end());
+        g_total_mem -= sz_it->second;
+        g_total_by_gpu_map[aff_it->second] -= sz_it->second;
+        g_size_map.erase(sz_it);
+    }
+
+    switch(g_hip_memory_pool_type)
+    {
     case HipMemoryPoolType::NONE:
     {
         // If memory pool is not set up, use simple hipFree.
@@ -393,11 +415,4 @@ void HIPStaticContext::Delete(void* ptr) {
     }
 }
 
-BaseStaticContext* GetHIPStaticContext() {
-  static HIPStaticContext context;
-  return &context;
-}
-
-REGISTER_STATIC_CONTEXT(HIP, GetHIPStaticContext());
-
 } // namespace caffe2
diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h
index 36644f3715188..577ccd6792824 100644
--- a/caffe2/core/hip/context_hip.h
+++ b/caffe2/core/hip/context_hip.h
@@ -119,46 +119,37 @@ class ThreadLocalHIPObjects {
   vector<miopenHandle_t> miopen_handles_[CAFFE2_COMPILE_TIME_MAX_HIP_GPUS];
 };
 
-BaseStaticContext* GetHIPStaticContext();
-
-class HIPContext final : public BaseContext {
+class HIPContext final {
  public:
   // The default HIP context constructor.
   explicit HIPContext(const int gpu_id = -1);
   explicit HIPContext(const DeviceOption& option);
 
-  ~HIPContext() override {
+  ~HIPContext() {
     if (hiprand_generator_) {
       HIPRAND_CHECK(hiprandDestroyGenerator(hiprand_generator_));
     }
     FinishDeviceComputation();
   }
 
-  BaseStaticContext* GetStaticContext() const override {
-    return GetHIPStaticContext();
-  }
-
-  static BaseStaticContext* StaticContext() {
-    return GetHIPStaticContext();
-  }
-
-  inline void SwitchToDevice(int stream_id) override {
+  inline void SwitchToDevice(int stream_id) {
     set_stream_id(stream_id);
     CaffeHipSetDevice(gpu_id_);
   }
+  inline void SwitchToDevice() {
+    SwitchToDevice(0);
+  }
 
-  using BaseContext::SwitchToDevice;
-
-  inline void WaitEvent(const Event& ev) override {
+  inline void WaitEvent(const Event& ev) {
     ev.Wait(HIP, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(HIP, this, err_msg);
   }
 
-  void FinishDeviceComputation() override {
+  void FinishDeviceComputation() {
     hipStreamSynchronize(hip_objects_.GetStream(gpu_id_, stream_id_));
     hipError_t error = hipGetLastError();
     if (error != hipSuccess) {
@@ -203,9 +194,7 @@ class HIPContext final : public BaseContext {
     return hiprand_generator_;
   }
 
-  static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return StaticContext()->New(nbytes);
-  }
+  static std::pair<void*, MemoryDeleter> New(size_t nbytes);
 
   // Get a mutex to lock out hipMalloc / hipFree calls when
   // NCCL kernels are being launched. Should remove threat of
@@ -229,21 +218,6 @@ class HIPContext final : public BaseContext {
         hip_objects_.GetStream(gpu_id_, stream_id_)));
   }
 
-  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytes<HIPContext, HIPContext>(nbytes, src, dst);
-  }
-
-  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytes<HIPContext, CPUContext>(nbytes, src, dst);
-  }
-
-  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytes<CPUContext, HIPContext>(nbytes, src, dst);
-  }
-
   template <typename T, class SrcContext, class DstContext>
   inline void Copy(int n, const T* src, T* dst) {
     CopyBytes<SrcContext, DstContext>(
@@ -271,14 +245,6 @@ class HIPContext final : public BaseContext {
     return hipStreamQuery(stream) == hipSuccess;
   }
 
-  DeviceType GetDevicetype() const override {
-    return HIP;
-  }
-
-  static constexpr DeviceType GetDeviceType() {
-    return HIP;
-  }
-
  protected:
   static void Delete(void* data);
   void set_stream_id(int stream_id) {
@@ -372,38 +338,8 @@ struct PinnedCPUAllocator final : CPUAllocator {
   DefaultCPUAllocator baseAllocator_;
 };
 
-class HIPStaticContext final : public BaseStaticContext {
- public:
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
-
-  std::unique_ptr<BaseContext> CreateContext() override {
-    return caffe2::make_unique<HIPContext>();
-  }
-
-  std::unique_ptr<BaseContext> CreateContext(
-      const DeviceOption& option) override {
-    return caffe2::make_unique<HIPContext>(option);
-  }
-
-  std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
-    return caffe2::make_unique<HIPContext>(gpu_id);
-  }
-
-  DeviceType GetDeviceType() override {
-    return HIP;
-  }
-
-
-  void ExtractDeviceOption(DeviceOption* device, const void* data) override {
-    device->set_device_type(GetDeviceType());
-    device->set_hip_gpu_id(GetGPUIDForPointer(data));
-  }
-
- protected:
-  static void Delete(void* data);
-};
-
-typedef Tensor TensorHIP;
+// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
+typedef Tensor<HIPContext> TensorHIP;
 
 } // namespace caffe2
 
diff --git a/caffe2/core/int8_serialization.cc b/caffe2/core/int8_serialization.cc
index 190cf5797f01f..4003c1f1384e8 100644
--- a/caffe2/core/int8_serialization.cc
+++ b/caffe2/core/int8_serialization.cc
@@ -56,7 +56,7 @@ class Int8TensorCPUSerializer : public BlobSerializerBase {
   CPUContext context_;
 };
 
-class Int8TensorCPUDeserializer : public TensorDeserializer {
+class Int8TensorCPUDeserializer : public TensorDeserializer<CPUContext> {
  public:
   void Deserialize(const BlobProto& blob_proto, Blob* blob) override {
     const QTensorProto& proto = blob_proto.qtensor();
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index 45f44049e49c6..325ccd3761afb 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -79,45 +79,11 @@ class OperatorBase : public Observable<OperatorBase> {
     }
   }
 
-  // TODO(jerryzh): Remove template
-  // and the type argument?
-  // This is to keep the API changes minimal and make refactoring
-  // a bit easier
-  template <typename T>
-  inline const T& Input(int idx, DeviceType type) {
-    static_assert(
-        std::is_same<T, Tensor>::value,
-        "Input(int, DeviceType) is only available for Tensor");
-    DCHECK_LT(idx, inputs_.size());
-    try {
-      // TODO(jerryzh): We'll need to check device type in Get<T>() later
-      // Get<T>() -> Get<T>(type)
-      const auto& tensor = inputs_.at(idx)->template Get<T>();
-      return tensor;
-    } catch (::caffe2::EnforceNotMet& enf) {
-      if (has_debug_def()) {
-        enf.AppendMessage(".\nOffending Blob name: ");
-        enf.AppendMessage(debug_def().input(idx));
-        enf.AppendMessage(".\n");
-      }
-      throw enf;
-    }
-  }
-
   template <typename T>
   inline T* Output(int idx) {
     return outputs_.at(idx)->template GetMutable<T>();
   }
 
-  // TODO(jerryzh): Remove this template
-  template <typename T>
-  inline T* Output(int idx, DeviceType type) {
-    static_assert(
-        std::is_same<T, Tensor>::value,
-        "Output(int, DeviceType) is only available for Tensor");
-    return outputs_.at(idx)->GetMutableTensor(type);
-  }
-
   template <typename T>
   inline T* Output(int idx, T* allocated) {
     outputs_.at(idx)->Reset(allocated);
@@ -137,29 +103,11 @@ class OperatorBase : public Observable<OperatorBase> {
     return inputs_.at(idx)->template IsType<T>();
   }
 
-  template <typename T>
-  inline bool InputIsType(int idx, DeviceType device_type) {
-    static_assert(
-        std::is_same<T, Tensor>::value,
-        "InputIsType(idx, DeviceType) only available on "
-        "Tensor types.");
-    return inputs_.at(idx)->template IsType<T>(device_type);
-  }
-
   template <typename T>
   inline bool OutputIsType(int idx) {
     return outputs_.at(idx)->template IsType<T>();
   }
 
-  template <typename T>
-  inline bool OutputIsType(int idx, DeviceType type) {
-    static_assert(
-        std::is_same<T, Tensor>::value,
-        "OutputIsType(idx, DeviceType) only available on "
-        "Tensor types.");
-    return outputs_.at(idx)->template IsType<T>(type);
-  }
-
   inline int InputSize() const {
     return inputs_.size();
   }
@@ -432,14 +380,11 @@ class Operator : public OperatorBase {
   }
   ~Operator() noexcept override {}
 
-  inline const Tensor& Input(
-      int idx,
-      DeviceType type = Context::GetDeviceType()) {
-    return OperatorBase::template Input<Tensor>(idx, type);
+  inline const Tensor<Context>& Input(int idx) {
+    return OperatorBase::template Input<Tensor<Context>>(idx);
   }
-
-  inline Tensor* Output(int idx, DeviceType type = Context::GetDeviceType()) {
-    return OperatorBase::template Output<Tensor>(idx, type);
+  inline Tensor<Context>* Output(int idx) {
+    return OperatorBase::template Output<Tensor<Context>>(idx);
   }
 
   void WaitEvent(const Event& ev, int stream_id = -1) final {
@@ -767,8 +712,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
       return DispatchHelper<TensorTypes<Types...>, ExtraArgs...>::             \
           template call<Op>(op, meta);                                         \
     }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Tensor& tensor) {                           \
+    template <typename Op, typename Context>                                   \
+    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
       return call<Op>(op, tensor.meta());                                      \
     }                                                                          \
     template <typename Op>                                                     \
@@ -783,8 +728,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
     static bool call(Op* /* unused */, const TypeMeta& meta) {                 \
       CAFFE_THROW("Unsupported type of tensor: ", meta.name());                \
     }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Tensor& tensor) {                           \
+    template <typename Op, typename Context>                                   \
+    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
       return call<Op>(op, tensor.meta());                                      \
     }                                                                          \
     template <typename Op>                                                     \
@@ -801,8 +746,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
     static bool call(Op* op, const TypeMeta&) {                                \
       return op->template DoRunWithOtherType<ExtraArgs...>();                  \
     }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Tensor& tensor) {                           \
+    template <typename Op, typename Context>                                   \
+    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
       return call<Op>(op, tensor.meta());                                      \
     }                                                                          \
     template <typename Op>                                                     \
diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc
index 1944874437d73..fba9c9d56a1c8 100644
--- a/caffe2/core/plan_executor.cc
+++ b/caffe2/core/plan_executor.cc
@@ -131,7 +131,8 @@ struct WorkspaceIdInjector {
           "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob");
       int32_t global_ws_id = (seq_++) + (static_cast<int32_t>(node_id) << 16);
       Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID);
-      TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU);
+      TensorCPU* global_ws_id_tensor =
+          global_ws_id_blob->template GetMutable<TensorCPU>();
       global_ws_id_tensor->Resize();
       global_ws_id_tensor->template mutable_data<int32_t>()[0] = global_ws_id;
       VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id;
diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc
index cb80f90aa02c5..2aaa7a2dac3a3 100644
--- a/caffe2/core/predictor.cc
+++ b/caffe2/core/predictor.cc
@@ -14,7 +14,7 @@ void enforceIsTensor(Workspace* ws, const std::string& name) {
   auto blob = ws->GetBlob(name);
   CAFFE_ENFORCE(blob, "Blob does not exist: ", name);
   CAFFE_ENFORCE(
-      blob->template IsType<Tensor>(CPU), "Blob is not a CPU Tensor: ", name);
+      blob->template IsType<TensorCPU>(), "Blob is not a CPU Tensor: ", name);
 }
 
 void shareInputTensor(
@@ -24,7 +24,7 @@ void shareInputTensor(
   enforceIsTensor(ws, name);
   auto* blob = ws->GetBlob(name);
   CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->template GetMutable<TensorCPU>();
   tensor->ResizeLike(*input);
   tensor->ShareData(*input);
 }
@@ -33,7 +33,7 @@ TensorCPU* extractOutputTensor(Workspace* ws, const std::string& name) {
   enforceIsTensor(ws, name);
   auto* blob = ws->GetBlob(name);
   CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
-  return blob->GetMutableTensor(CPU);
+  return blob->template GetMutable<TensorCPU>();
 }
 
 // We don't use the getNet() from predictor_utils.cc here because that file
@@ -115,7 +115,7 @@ Predictor::Predictor(
   for (const auto& name : predict_net->external_input()) {
     if (!initialized.count(name)) {
       auto* blob = ws_.CreateBlob(name);
-      blob->GetMutableTensor(CPU);
+      blob->template GetMutable<TensorCPU>();
     }
   }
 
diff --git a/caffe2/core/predictor_test.cc b/caffe2/core/predictor_test.cc
index c8c00538eaa6c..a37dbbb9e8d39 100644
--- a/caffe2/core/predictor_test.cc
+++ b/caffe2/core/predictor_test.cc
@@ -135,7 +135,7 @@ std::unique_ptr<Blob> randomTensor(
     const std::vector<TIndex>& dims,
     CPUContext* ctx) {
   auto blob = make_unique<Blob>();
-  auto* t = blob->GetMutableTensor(CPU);
+  auto* t = blob->GetMutable<TensorCPU>();
   t->Resize(dims);
   math::RandUniform<float, CPUContext>(
       t->size(), -1.0, 1.0, t->template mutable_data<float>(), ctx);
@@ -178,7 +178,7 @@ class PredictorTest : public testing::Test {
 
 TEST_F(PredictorTest, SimpleBatchSized) {
   auto inputData = randomTensor({1, 4}, ctx_.get());
-  Predictor::TensorVector input{inputData->GetMutableTensor(CPU)};
+  Predictor::TensorVector input{inputData->template GetMutable<TensorCPU>()};
   Predictor::TensorVector output;
   p_->run(input, &output);
   EXPECT_EQ(output.size(), 1);
@@ -190,7 +190,8 @@ TEST_F(PredictorTest, SimpleBatchSized) {
 
 TEST_F(PredictorTest, SimpleBatchSizedMapInput) {
   auto inputData = randomTensor({1, 4}, ctx_.get());
-  Predictor::TensorMap input{{"data", inputData->GetMutableTensor(CPU)}};
+  Predictor::TensorMap input{
+      {"data", inputData->template GetMutable<TensorCPU>()}};
   Predictor::TensorVector output;
   p_->run_map(input, &output);
   EXPECT_EQ(output.size(), 1);
@@ -215,7 +216,8 @@ class PredictorMetaNetDefTest : public testing::Test {
 
 TEST_F(PredictorMetaNetDefTest, SimpleMetaNetDefInitializer) {
   auto inputData = randomTensor({1, 4}, ctx_.get());
-  Predictor::TensorMap input{{"data", inputData->GetMutableTensor(CPU)}};
+  Predictor::TensorMap input{
+      {"data", inputData->template GetMutable<TensorCPU>()}};
   Predictor::TensorVector output;
   p_->run_map(input, &output);
   EXPECT_EQ(output.size(), 1);
diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc
index 83e907be0e263..1f115e14f6715 100644
--- a/caffe2/core/tensor.cc
+++ b/caffe2/core/tensor.cc
@@ -43,33 +43,9 @@ TensorPrinter::~TensorPrinter() {
   }
 }
 
-void TensorPrinter::PrintMeta(const Tensor& tensor) {
-  if (to_file_) {
-    (*log_file_) << MetaStr(tensor) << std::endl;
-  } else {
-    LOG(INFO) << MetaStr(tensor);
-  }
-}
-
-std::string TensorPrinter::MetaStr(const Tensor& tensor) {
-  std::stringstream meta_stream;
-  meta_stream << "Tensor " << tensor_name_ << " of type "
-              << tensor.meta().name() << ". Dims: (";
-  for (const auto dim : tensor.dims()) {
-    meta_stream << dim << ",";
-  }
-  meta_stream << "): ";
-  return meta_stream.str();
-}
-
-TypeMeta GetTensorType(const void* c) {
-  const Tensor* tc = static_cast<const Tensor*>(c);
-  return tc->meta();
-}
-
-// TODO(jerryzh): Remove
-static CaffeMap<CaffeTypeId, TypeCall> type_call_registry_{
-    {TypeMeta::Id<Tensor>(), GetTensorType}};
+static CaffeMap<CaffeTypeId, TypeCall> type_call_registry_ {
+  {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorType<CPUContext>}
+};
 
 TypeCall GetTypeCallFunction(CaffeTypeId id) {
   auto f = type_call_registry_.find(id);
@@ -83,26 +59,9 @@ void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c) {
   type_call_registry_[id] = c;
 }
 
-int GetGPUIDForPointer(const void* ptr);
-
-vector<TIndex> GetTensorInfo(
-    const void* c,
-    bool* shares_data,
-    size_t* capacity,
-    DeviceOption* device) {
-  const Tensor* tc = static_cast<const Tensor*>(c);
-  *shares_data = tc->shares_data();
-  *capacity = tc->capacity_nbytes();
-  tc->ExtractDeviceOption(device);
-  return tc->dims();
-}
-
-// since we only have one tensor, probably need to remove this at some point?
 static CaffeMap<CaffeTypeId, TensorInfoCall> tensor_info_call_registry_{
-    {TypeMeta::Id<Tensor>(), GetTensorInfo}};
+    {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorInfo<CPUContext>}};
 
-// TODO: Remove this code in a separate diff, since we only have one
-// GetTensorInfo function now
 TensorInfoCall GetTensorInfoFunction(CaffeTypeId id) {
   auto f = tensor_info_call_registry_.find(id);
   if (f == tensor_info_call_registry_.end()) {
@@ -115,20 +74,11 @@ void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c) {
   tensor_info_call_registry_[id] = c;
 }
 
-void TensorVectorResize(std::vector<Tensor>& tensors,
-                        int size,
-                        DeviceType type) {
-  tensors.reserve(size);
-  for (auto i = 0; i < size; ++i) {
-    tensors.emplace_back(type);
-  }
-}
-
 namespace {
 
-struct TensorStatGetter : BlobStatGetter {
+struct TensorCPUStatGetter : BlobStatGetter {
   size_t sizeBytes(const Blob& blob) const override {
-    const auto& tensor = blob.Get<Tensor>();
+    const auto& tensor = blob.Get<TensorCPU>();
     auto nbytes = tensor.nbytes();
     if (nbytes > 0 && tensor.IsType<std::string>()) {
       const auto* data = tensor.data<std::string>();
@@ -139,7 +89,7 @@ struct TensorStatGetter : BlobStatGetter {
     return nbytes;
   }
 };
-REGISTER_BLOB_STAT_GETTER(Tensor, TensorStatGetter);
+REGISTER_BLOB_STAT_GETTER(TensorCPU, TensorCPUStatGetter);
 }
 
 } // namespace caffe2
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 2c150d6d0d55a..a0a170505acec 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -89,10 +89,13 @@ inline int canonical_axis_index_(int axis_index, int ndims) {
  * the allocation and de-allocation of such memory. We make a simplified
  * assumption that the memory is always contiguous.
  */
+template <class Context>
 class Tensor {
  public:
-  Tensor() = delete;
-  explicit Tensor(DeviceType type) : device_type_(type) {}
+  /**
+   * Initializes an empty tensor.
+   */
+  Tensor() {}
 
   /**
    * @brief Creates a tensor of the given dimension.
@@ -100,86 +103,67 @@ class Tensor {
    * Note that the actual data allocation is not going to be carried out until
    * the first time mutable_data() is called.
    */
-  explicit Tensor(const vector<TIndex>& dims, DeviceType type)
-      : device_type_(type) {
-    Resize(dims);
-  }
-  explicit Tensor(const vector<int>& dims, DeviceType type)
-      : device_type_(type) {
-    Resize(dims);
-  }
+  explicit Tensor(const vector<TIndex>& dims) { Resize(dims); }
+  explicit Tensor(const vector<int>& dims) { Resize(dims); }
 
-  /* Now we require that context_for_copy has the same device type as src since template
-   * is removed
+  /**
+   * @brief Creates a tensor from a source tensor, copying over the content.
+   *
+   * Note that the source tensor can be from a different device context. The
+   * second argument provides a device context object (either Context or
+   * SrcContext) that will be responsible for copying the underlying data.
+   * If you do not wish to pass in a Context object, an equivalent constructor
+   * function exists that will create an implicit context object for copy, but
+   * be noted that this will cause a potential performance hit.
    */
-  Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) : device_type_(type) {
-    CopyFrom(src, context_for_copy);
+  template <class SrcContext, class ContextForCopy>
+  Tensor(const Tensor<SrcContext>& src, ContextForCopy* context) {
+    CopyFrom(src, context);
   }
 
   /**
-   * @brief: Create a Tensor of DeviceType `type` and initialize it with
-   * src Tensor
+   * @brief Creates a tensor from a source tensor, copying over the content.
+   *
+   * Note that this may have a potential performance hit, since a temporary
+   * context object will be created for the memory copy. Prefer explicitly
+   * providing a context for copy if you can.
+   *
+   * Since it's a potentially expensive operation - making copy constructor
+   * explicit here. If SrcContext != Context it's actually a typecast
+   * constructor and it should be definitely explicit.
    */
-  Tensor(const Tensor& src, DeviceType type) : device_type_(type) {
+  template <class SrcContext>
+  explicit Tensor(const Tensor<SrcContext>& src) {
     CopyFrom(src);
   }
 
   /**
    * @brief Creates a tensor, and fills its contents with the given values.
-   * The type of tensor will be decided by the context parameter
    */
   template <typename T>
-  Tensor(
-      const vector<TIndex>& dims,
-      const vector<T>& values,
-      BaseContext* context)
+  Tensor(const vector<TIndex>& dims, const vector<T>& values, Context* context)
       : meta_(TypeMeta::Make<T>()) {
     Resize(dims);
     CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size_);
-    device_type_ = context->GetDevicetype();
-    context->CopyItemsFromCPU(meta_, size_, values.data(), mutable_data<T>());
+    context->template Copy<T, CPUContext, Context>(size_, values.data(), mutable_data<T>());
   }
 
   /**
    * @brief Creates a scalar tensor, and fills its content with the given value.
-   * The type of tensor will be decided by the context parameter
    */
-  template <
-      typename T,
-      typename = typename std::enable_if<std::is_scalar<T>::value>::type>
-  Tensor(const T& value, BaseContext* context) : meta_(TypeMeta::Make<T>()) {
+  template <typename T,
+            typename = typename std::enable_if<std::is_scalar<T>::value>::type>
+  Tensor(const T& value, Context* context) {
     Resize(vector<TIndex>{});
-    device_type_ = context->GetDevicetype();
-    context->CopyItemsFromCPU(meta_, size_, &value, mutable_data<T>());
-  }
-
-  /*
-   * Since we removed template from tensor, we now store a static
-   * context pointer in tensor, which indicates the type of the tensor.
-   */
-  BaseStaticContext* GetStaticContext() const {
-    return GET_STATIC_CONTEXT(device_type_);
-  }
-
-  /* @brief
-   * Create a context that has the same device_type
-   * as the tensor.
-   * Note that this doesn't support passing in argument
-   * TODO(jerryzh): move this to a global registry
-   * that can create context for us
-   */
-  std::unique_ptr<BaseContext> CreateContext() const {
-    return GetStaticContext()->CreateContext();
+    context->template Copy<T, CPUContext, Context>(size_, &value, mutable_data<T>());
   }
 
-  DeviceType GetDeviceType() const {
-    return device_type_;
-  }
   /**
    * @brief Copies the data from a source tensor, with a contex provided to
    * carry out the underlying memcpy operation.
    */
-  void CopyFrom(const Tensor& src, BaseContext* context = nullptr) {
+  template <class SrcContext, class ContextForCopy>
+  void CopyFrom(const Tensor<SrcContext>& src, ContextForCopy* context) {
     if ((void*)&src == (void*)this) {
       return;
     }
@@ -196,39 +180,27 @@ class Tensor {
     Resize(src.dims());
     if (size() > 0) {
       if (meta_.copy()) {
-        CAFFE_ENFORCE(
-            GetDeviceType() == CPU,
-            "In CopyFrom source and dest tensors must both be CPU for meta copy");
-        CAFFE_ENFORCE(
-            src.GetDeviceType() == CPU,
-            "In CopyFrom source and dest tensors must both be CPU for meta copy");
         meta_.copy()(src.raw_data(), raw_mutable_data(), size());
       } else {
-        // We'll need to use a non-CPU context to perform the copy if
-        // one of the context is not CPU since only non-CPU context
-        // knows how to copy between CPU and that context
-        if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) {
-          if (!context) {
-            src.CreateContext().get()->CopyBytesToDevice(
-                nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
-          } else {
-            CAFFE_ENFORCE(
-                context->GetDevicetype() == src.GetDeviceType(),
-                "Type for provided context does not match the type of source");
-            context->CopyBytesToDevice(
-                nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
-          }
-        } else {
-          // In case source context is CPU, and target context is non-CPU
-          // We'll have to create a Context from target and perform the
-          // copy using that context
-          CreateContext().get()->CopyBytesFromCPU(
-              nbytes(), src.raw_data(), raw_mutable_data());
-        }
+        context->template CopyBytes<SrcContext, Context>(
+            nbytes(), src.raw_data(), raw_mutable_data());
       }
     }
   }
 
+  /**
+   * @brief Copies the data from a source tensor.
+   *
+   * Note that this may have a potential performance hit, since a temporary
+   * context object will be created for the memory copy. Prefer explicitly
+   * providing a context for copy if you can.
+   */
+  template <class SrcContext>
+  inline void CopyFrom(const Tensor<SrcContext>& src) {
+    SrcContext tmp_context;
+    CopyFrom(src, &tmp_context);
+  }
+
   virtual ~Tensor() noexcept {}
 
   /**
@@ -240,7 +212,8 @@ class Tensor {
    * growthPct. This ensures that Extend runs on an amortized O(1) time
    * complexity.
    */
-  void Extend(TIndex num, float growthPct, BaseContext* context) {
+  template <class ContextForCopy>
+  void Extend(TIndex num, float growthPct, ContextForCopy* context) {
     CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
     auto newDims = dims_;
     newDims[0] += num;
@@ -266,8 +239,8 @@ class Tensor {
     size_ = newSize;
   }
 
-  template <class T>
-  void Reserve(const std::vector<T>& newCapacity, BaseContext* context) {
+  template <class T, class ContextForCopy>
+  void Reserve(const std::vector<T>& newCapacity, ContextForCopy* context) {
     auto newSize = std::accumulate(
         newCapacity.begin(),
         newCapacity.end(),
@@ -281,7 +254,8 @@ class Tensor {
     auto oldDims = dims_;
     Resize(newCapacity);
     auto* newData = raw_mutable_data(meta_);
-    context->CopyItemsSameDevice(meta_, oldSize, oldData.get(), newData);
+    context->template CopyItems<ContextForCopy, ContextForCopy>(
+        meta_, oldSize, oldData.get(), newData);
     dims_ = oldDims;
     size_ = oldSize;
     reserved_ = true;
@@ -346,7 +320,8 @@ class Tensor {
    * Resize the tensor like the source tensor. Note that this is just a
    * sugar wrapper that essentially calls Resize(src_tensor.dims()).
    */
-  inline void ResizeLike(const Tensor& src_tensor) {
+  template <class OtherContext>
+  inline void ResizeLike(const Tensor<OtherContext>& src_tensor) {
     // Note: need casting for different context types.
     if (static_cast<void*>(this) != static_cast<const void*>(&src_tensor)) {
       Resize(src_tensor.dims());
@@ -409,7 +384,7 @@ class Tensor {
     return ss.str();
   }
 
-  void swap(Tensor& other) noexcept {
+  void swap(Tensor<Context>& other) {
     std::swap(dims_, other.dims_);
     std::swap(size_, other.size_);
     std::swap(meta_, other.meta_);
@@ -417,7 +392,6 @@ class Tensor {
     std::swap(shares_data_, other.shares_data_);
     std::swap(capacity_, other.capacity_);
     std::swap(reserved_, other.reserved_);
-    std::swap(device_type_, other.device_type_);
   }
 
   /**
@@ -568,8 +542,7 @@ class Tensor {
         // destruction procedure.
         auto size = size_;
         auto dtor = meta_.dtor();
-        auto ptr_and_deleter =
-            GetStaticContext()->New(size_ * meta_.itemsize());
+        auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
         auto deleter = ptr_and_deleter.second;
         data_.reset(
             ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void {
@@ -579,8 +552,7 @@ class Tensor {
         meta_.ctor()(data_.get(), size_);
       } else {
         // For fundamental type, new and delete is easier.
-        auto ptr_and_deleter =
-            GetStaticContext()->New(size_ * meta_.itemsize());
+        auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
         data_.reset(ptr_and_deleter.first, ptr_and_deleter.second);
       }
       capacity_ = size_ * meta_.itemsize();
@@ -718,28 +690,20 @@ class Tensor {
     return dims_[i];
   }
 
-  // We don't allow change to the type of
-  // tensor after initialization
   Tensor Clone() const {
-    Tensor x(GetDeviceType());
+    Tensor x;
     x.CopyFrom(*this);
     return x;
   }
 
-  Tensor(Tensor&& src) noexcept {
+  Tensor(Tensor<Context>&& src) noexcept {
     swap(src);
   }
 
-  Tensor& operator=(Tensor&&) = default;
-
   /**
    * @brief Delete the copy constructor and use Clone explicitly
    */
-  Tensor(const Tensor& src) = delete;
-
-  void ExtractDeviceOption(DeviceOption* device) const {
-    GetStaticContext()->ExtractDeviceOption(device, raw_data());
-  }
+  Tensor(const Tensor<Context>& src) = delete;
 
  protected:
   vector<TIndex> dims_;
@@ -749,7 +713,6 @@ class Tensor {
   bool shares_data_ = false;
   size_t capacity_ = 0;
   bool reserved_ = false;
-  DeviceType device_type_ = CPU;
   // In case of chunk load we store how much data was already loaded
 
  private:
@@ -822,7 +785,8 @@ class Tensor {
   Tensor& operator=(const Tensor& src) = delete;
 };
 
-using TensorCPU = Tensor;
+// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
+typedef Tensor<CPUContext> TensorCPU;
 
 constexpr int k_limit_default_ = 1000;
 
@@ -831,6 +795,12 @@ typedef TypeMeta (*TypeCall)(const void*);
 TypeCall GetTypeCallFunction(CaffeTypeId id);
 void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c);
 
+template <class Context>
+TypeMeta GetTensorType(const void* c) {
+  const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
+  return tc->meta();
+}
+
 // Shape call registry
 typedef vector<TIndex> (*TensorInfoCall)(
     const void*,
@@ -840,8 +810,19 @@ typedef vector<TIndex> (*TensorInfoCall)(
 TensorInfoCall GetTensorInfoFunction(CaffeTypeId id);
 void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c);
 
-// resize helper function
-void TensorVectorResize(std::vector<Tensor>& tensors, int size, DeviceType type);
+template <class Context>
+vector<TIndex> GetTensorInfo(
+    const void* c,
+    bool* shares_data,
+    size_t* capacity,
+    DeviceOption* device) {
+  const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
+  *shares_data = tc->shares_data();
+  *capacity = tc->capacity_nbytes();
+  device->set_device_type(CPU);
+  device->set_cuda_gpu_id(0);
+  return tc->dims();
+}
 
 class TensorPrinter {
  public:
@@ -852,11 +833,13 @@ class TensorPrinter {
   ~TensorPrinter();
 
   template <class T>
-  void Print(const Tensor& tensor);
+  void Print(const Tensor<CPUContext>& tensor);
 
-  void PrintMeta(const Tensor& tensor);
+  template <class Context>
+  void PrintMeta(const Tensor<Context>& tensor);
 
-  string MetaStr(const Tensor& tensor);
+  template <class Context>
+  string MetaStr(const Tensor<Context>& tensor);
 
  private:
   bool to_file_;
@@ -866,7 +849,7 @@ class TensorPrinter {
 };
 
 template <class T>
-void TensorPrinter::Print(const Tensor& tensor) {
+void TensorPrinter::Print(const Tensor<CPUContext>& tensor) {
   std::stringstream values_stream;
   // One most likely doesn't want to print int64-number of items for visual
   // inspection, so we cast down to int here.
@@ -886,5 +869,26 @@ void TensorPrinter::Print(const Tensor& tensor) {
   }
 }
 
+template <class Context>
+void TensorPrinter::PrintMeta(const Tensor<Context>& tensor) {
+  if (to_file_) {
+    (*log_file_) << MetaStr(tensor) << std::endl;
+  } else {
+    LOG(INFO) << MetaStr(tensor);
+  }
+}
+
+template <class Context>
+std::string TensorPrinter::MetaStr(const Tensor<Context>& tensor) {
+  std::stringstream meta_stream;
+  meta_stream << "Tensor " << tensor_name_ << " of type "
+              << tensor.meta().name() << ". Dims: (";
+  for (const auto dim : tensor.dims()) {
+    meta_stream << dim << ",";
+  }
+  meta_stream << "): ";
+  return meta_stream.str();
+}
+
 }  // namespace caffe2
 #endif  // CAFFE2_CORE_TENSOR_H_
diff --git a/caffe2/core/tensor_int8.h b/caffe2/core/tensor_int8.h
index ec7d2aaa618a3..93efe66a79de3 100644
--- a/caffe2/core/tensor_int8.h
+++ b/caffe2/core/tensor_int8.h
@@ -3,7 +3,6 @@
 
 #include "caffe2/core/context.h"
 #include "caffe2/core/tensor.h"
-#include "caffe2/proto/caffe2.pb.h"
 
 namespace caffe2 {
 namespace int8 {
@@ -13,7 +12,7 @@ struct Int8TensorCPU {
   int32_t zero_point{0};
   // Generally stores uint8_t data, but sometimes int32_t (e.g. bias
   // parameters).
-  Tensor t{CPU};
+  TensorCPU t;
 };
 } // namespace int8
 } // namespace caffe2
diff --git a/caffe2/core/typeid.cc b/caffe2/core/typeid.cc
index ba81e3babc6d0..2e1214656479b 100644
--- a/caffe2/core/typeid.cc
+++ b/caffe2/core/typeid.cc
@@ -69,7 +69,8 @@ CaffeTypeId CaffeTypeId::createTypeId() {
   return CaffeTypeId(new_value);
 }
 
-CAFFE_DEFINE_KNOWN_TYPE(Tensor);
+CAFFE_DEFINE_KNOWN_TYPE(Tensor<CPUContext>);
+CAFFE_DEFINE_KNOWN_TYPE(Tensor<CUDAContext>);
 CAFFE_DEFINE_KNOWN_TYPE(float);
 CAFFE_DEFINE_KNOWN_TYPE(int);
 CAFFE_DEFINE_KNOWN_TYPE(std::string);
diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h
index b4a01b57cc11e..6a497861af8e3 100644
--- a/caffe2/core/typeid.h
+++ b/caffe2/core/typeid.h
@@ -437,37 +437,41 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
       #T);                                                     \
   }
 
+template <class Context>
 class Tensor;
+class CPUContext;
+class CUDAContext;
 
 // note: first preallocated id is 1, because 0 is used for uninitialized type
 // ids.
 struct _CaffeHighestPreallocatedTypeId final {};
 
-CAFFE_DECLARE_KNOWN_TYPE(1, Tensor);
-CAFFE_DECLARE_KNOWN_TYPE(2, float);
-CAFFE_DECLARE_KNOWN_TYPE(3, int);
-CAFFE_DECLARE_KNOWN_TYPE(4, std::string);
-CAFFE_DECLARE_KNOWN_TYPE(5, bool);
-CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t);
-CAFFE_DECLARE_KNOWN_TYPE(7, int8_t);
-CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t);
-CAFFE_DECLARE_KNOWN_TYPE(9, int16_t);
-CAFFE_DECLARE_KNOWN_TYPE(10, int64_t);
-CAFFE_DECLARE_KNOWN_TYPE(11, double);
-CAFFE_DECLARE_KNOWN_TYPE(12, char);
-CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr<std::mutex>);
-CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::atomic<bool>>);
-CAFFE_DECLARE_KNOWN_TYPE(15, std::vector<int32_t>);
-CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int64_t>);
-CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<unsigned long>);
-CAFFE_DECLARE_KNOWN_TYPE(18, bool*);
-CAFFE_DECLARE_KNOWN_TYPE(19, char*);
-CAFFE_DECLARE_KNOWN_TYPE(20, int*);
+CAFFE_DECLARE_KNOWN_TYPE(1, Tensor<CPUContext>);
+CAFFE_DECLARE_KNOWN_TYPE(2, Tensor<CUDAContext>);
+CAFFE_DECLARE_KNOWN_TYPE(3, float);
+CAFFE_DECLARE_KNOWN_TYPE(4, int);
+CAFFE_DECLARE_KNOWN_TYPE(5, std::string);
+CAFFE_DECLARE_KNOWN_TYPE(6, bool);
+CAFFE_DECLARE_KNOWN_TYPE(7, uint8_t);
+CAFFE_DECLARE_KNOWN_TYPE(8, int8_t);
+CAFFE_DECLARE_KNOWN_TYPE(9, uint16_t);
+CAFFE_DECLARE_KNOWN_TYPE(10, int16_t);
+CAFFE_DECLARE_KNOWN_TYPE(11, int64_t);
+CAFFE_DECLARE_KNOWN_TYPE(12, double);
+CAFFE_DECLARE_KNOWN_TYPE(13, char);
+CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::mutex>);
+CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr<std::atomic<bool>>);
+CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int32_t>);
+CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<int64_t>);
+CAFFE_DECLARE_KNOWN_TYPE(18, std::vector<unsigned long>);
+CAFFE_DECLARE_KNOWN_TYPE(19, bool*);
+CAFFE_DECLARE_KNOWN_TYPE(20, char*);
+CAFFE_DECLARE_KNOWN_TYPE(21, int*);
 
 #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
-CAFFE_DECLARE_KNOWN_TYPE(21, long);
-CAFFE_DECLARE_KNOWN_TYPE(22, std::vector<long>);
+CAFFE_DECLARE_KNOWN_TYPE(22, long);
+CAFFE_DECLARE_KNOWN_TYPE(23, std::vector<long>);
 #endif // CAFFE2_UNIQUE_LONG_TYPEMETA
 
-CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId);
+CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId);
 }
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h
index 4a759b8703dc4..a593604114d3e 100644
--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@@ -136,14 +136,14 @@ class Workspace {
       auto* from_blob = parent_ws->GetBlob(ws_blob.second);
       CAFFE_ENFORCE(from_blob);
       CAFFE_ENFORCE(
-          from_blob->template IsType<Tensor>(),
+          from_blob->template IsType<Tensor<Context>>(),
           "Expected blob with tensor value",
           ws_blob.second);
       forwarded_blobs_.erase(blob);
       auto* to_blob = CreateBlob(blob);
       CAFFE_ENFORCE(to_blob);
-      const auto& from_tensor = from_blob->template Get<Tensor>();
-      auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType());
+      const auto& from_tensor = from_blob->template Get<Tensor<Context>>();
+      auto* to_tensor = to_blob->template GetMutable<Tensor<Context>>();
       to_tensor->CopyFrom(from_tensor);
     }
   }
diff --git a/caffe2/experiments/operators/fully_connected_op_decomposition.h b/caffe2/experiments/operators/fully_connected_op_decomposition.h
index ae6a228684670..f06877f188b8e 100644
--- a/caffe2/experiments/operators/fully_connected_op_decomposition.h
+++ b/caffe2/experiments/operators/fully_connected_op_decomposition.h
@@ -100,8 +100,8 @@ class FullyConnectedOpDecomp final : public Operator<Context> {
   }
 
  protected:
-  Tensor bias_multiplier_{Context::GetDeviceType()};
-  Tensor multi_buffer_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
+  Tensor<Context> multi_buffer_;
 };
 
 template <typename T, class Context, class Engine=DefaultEngine>
@@ -207,10 +207,10 @@ class FullyConnectedDecompGradientOp : public Operator<Context> {
   }
 
  protected:
-  Tensor bias_multiplier_{Context::GetDeviceType()};
-  Tensor du_buffer_{Context::GetDeviceType()};
-  Tensor dv_buffer_{Context::GetDeviceType()};
-  Tensor dx_buffer_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
+  Tensor<Context> du_buffer_;
+  Tensor<Context> dv_buffer_;
+  Tensor<Context> dx_buffer_;
 };
 
 }  // namespace caffe2
diff --git a/caffe2/experiments/operators/fully_connected_op_prune.h b/caffe2/experiments/operators/fully_connected_op_prune.h
index 3f6c24ef4a3f0..5b9508f26f2b5 100644
--- a/caffe2/experiments/operators/fully_connected_op_prune.h
+++ b/caffe2/experiments/operators/fully_connected_op_prune.h
@@ -189,7 +189,7 @@ namespace caffe2 {
         }
 
       protected:
-       Tensor bias_multiplier_{Context::GetDeviceType()};
+        Tensor<Context> bias_multiplier_;
     };
 
   template <typename T, class Context, class Engine=DefaultEngine>
@@ -343,9 +343,9 @@ namespace caffe2 {
         }
 
       protected:
-       Tensor bias_multiplier_{Context::GetDeviceType()};
-       Tensor sum_buffer_{Context::GetDeviceType()};
-       Tensor comp_r_buf_{Context::GetDeviceType()};
+        Tensor<Context> bias_multiplier_;
+        Tensor<Context> sum_buffer_;
+        Tensor<Context> comp_r_buf_;
     };
 
 }  // namespace caffe2
diff --git a/caffe2/experiments/operators/fully_connected_op_sparse.h b/caffe2/experiments/operators/fully_connected_op_sparse.h
index 6f19c1bacdc5b..a5abe18a07484 100644
--- a/caffe2/experiments/operators/fully_connected_op_sparse.h
+++ b/caffe2/experiments/operators/fully_connected_op_sparse.h
@@ -106,7 +106,7 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
     const auto& jw = Input(3);
     // Notice that we do not need to transpose b
     const auto& b = Input(4);
-    auto* Yt = Output(0); // transposed Y
+    auto* Yt = Output(0); //transposed Y
     // here we assume X is k-by-m
     CAFFE_ENFORCE_EQ(Xt.ndim(), 2);
     CAFFE_ENFORCE_EQ(b.ndim(), 1);
@@ -140,7 +140,7 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
   }
 
  protected:
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
 };
 
 
diff --git a/caffe2/experiments/operators/sparse_matrix_reshape_op.h b/caffe2/experiments/operators/sparse_matrix_reshape_op.h
index b6686ade1aabd..8c8d51c4ed01d 100644
--- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h
+++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h
@@ -104,6 +104,7 @@ class SparseMatrixReshapeOp : public Operator<Context> {
     CAFFE_ENFORCE(
         old_row.size() == nnz,
         "Column and row tensors must have the same size.");
+
     auto* new_col = Output(0);
     auto* new_row = Output(1);
     new_col->Resize(nnz);
diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc
index eb2d5b6acf1a6..f589185caa0f3 100644
--- a/caffe2/ideep/operators/concat_split_op.cc
+++ b/caffe2/ideep/operators/concat_split_op.cc
@@ -27,7 +27,7 @@ class IDEEPConcatOp final : public IDEEPOperator {
   bool RunOnDevice() override {
     const auto& input_zero = Input(INPUT0);
     auto* output = Output(OUTPUT);
-    TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO, CPU);
+    TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO);
 
     vector<itensor> inputs;
     for (int i = 0; i < InputSize(); ++i) {
@@ -88,7 +88,7 @@ class IDEEPSplitOp final : public IDEEPOperator {
           0,
           "If you set split with an input blob, do not pass in "
           "split in the argument.");
-      auto& axis_info = OperatorBase::Input<Tensor>(AXIS_INFO, CPU);
+      auto& axis_info = OperatorBase::Input<TensorCPU>(AXIS_INFO);
       CAFFE_ENFORCE_EQ(axis_info.size(), OutputSize());
       auto* axis_data = axis_info.template data<int>();
       axis_vdata.assign(axis_data, axis_data + OutputSize());
diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h
index ad39e641ed933..44eb9c7a430a8 100644
--- a/caffe2/ideep/operators/operator_fallback_ideep.h
+++ b/caffe2/ideep/operators/operator_fallback_ideep.h
@@ -74,7 +74,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
     for (int i = 0; i < InputSize(); ++i) {
       if (InputIsType<itensor>(i) && Input(i).get_data_type() == itensor::data_type::f32) {
         auto& input = Input(i);
-        auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
+        auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
         dtensor->Resize(input.get_dims());
         if (input.is_public_format()) {
           dtensor->ShareExternalPointer(static_cast<float*>(input.get_data_handle()));
@@ -85,7 +85,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
           InputIsType<itensor>(i) &&
           Input(i).get_data_type() == itensor::data_type::s32) {
         auto& input = Input(i);
-        auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
+        auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
         dtensor->Resize(input.get_dims());
         if (input.is_public_format()) {
           dtensor->ShareExternalPointer(
@@ -138,8 +138,8 @@ class IDEEPFallbackOp final : public IDEEPOperator {
         VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
         auto src_dims = src.dims();
         Blob* dst = OperatorBase::OutputBlob(i);
-        dst->Reset(new Tensor(CPU));
-        auto dtensor = dst->GetMutableTensor(CPU);
+        dst->Reset(new Tensor<CPUContext>());
+        auto dtensor = dst->template GetMutable<TensorCPU>();
         dtensor->Resize(src_dims);
         dtensor->ShareData(src);
       }
@@ -156,3 +156,4 @@ class IDEEPFallbackOp final : public IDEEPOperator {
 };
 
 } // namespace caffe2
+
diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc
index 194b949222bea..67d7d2ca2d732 100644
--- a/caffe2/ideep/operators/utility_ops.cc
+++ b/caffe2/ideep/operators/utility_ops.cc
@@ -10,7 +10,7 @@ class CopyCPUToIDEEPOp final : public IDEEPOperator {
   USE_IDEEP_DEF_ALIASES();
 
   bool RunOnDevice() override {
-    const auto& X = OperatorBase::Input<Tensor>(0, CPU);
+    const auto& X = OperatorBase::Input<TensorCPU>(0);
     auto* Y = OperatorBase::OutputBlob(0);
     itensor::dims src_dims(X.dims().begin(), X.dims().end());
     if (!(Y->template IsType<itensor>() &&
@@ -31,14 +31,14 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator {
   USE_IDEEP_DEF_ALIASES();
   bool RunOnDevice() override {
     const auto& input_blob = OperatorBase::InputBlob(0);
-    if (input_blob.template IsType<Tensor>(CPU)) {
+    if (input_blob.template IsType<TensorCPU>()) {
       VLOG(2) << "Directing sharing of TensorCPU";
       const auto& X = OperatorBase::Input<TensorCPU>(0);
-      auto* Y = OperatorBase::Output<Tensor>(0, CPU);
+      auto* Y = OperatorBase::Output<TensorCPU>(0);
       Y->CopyFrom(X);
     } else {
       const auto& X = OperatorBase::Input<itensor>(0);
-      auto* Y = OperatorBase::Output<Tensor>(0, CPU);
+      auto* Y = OperatorBase::Output<TensorCPU>(0);
       Y->Resize(X.get_dims());
       if (X.get_data_type() == itensor::data_type::f32) {
         X.reorder_to(Y->template mutable_data<float>());
diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h
index 38885af44c8a8..200d98fe08609 100644
--- a/caffe2/ideep/utils/ideep_context.h
+++ b/caffe2/ideep/utils/ideep_context.h
@@ -8,9 +8,7 @@
 
 namespace caffe2 {
 
-BaseStaticContext* GetIDEEPStaticContext();
-
-class IDEEPContext final : public BaseContext {
+class IDEEPContext final {
  public:
   typedef std::mt19937 rand_gen_type;
   IDEEPContext() : random_seed_(RandomNumberSeed()) {}
@@ -23,16 +21,10 @@ class IDEEPContext final : public BaseContext {
 
   ~IDEEPContext() noexcept {}
 
-  BaseStaticContext* GetStaticContext() const override {
-    return GetIDEEPStaticContext();
-  }
-
-  static BaseStaticContext* StaticContext() {
-    return GetIDEEPStaticContext();
-  }
-
   inline void SwitchToDevice(int /*stream_id*/) {}
-  using BaseContext::SwitchToDevice;
+  inline void SwitchToDevice() {
+    SwitchToDevice(0);
+  }
 
   inline void WaitEvent(const Event& ev) {
     ev.Wait(IDEEP, this);
@@ -54,32 +46,7 @@ class IDEEPContext final : public BaseContext {
   }
 
   inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return StaticContext()->New(nbytes);
-  }
-
-  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
-      override {
-    if (nbytes == 0) {
-      return;
-    }
-    CAFFE_ENFORCE(src);
-    CAFFE_ENFORCE(dst);
-    memcpy(dst, src, nbytes);
-  }
-
-  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  bool SupportsNonFundamentalTypes() const override {
-    // IDEEP meta copy is OK
-    return true;
+    return GetCPUAllocator()->New(nbytes);
   }
 
   // Two copy functions that deals with cross-device copies.
@@ -122,14 +89,6 @@ class IDEEPContext final : public BaseContext {
     return true;
   }
 
-  DeviceType GetDevicetype() const override {
-    return IDEEP;
-  }
-
-  static constexpr DeviceType GetDeviceType() {
-    return IDEEP;
-  }
-
  protected:
   // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
   int random_seed_{1701};
@@ -174,25 +133,4 @@ inline void IDEEPContext::CopyBytes<IDEEPContext, CPUContext>(
   CAFFE_ENFORCE(dst);
   memcpy(dst, src, nbytes);
 }
-
-class IDEEPStaticContext : public BaseStaticContext {
- public:
-  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
-    return GetCPUAllocator()->New(nbytes);
-  }
-
-  std::unique_ptr<BaseContext> CreateContext() override {
-    return caffe2::make_unique<IDEEPContext>();
-  }
-
-  std::unique_ptr<BaseContext> CreateContext(
-      const DeviceOption& option) override {
-    return caffe2::make_unique<IDEEPContext>(option);
-  }
-
-  DeviceType GetDeviceType() override {
-    return IDEEP;
-  }
-};
-
 } // namespace caffe2
diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc
index c9c22387de4aa..45335e9659d48 100644
--- a/caffe2/ideep/utils/ideep_register.cc
+++ b/caffe2/ideep/utils/ideep_register.cc
@@ -1,8 +1,7 @@
-#include <caffe2/core/event_cpu.h>
+#include <ideep_pin_singletons.hpp>
 #include <caffe2/core/operator.h>
 #include <caffe2/proto/caffe2.pb.h>
-#include <ideep_pin_singletons.hpp>
-#include "ideep_context.h"
+#include <caffe2/core/event_cpu.h>
 
 namespace caffe2 {
 
@@ -27,11 +26,4 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU);
 REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU);
 REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU);
 
-BaseStaticContext* GetIDEEPStaticContext() {
-  static IDEEPStaticContext context;
-  return &context;
-}
-
-REGISTER_STATIC_CONTEXT(IDEEP, GetIDEEPStaticContext());
-
 } // namespace caffe2
diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h
index 9dae032134bc0..a8c45ca87d46a 100644
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@@ -87,12 +87,12 @@ class ImageInputOp final
   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
   CPUContext cpu_context_;
-  Tensor prefetched_image_{CPU};
-  Tensor prefetched_label_{CPU};
+  TensorCPU prefetched_image_;
+  TensorCPU prefetched_label_;
   vector<TensorCPU> prefetched_additional_outputs_;
-  Tensor prefetched_image_on_device_{Context::GetDeviceType()};
-  Tensor prefetched_label_on_device_{Context::GetDeviceType()};
-  vector<Tensor> prefetched_additional_outputs_on_device_;
+  Tensor<Context> prefetched_image_on_device_;
+  Tensor<Context> prefetched_label_on_device_;
+  vector<Tensor<Context>> prefetched_additional_outputs_on_device_;
   // Default parameters for images
   PerImageArg default_arg_;
   int batch_size_;
@@ -118,8 +118,8 @@ class ImageInputOp final
   int crop_;
   std::vector<float> mean_;
   std::vector<float> std_;
-  Tensor mean_gpu_{Context::GetDeviceType()};
-  Tensor std_gpu_{Context::GetDeviceType()};
+  Tensor<Context> mean_gpu_;
+  Tensor<Context> std_gpu_;
   bool mirror_;
   bool is_test_;
   bool use_caffe_datum_;
@@ -154,6 +154,8 @@ ImageInputOp<Context>::ImageInputOp(
     Workspace* ws)
     : PrefetchOperator<Context>(operator_def, ws),
       reader_(nullptr),
+      prefetched_additional_outputs_(OutputSize() - 2),
+      prefetched_additional_outputs_on_device_(OutputSize() - 2),
       batch_size_(
           OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
       label_type_(static_cast<LABEL_TYPE>(
@@ -383,9 +385,6 @@ ImageInputOp<Context>::ImageInputOp(
   }
 
   for (int i = 0; i < additional_output_sizes.size(); ++i) {
-    prefetched_additional_outputs_on_device_.emplace_back(
-        Context::GetDeviceType());
-    prefetched_additional_outputs_.emplace_back(CPU);
     prefetched_additional_outputs_[i].Resize(
         TIndex(batch_size_), TIndex(additional_output_sizes[i]));
   }
@@ -1197,12 +1196,12 @@ bool ImageInputOp<Context>::Prefetch() {
   // If the context is not CPUContext, we will need to do a copy in the
   // prefetch function as well.
   if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
+    prefetched_image_on_device_.CopyFrom(prefetched_image_, &context_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_, &context_);
 
     for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
       prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &cpu_context_);
+          prefetched_additional_outputs_[i], &context_);
     }
   }
 
@@ -1213,13 +1212,13 @@ bool ImageInputOp<Context>::Prefetch() {
 
 template <class Context>
 bool ImageInputOp<Context>::CopyPrefetched() {
-  auto type = Context::GetDeviceType();
-  auto* image_output = OperatorBase::Output<Tensor>(0, type);
-  auto* label_output = OperatorBase::Output<Tensor>(1, type);
-  vector<Tensor*> additional_outputs_output;
+  auto* image_output = OperatorBase::Output<Tensor<Context> >(0);
+  auto* label_output = OperatorBase::Output<Tensor<Context> >(1);
+  vector<Tensor<Context>*> additional_outputs_output;
 
   for (int i = 2; i < OutputSize(); ++i) {
-    additional_outputs_output.push_back(OperatorBase::Output<Tensor>(i, type));
+    additional_outputs_output.push_back(
+        OperatorBase::Output<Tensor<Context>>(i));
   }
 
   // Note(jiayq): The if statement below should be optimized away by the
@@ -1239,12 +1238,10 @@ bool ImageInputOp<Context>::CopyPrefetched() {
         mean_gpu_.Resize(mean_.size());
         std_gpu_.Resize(std_.size());
 
-        context_.template CopyFromCPU<float>(
-            mean_.size(),
-            mean_.data(),
-            mean_gpu_.template mutable_data<float>());
-        context_.template CopyFromCPU<float>(
-            std_.size(), std_.data(), std_gpu_.template mutable_data<float>());
+        context_.template Copy<float, CPUContext, Context>(
+          mean_.size(), mean_.data(), mean_gpu_.template mutable_data<float>());
+        context_.template Copy<float, CPUContext, Context>(
+          std_.size(), std_.data(), std_gpu_.template mutable_data<float>());
         mean_std_copied_ = true;
       }
       // GPU transform kernel allows explicitly setting output type
diff --git a/caffe2/image/transform_gpu.cu b/caffe2/image/transform_gpu.cu
index bb557429f5ad6..c6d8d775332d9 100644
--- a/caffe2/image/transform_gpu.cu
+++ b/caffe2/image/transform_gpu.cu
@@ -50,12 +50,9 @@ __global__ void transform_kernel(
 
 template <typename T_IN, typename T_OUT, class Context>
 
-bool TransformOnGPU(
-    Tensor& X,
-    Tensor* Y,
-    Tensor& mean,
-    Tensor& std,
-    Context* context) {
+bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y,
+                    Tensor<Context>& mean, Tensor<Context>& std,
+                    Context *context) {
   // data comes in as NHWC
   const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
   // data goes out as NCHW
@@ -71,18 +68,16 @@ bool TransformOnGPU(
   return true;
 };
 
-template bool TransformOnGPU<uint8_t, float, CUDAContext>(
-    Tensor& X,
-    Tensor* Y,
-    Tensor& mean,
-    Tensor& std,
-    CUDAContext* context);
-
-template bool TransformOnGPU<uint8_t, float16, CUDAContext>(
-    Tensor& X,
-    Tensor* Y,
-    Tensor& mean,
-    Tensor& std,
-    CUDAContext* context);
+template bool TransformOnGPU<uint8_t, float, CUDAContext>(Tensor<CUDAContext>& X,
+                                                          Tensor<CUDAContext> *Y,
+                                                          Tensor<CUDAContext>& mean,
+                                                          Tensor<CUDAContext>& std,
+                                                          CUDAContext *context);
+
+template bool TransformOnGPU<uint8_t, float16, CUDAContext>(Tensor<CUDAContext>& X,
+                                                            Tensor<CUDAContext> *Y,
+                                                            Tensor<CUDAContext>& mean,
+                                                            Tensor<CUDAContext>& std,
+                                                            CUDAContext *context);
 
 }  // namespace caffe2
diff --git a/caffe2/image/transform_gpu.h b/caffe2/image/transform_gpu.h
index 3ca11ce159feb..a19b5251f5d72 100644
--- a/caffe2/image/transform_gpu.h
+++ b/caffe2/image/transform_gpu.h
@@ -31,12 +31,9 @@
 namespace caffe2 {
 
 template <typename T_IN, typename T_OUT, class Context>
-bool TransformOnGPU(
-    Tensor& X,
-    Tensor* Y,
-    Tensor& mean,
-    Tensor& std,
-    Context* context);
+bool TransformOnGPU(Tensor<Context>& X, Tensor<Context>* Y,
+                    Tensor<Context>& mean, Tensor<Context>& std,
+                    Context* context);
 
 }  // namespace caffe2
 
diff --git a/caffe2/mkl/mkl_utils_test.cc b/caffe2/mkl/mkl_utils_test.cc
index 72dcda2c8f6bb..678d643c5253f 100644
--- a/caffe2/mkl/mkl_utils_test.cc
+++ b/caffe2/mkl/mkl_utils_test.cc
@@ -23,10 +23,10 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
   int pads[2] = {0, 0};
 
   // Creating Input and output tensors
-  Tensor X(vector<TIndex>{16, 8, 32, 32}, CPU);
-  Tensor W(vector<TIndex>{64, 8, 3, 3}, CPU);
-  Tensor b(vector<TIndex>{64}, CPU);
-  Tensor Y(vector<TIndex>{16, 64, 30, 30}, CPU);
+  TensorCPU X(vector<TIndex>{16, 8, 32, 32});
+  TensorCPU W(vector<TIndex>{64, 8, 3, 3});
+  TensorCPU b(vector<TIndex>{64});
+  TensorCPU Y(vector<TIndex>{16, 64, 30, 30});
 
   float* data = X.mutable_data<float>();
   for (int i = 0; i < X.size(); ++i) {
@@ -56,7 +56,7 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
   // Test if the resource wrapper works.
   MKLMemory<float> X_wrapper(X.dims(), primitive, dnnResourceSrc);
   X_wrapper.CopyFrom(X);
-  Tensor X_recover(X.dims(), CPU);
+  TensorCPU X_recover(X.dims());
   X_wrapper.CopyTo(&X_recover);
   const float* recover_data = X_recover.data<float>();
   for (int i = 0; i < X_recover.size(); ++i) {
@@ -93,7 +93,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
   // layout?). Test both cases.
   vector<vector<TIndex>> dims_list{{10, 3, 20, 20}, {0}, {0, 10}};
   for (const auto& dims : dims_list) {
-    auto X_cpu_in = caffe2::make_unique<Tensor>(dims, CPU);
+    auto X_cpu_in = caffe2::make_unique<TensorCPU>(dims);
     CPUContext ctx;
     math::RandUniform<float, CPUContext>(
         X_cpu_in->size(),
@@ -117,7 +117,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
     EXPECT_EQ(X_mkl1->size(), X_cpu_in->size());
 
     // CPU <- MKL1
-    auto X_cpu_out = caffe2::make_unique<Tensor>(CPU);
+    auto X_cpu_out = caffe2::make_unique<TensorCPU>();
     X_mkl1->CopyTo(X_cpu_out.get());
     EXPECT_EQ(X_cpu_out->dims(), dims);
     EXPECT_EQ(X_cpu_out->size(), X_cpu_in->size());
diff --git a/caffe2/mkl/operators/conv_op.cc b/caffe2/mkl/operators/conv_op.cc
index 2678f4c37e17a..71618316cbec6 100644
--- a/caffe2/mkl/operators/conv_op.cc
+++ b/caffe2/mkl/operators/conv_op.cc
@@ -31,7 +31,7 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
 
     const int M = filter.dim32(0);
     if (InputSize() == 2 && !zero_bias_) {
-      Tensor cpu_zero_bias{CPU};
+      TensorCPU cpu_zero_bias;
       cpu_zero_bias.Resize(M);
       CPUContext ctx;
       math::Set<T, CPUContext>(
@@ -72,8 +72,8 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
       size_t bdata_sizes[4] = {W, H, C, N};
       // We will utilize the SetOutputSize() function int he base class
       // with dummy TensorCPU input and output to calculate the sizes.
-      Tensor dummy_input(X.dims(), CPU);
-      Tensor dummy_output(CPU);
+      TensorCPU dummy_input(X.dims());
+      TensorCPU dummy_output;
       ConvPoolOpBase<MKLContext>::SetOutputSize(
           dummy_input, &dummy_output, M);
       size_t tdata_sizes[4] = {
diff --git a/caffe2/mkl/operators/conv_op_mkldnn.cc b/caffe2/mkl/operators/conv_op_mkldnn.cc
index 80edf1332d063..0e363863bc435 100644
--- a/caffe2/mkl/operators/conv_op_mkldnn.cc
+++ b/caffe2/mkl/operators/conv_op_mkldnn.cc
@@ -28,7 +28,7 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
     auto& X = Input(INPUT);
     auto& filter = Input(FILTER);
     auto& bias = Input(BIAS);
-    Tensor* Y = Output(0);
+    TensorCPU* Y = Output(0);
     CAFFE_ENFORCE(4 == X.ndim());
     const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
     CAFFE_ENFORCE(4 == filter.ndim());
diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h
index 456a96d71fdf8..cc90bc16c0836 100644
--- a/caffe2/mkl/operators/operator_fallback_mkl.h
+++ b/caffe2/mkl/operators/operator_fallback_mkl.h
@@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator<MKLContext> {
     for (int i = 0; i < InputSize(); ++i) {
       if (OperatorBase::InputIsType<MKLMemory<float>>(i)) {
         OperatorBase::Input<MKLMemory<float>>(i).CopyTo(
-            local_input_blobs_[i]->GetMutableTensor(CPU));
+            local_input_blobs_[i]->template GetMutable<TensorCPU>());
       } else if (OperatorBase::InputIsType<MKLMemory<double>>(i)) {
         OperatorBase::Input<MKLMemory<double>>(i).CopyTo(
-            local_input_blobs_[i]->GetMutableTensor(CPU));
+            local_input_blobs_[i]->template GetMutable<TensorCPU>());
       } else {
         VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy.";
         // Note(jiayq): This removes a const but conceptually
diff --git a/caffe2/mkl/operators/packed_fc_op.cc b/caffe2/mkl/operators/packed_fc_op.cc
index 1f3231dc521f5..d24bed7b3dbc7 100644
--- a/caffe2/mkl/operators/packed_fc_op.cc
+++ b/caffe2/mkl/operators/packed_fc_op.cc
@@ -49,7 +49,7 @@ class PackedFCOp final : public Operator<CPUContext> {
 
     // Check out what is the passed in format.
     const MKLPackedMatrix* packed_matrix = nullptr;
-    if (OperatorBase::InputIsType<Tensor>(1, CPU)) {
+    if (OperatorBase::InputIsType<TensorCPU>(1)) {
       const auto& W = Input(1);
       CAFFE_ENFORCE_EQ(W.ndim(), 2);
       CAFFE_ENFORCE_EQ(W.dim32(0), N);
@@ -142,7 +142,7 @@ class PackedFCOp final : public Operator<CPUContext> {
   size_t axis_{1};
   uint32_t hash_{0};
   vector<TIndex> Y_shape_cache_;
-  Tensor bias_multiplier_{CPU};
+  Tensor<CPUContext> bias_multiplier_;
   std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
 };
 
diff --git a/caffe2/mkl/operators/pool_op.cc b/caffe2/mkl/operators/pool_op.cc
index 284e7f80b8c37..434fad2f46b37 100644
--- a/caffe2/mkl/operators/pool_op.cc
+++ b/caffe2/mkl/operators/pool_op.cc
@@ -61,8 +61,8 @@ bool MKLPoolOp<float>::RunOnDeviceWithOrderNCHW() {
   if (dims_changed || FLAGS_caffe2_mkl_memonger_in_use) {
     // We will utilize the SetOutputSize() function in the base class
     // with dummy TensorCPU input and output to calculate the sizes.
-    Tensor dummy_input(X.dims(), CPU);
-    Tensor dummy_output(CPU);
+    TensorCPU dummy_input(X.dims());
+    TensorCPU dummy_output;
 
     ConvPoolOpBase<MKLContext>::SetOutputSize(
         dummy_input, &dummy_output, X.dim32(1));
diff --git a/caffe2/mkl/operators/utility_ops.cc b/caffe2/mkl/operators/utility_ops.cc
index 386bbbc5ee18f..969450c7c117e 100644
--- a/caffe2/mkl/operators/utility_ops.cc
+++ b/caffe2/mkl/operators/utility_ops.cc
@@ -10,7 +10,7 @@ class CopyCPUToMKLOp final : public MKLOperator<float> {
  public:
   using MKLOperator<float>::MKLOperator;
   bool RunOnDevice() override {
-    const auto& X = OperatorBase::Input<Tensor>(0, CPU);
+    const auto& X = OperatorBase::Input<TensorCPU>(0);
     auto* Y = OperatorBase::OutputBlob(0);
     if (!Y->template IsType<MKLMemory<float>>() ||
         Y->Get<MKLMemory<float>>().dims() != X.dims()) {
@@ -27,7 +27,7 @@ class CopyMKLToCPUOp final : public MKLOperator<float> {
 
   bool RunOnDevice() override {
     const auto& X = OperatorBase::Input<MKLMemory<float>>(0);
-    auto* Y = OperatorBase::Output<Tensor>(0, CPU);
+    auto* Y = OperatorBase::Output<TensorCPU>(0);
     X.CopyTo(Y);
     return true;
   }
diff --git a/caffe2/mkl/utils/mkl_context.cc b/caffe2/mkl/utils/mkl_context.cc
index 6e9075df43475..e13b3ada86fa4 100644
--- a/caffe2/mkl/utils/mkl_context.cc
+++ b/caffe2/mkl/utils/mkl_context.cc
@@ -1,6 +1,5 @@
 // #include "caffe2/mkl/utils/mkl_context.h"
 
-#include "mkl_context.h"
 #include "caffe2/core/event_cpu.h"
 
 namespace caffe2 {
@@ -19,11 +18,4 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(MKLDNN, EventErrorMessageCPU);
 REGISTER_EVENT_SET_FINISHED_FUNCTION(MKLDNN, EventSetFinishedCPU);
 REGISTER_EVENT_RESET_FUNCTION(MKLDNN, EventResetCPU);
 
-BaseStaticContext* GetMKLStaticContext() {
-  static MKLStaticContext context;
-  return &context;
-}
-
-REGISTER_STATIC_CONTEXT(MKLDNN, GetMKLStaticContext());
-
 } // namespace caffe2
diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h
index 6181a91dda35d..b876894746af0 100644
--- a/caffe2/mkl/utils/mkl_context.h
+++ b/caffe2/mkl/utils/mkl_context.h
@@ -6,12 +6,9 @@
 #include <random>
 
 #include "caffe2/core/context.h"
-#include "caffe2/core/context_base.h"
 
 namespace caffe2 {
 
-BaseStaticContext* GetMKLStaticContext();
-
 /**
  * The MKL Context, which is largely the same as the CPUContext. We instantiate
  * this mainly in order to have a first-class MKL device.
@@ -20,7 +17,7 @@ BaseStaticContext* GetMKLStaticContext();
  * operators to mainly perform input and output via MKLMemory. As a result,
  * most likely MKLContext::New and ::Delete won't be used as often.
  */
-class MKLContext : public BaseContext {
+class MKLContext final {
  public:
   MKLContext() : random_seed_(RandomNumberSeed()) {}
   explicit MKLContext(const DeviceOption& option)
@@ -30,28 +27,20 @@ class MKLContext : public BaseContext {
     CAFFE_ENFORCE_EQ(option.device_type(), MKLDNN);
   }
 
-  ~MKLContext() override {}
-
-  BaseStaticContext* GetStaticContext() const override {
-    return GetMKLStaticContext();
-  }
-
-  static BaseStaticContext* StaticContext() {
-    return GetMKLStaticContext();
-  }
+  ~MKLContext() {}
 
-  inline void SwitchToDevice(int /*stream_id*/ = 0) override {}
+  inline void SwitchToDevice(int /*stream_id*/ = 0) {}
 
-  inline void WaitEvent(const Event& ev) override {
+  inline void WaitEvent(const Event& ev) {
     ev.Wait(MKLDNN, this);
   }
 
-  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const {
     CAFFE_ENFORCE(ev, "Event must not be null.");
     ev->Record(MKLDNN, this, err_msg);
   }
 
-  inline void FinishDeviceComputation() override {}
+  inline void FinishDeviceComputation() {}
 
   inline std::mt19937& RandGenerator() {
     if (!random_generator_.get()) {
@@ -61,32 +50,7 @@ class MKLContext : public BaseContext {
   }
 
   inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return StaticContext()->New(nbytes);
-  }
-
-  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
-      override {
-    if (nbytes == 0) {
-      return;
-    }
-    CAFFE_ENFORCE(src);
-    CAFFE_ENFORCE(dst);
-    memcpy(dst, src, nbytes);
-  }
-
-  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst)
-      override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  bool SupportsNonFundamentalTypes() const override {
-    // MKL meta copy is OK
-    return true;
+    return GetCPUAllocator()->New(nbytes);
   }
 
   // Two copy functions that deals with cross-device copies.
@@ -126,18 +90,10 @@ class MKLContext : public BaseContext {
     return false;
   }
 
-  static bool IsStreamFree(const DeviceOption& option, int stream_id) {
+  static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
     return true;
   }
 
-  DeviceType GetDevicetype() const override {
-    return MKLDNN;
-  }
-
-  static constexpr DeviceType GetDeviceType() {
-    return MKLDNN;
-  }
-
  protected:
   // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
   int random_seed_{1701};
@@ -152,26 +108,21 @@ inline void MKLContext::CopyBytes<MKLContext, MKLContext>(
   memcpy(dst, src, nbytes);
 }
 
-class MKLStaticContext : public BaseStaticContext {
- public:
-  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
-    return GetCPUAllocator()->New(nbytes);
-  }
-
-  std::unique_ptr<BaseContext> CreateContext() override {
-    return caffe2::make_unique<MKLContext>();
-  }
-
-  std::unique_ptr<BaseContext> CreateContext(
-      const DeviceOption& option) override {
-    return caffe2::make_unique<MKLContext>(option);
-  }
-
-  DeviceType GetDeviceType() override {
-    return MKLDNN;
-  }
-};
+template <>
+inline void MKLContext::CopyBytes<CPUContext, MKLContext>(
+    size_t nbytes,
+    const void* src,
+    void* dst) {
+  memcpy(dst, src, nbytes);
+}
 
+template <>
+inline void MKLContext::CopyBytes<MKLContext, CPUContext>(
+    size_t nbytes,
+    const void* src,
+    void* dst) {
+  memcpy(dst, src, nbytes);
+}
 } // namespace caffe2
 
 #endif // CAFFE2_UTILS_MKL_CONTEXT_H_
diff --git a/caffe2/mobile/contrib/CMakeLists.txt b/caffe2/mobile/contrib/CMakeLists.txt
index e49c2ef129c32..29a35812bc4ec 100644
--- a/caffe2/mobile/contrib/CMakeLists.txt
+++ b/caffe2/mobile/contrib/CMakeLists.txt
@@ -1,10 +1,7 @@
 add_subdirectory(ios)
-# [FIX later or remove] opengl code will be broken because of tensor refactoring, remove this from CI to unblock
-if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
-  # add_subdirectory(opengl)
-endif()
+add_subdirectory(opengl)
 if (USE_ACL)
-  # add_subdirectory(arm-compute)
+  add_subdirectory(arm-compute)
 endif()
 # Finally pass the src lists back to the parent
 
@@ -20,4 +17,4 @@ set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
 # GPU source, test sources, binary sources
 set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
 set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
-set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
+set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
\ No newline at end of file
diff --git a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
index 111af03f8602b..56c95237e923d 100644
--- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
+++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
@@ -43,7 +43,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
   if (first_run_) {
     first_run_ = false;
     for (int i = 0; i < Inputs().size(); ++i) {
-      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
+      auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
       Y->Resize(inputs_[i]->dims());
       Y->template mutable_data<float>();
     }
@@ -54,7 +54,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
       // GLTensor
       auto* X = inputs_[i].get();
       X->lazy_allocate(Xblob, second_run_, true);
-      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
+      auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
       Timer timer;
       timer.Start();
       getTensorCPU(*X, *Y);
diff --git a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
index 50b457c7ba86d..fc53479088443 100644
--- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
+++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
@@ -27,7 +27,7 @@ template<typename T = float>
 void PopulateCPUBlob(Workspace *ws, bool random, std::string name,
                      std::vector<int> dims, int val = 1, int dist_shift = 0, float variance = 1) {
   Blob *blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto *tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(dims);
   T *t_data = tensor->mutable_data<T>();
   std::random_device rd;
diff --git a/caffe2/mobile/contrib/ios/ios_caffe.cc b/caffe2/mobile/contrib/ios/ios_caffe.cc
index 0ac1131350b61..12e0e5598c6aa 100644
--- a/caffe2/mobile/contrib/ios/ios_caffe.cc
+++ b/caffe2/mobile/contrib/ios/ios_caffe.cc
@@ -41,7 +41,7 @@ void GenerateStylizedImage(std::vector<float>& originalImage,
   caffe2::Predictor p(init_net, predict_net);
 
   std::vector<int> dims({1, 3, height, width});
-  caffe2::Tensor input(caffe2::CPU);
+  caffe2::TensorCPU input;
   input.Resize(dims);
   input.ShareExternalPointer(originalImage.data());
   caffe2::Predictor::TensorVector input_vec{&input};
diff --git a/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc b/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
index d4207691290d5..d497c9b7b7047 100644
--- a/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
+++ b/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
@@ -50,7 +50,7 @@ Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
 
 void Caffe2IOSPredictor::run(const Tensor& inData, Tensor& outData, std::string& errorMessage) {
   caffe2::FLAGS_caffe2_force_shared_col_buffer = true;
-  caffe2::Tensor input(caffe2::CPU);
+  caffe2::TensorCPU input;
   input.Resize(inData.dims);
   input.ShareExternalPointer(inData.data);
   caffe2::Predictor::TensorVector input_vec{&input};
diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
index 45f55ab2407a2..d7842eaaa6bdb 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
@@ -256,9 +256,9 @@ void computeOutputHW(
     int W,
     int* OH,
     int* OW) {
-  Tensor input(CPU), output(CPU);
+  Tensor<CPUContext> input, output;
   input.Resize(1, 1, H, W);
-  op->SetOutputSize(input, &output, 1);
+  op->SetOutputSize<CPUContext>(input, &output, 1);
   CAFFE_ENFORCE_EQ(output.ndim(), 4);
   *OH = output.dim(2);
   *OW = output.dim(3);
@@ -495,7 +495,7 @@ bool RunOnDevice() override {
       caffe2::Timer rt;
       // Initialize random noise on first use.
       // Cache it to maintain temporal consistency.
-      auto* t = noiseBlob->GetMutableTensor(CPU);
+      auto* t = noiseBlob->template GetMutable<TensorCPU>();
       t->Resize(noiseSize);
       math::RandGaussian<float, CPUContext>(
           t->size(),
diff --git a/caffe2/mobile/contrib/ios/pool_test.cc b/caffe2/mobile/contrib/ios/pool_test.cc
index c4f6ff4d6a3a4..e6d9daa0e70dc 100644
--- a/caffe2/mobile/contrib/ios/pool_test.cc
+++ b/caffe2/mobile/contrib/ios/pool_test.cc
@@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/mobile/contrib/ios/resize_test.cc b/caffe2/mobile/contrib/ios/resize_test.cc
index 90e672397b821..5a14f4606635d 100644
--- a/caffe2/mobile/contrib/ios/resize_test.cc
+++ b/caffe2/mobile/contrib/ios/resize_test.cc
@@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/mobile/contrib/nnapi/nnapi.cc b/caffe2/mobile/contrib/nnapi/nnapi.cc
index 45ea26c44cc96..3f05149c70454 100644
--- a/caffe2/mobile/contrib/nnapi/nnapi.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi.cc
@@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) {
         output_dims.push_back(dim);
       }
 
-      auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU);
+      auto* tensor = ws_.CreateBlob(blob)->GetMutable<TensorCPU>();
       tensor->Resize(output_dims);
       outputs->push_back(tensor);
 
diff --git a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
index 359e7767746b6..db0e867aa07ce 100644
--- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
@@ -43,14 +43,14 @@ static double benchmark_conv_caffe2(
     ws = &localWs;
   }
   {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
     if (group == 1) {
       t->Resize(K, C, kernel, kernel);
     } else {
@@ -61,7 +61,7 @@ static double benchmark_conv_caffe2(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
     t->Resize(K);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -129,14 +129,14 @@ static double benchmark_conv_nnapi(
     ws = &localWs;
   }
   {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
     if (group > 1) {
       CAFFE_ENFORCE_EQ(C, group);
       t->Resize(1, kernel, kernel, C);
@@ -148,7 +148,7 @@ static double benchmark_conv_nnapi(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
     t->Resize(K);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -190,7 +190,7 @@ static double benchmark_conv_nnapi(
   NetDef initNet;
   NNApi model(initNet, netdef, ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
   CAFFE_ENFORCE(model.run(inputs, &outputs));
 
   for (int i = 0; i < warmup; i++) {
@@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8(
     ws = &localWs;
   }
   {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, H, W, C);
     for (int i = 0; i < t->size(); i++) {
       t->mutable_data<uint8_t>()[i] = rand() % 10;
     }
   }
   {
-    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
     if (group > 1) {
       CAFFE_ENFORCE_EQ(C, group);
       t->Resize(1, kernel, kernel, C);
@@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8(
   // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
   // bias_scale == input_scale * filter_scale.
   {
-    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
     t->Resize(K);
     for (int i = 0; i < t->size(); i++) {
       t->mutable_data<int32_t>()[i] = rand() % 10;
@@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8(
   NetDef initNet;
   NNApi model(initNet, netdef, ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
   CAFFE_ENFORCE(model.run(inputs, &outputs));
 
   for (int i = 0; i < warmup; i++) {
diff --git a/caffe2/mobile/contrib/nnapi/nnapi_test.cc b/caffe2/mobile/contrib/nnapi/nnapi_test.cc
index deab1ca7b43f7..76278c8ef8fb8 100644
--- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc
@@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) {
   // CPU reference
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) {
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -103,21 +103,21 @@ static void test_conv_NHWC(
     int stride_w) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     t->Resize(K, kernel, kernel, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
     t->Resize(K);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -189,7 +189,7 @@ static void test_conv_NHWC(
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC(
     int stride_w) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     t->Resize(1, kernel, kernel, D);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
-    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
     t->Resize(D);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC(
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -428,7 +428,7 @@ static void test_pooling(
     int stride_w) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, H, W, C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
@@ -496,7 +496,7 @@ static void test_pooling(
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
@@ -506,7 +506,7 @@ static void test_pooling(
 static void test_softmax(int N, int C, int H = 1, int W = 1) {
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     if (H == 1 && W == 1) {
       t->Resize(N, C);
     } else {
@@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) {
   NetDef initNet;
   NNApi model(initNet, netdef, &ws);
   std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
   EXPECT_TRUE(model.run(inputs, &outputs));
   const auto& t_nn = *outputs[0];
 
diff --git a/caffe2/mobile/contrib/opengl/CMakeLists.txt b/caffe2/mobile/contrib/opengl/CMakeLists.txt
index 6d116253f71d5..f23de75d153a1 100644
--- a/caffe2/mobile/contrib/opengl/CMakeLists.txt
+++ b/caffe2/mobile/contrib/opengl/CMakeLists.txt
@@ -1,11 +1,14 @@
-add_subdirectory(core)
-add_subdirectory(operators)
+if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
+  add_subdirectory(core)
+  add_subdirectory(operators)
 
-if (ANDROID)
-  add_subdirectory(android)
-endif()
+  if (ANDROID)
+    add_subdirectory(android)
+  endif()
 
-if (IOS)
-  add_subdirectory(ios)
+  if (IOS)
+    add_subdirectory(ios)
+  endif()
 endif()
+
 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
index 49a875184c10d..f9ede815f5a99 100644
--- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc
+++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
@@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1,
   LOG(INFO) << "OPENGLCopyFrom/To Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -275,7 +275,7 @@ void testOpenGLConv(int N,
             << " Op: " << glPoolOperationName[poolOp];
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     if (random_input) {
@@ -301,7 +301,7 @@ void testOpenGLConv(int N,
   }
 
   if (poolOp != AveragePool && poolOp != MaxPool) {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) {
       t->Resize(K, C, kernel_h, kernel_w);
     } else {
@@ -343,7 +343,7 @@ void testOpenGLConv(int N,
 
     // bias
     {
-      auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+      auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
       t->Resize(K);
       CPUContext ctx;
       if (random_input) {
@@ -367,7 +367,7 @@ void testOpenGLConv(int N,
   }
 
   if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
-    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
     t->Resize(K);
     CPUContext ctx;
     if (random_input) {
@@ -532,7 +532,7 @@ void testOpenGLPRelu(
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -541,7 +541,7 @@ void testOpenGLPRelu(
 
   // prelu scale
   {
-    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
     t->Resize(prelu_size);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
+    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
     t0->Resize(N, C, H, W);
     CPUContext ctx0;
     // Too noisy.
     math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
 
-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
+    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
     t1->Resize(N, C, H, W);
     CPUContext ctx1;
     // Too noisy.
@@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
 
   Workspace ws;
   {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
+    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
     t0->Resize(N, C, H, W);
     CPUContext ctx0;
     // Too noisy.
     math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
 
-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
+    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
     t1->Resize(N, C, H, W);
     CPUContext ctx1;
     // Too noisy.
@@ -814,8 +814,7 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
             << "H: " << H << ", W: " << W;
   Workspace ws;
   for (int i = 0; i < Cs.size(); i++) {
-    auto* t =
-        ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutable<TensorCPU>();
     t->Resize(N, Cs[i], H, W);
     CPUContext ctx0;
     // Too noisy.
@@ -891,7 +890,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -942,7 +941,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
@@ -992,14 +991,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
   }
 
   {
-    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
     t->Resize(1);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
@@ -1060,7 +1059,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) {
   LOG(INFO) << "OpenGL Softmax Test "
             << "N: " << N << " D: " << D << " Tiled:" << tiled;
   Workspace ws;
-  auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+  auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
   {
     t->Resize(N, D);
     CPUContext ctx;
@@ -1151,7 +1150,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -1163,7 +1162,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
 
   // scale
   {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1172,7 +1171,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
   }
   // bias
   {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1254,7 +1253,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
             << "C: " << C << ", H: " << H << ", W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     // Too noisy.
@@ -1266,7 +1265,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
 
   // scale
   {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1275,7 +1274,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
   }
   // bias
   {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
     t->Resize(C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1284,7 +1283,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
   }
   // prelu scale
   {
-    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
     t->Resize(C);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -1385,7 +1384,7 @@ void OpenGL_speedtest(int N,
             << " C: " << C << " H: " << H << " W: " << W;
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     if (random_input) {
@@ -1399,7 +1398,7 @@ void OpenGL_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     t->Resize(K, C, kernel_h, kernel_w);
     CPUContext ctx;
     if (random_input) {
@@ -1413,7 +1412,7 @@ void OpenGL_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
     t->Resize(K);
     CPUContext ctx;
     if (random_input) {
@@ -1479,7 +1478,7 @@ void testOpenGLPadImage(
   {
     Workspace ws;
     {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
       t->Resize(N, C, H, W);
       CPUContext ctx;
       math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -1593,7 +1592,7 @@ void testOpenGLResize(int N,
   {
     Workspace ws;
     {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
       t->Resize(N, C, H, W);
       CPUContext ctx;
       math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@@ -1675,7 +1674,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
   LOG(INFO) << "OpenGL Preprocess Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, H, W, C);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1684,7 +1683,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
     t->Resize(3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 100;
@@ -1748,7 +1747,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
   LOG(INFO) << "OpenGLDeprocess Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1757,7 +1756,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
     t->Resize(3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 30;
@@ -1800,7 +1799,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
   LOG(INFO) << "OpenGLNormPlanarYUV Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, 3, H, W);
     CPUContext ctx;
     for (auto i = 0; i < t->size(); ++i) {
@@ -1809,7 +1808,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
     t->Resize(1, 3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 30;
@@ -1818,7 +1817,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
   }
 
   {
-    auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("stdev")->GetMutable<TensorCPU>();
     t->Resize(1, 3);
     CPUContext ctx;
     t->mutable_data<float>()[0] = 6;
@@ -1879,7 +1878,7 @@ void OpenGL_copyops_speedtest(int N,
   LOG(INFO) << "OpenGL CopyOps Speed Test";
   Workspace ws;
   {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
     t->Resize(N, C, H, W);
     CPUContext ctx;
     if (random_input) {
@@ -1893,7 +1892,7 @@ void OpenGL_copyops_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     t->Resize(K, C, kernel_h, kernel_w);
     CPUContext ctx;
     if (random_input) {
@@ -1907,7 +1906,7 @@ void OpenGL_copyops_speedtest(int N,
   }
 
   {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
     t->Resize(K);
     CPUContext ctx;
     if (random_input) {
@@ -1990,8 +1989,7 @@ void compareModelsForOpenGL(std::string name,
     Workspace cws;
     cws.RunNetOnce(initNet);
 
-    auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))
-                      ->GetMutableTensor(CPU);
+    auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))->GetMutable<TensorCPU>();
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@@ -2032,8 +2030,8 @@ void compareModelsForOpenGL(std::string name,
     Workspace mws;
     mws.RunNetOnce(initNet);
 
-    auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))
-                     ->GetMutableTensor(CPU);
+    auto* t_gl =
+        mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))->GetMutable<TensorCPU>();
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@@ -2115,8 +2113,7 @@ void compareBatchedToTiledModels(std::string name,
     Workspace tws;
     tws.RunNetOnce(initNet);
 
-    auto* t_batch =
-        tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU);
+    auto* t_batch = tws.CreateBlob(bachedNet.external_input(0))->GetMutable<TensorCPU>();
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@@ -2142,8 +2139,7 @@ void compareBatchedToTiledModels(std::string name,
     Workspace bws;
     bws.RunNetOnce(initNet);
 
-    auto* t_tiling =
-        bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU);
+    auto* t_tiling = bws.CreateBlob(tiledNet.external_input(0))->GetMutable<TensorCPU>();
     if (name == "styleTransfer") {
       CAFFE_ENFORCE_EQ(input_order, "NHWC");
       CAFFE_ENFORCE_EQ(input_type, "uint8_t");
diff --git a/caffe2/mobile/contrib/snpe/snpe_op.cc b/caffe2/mobile/contrib/snpe/snpe_op.cc
index db8a95fe8395a..fa015960183d2 100644
--- a/caffe2/mobile/contrib/snpe/snpe_op.cc
+++ b/caffe2/mobile/contrib/snpe/snpe_op.cc
@@ -111,8 +111,7 @@ class SNPEOp final : public Operator<CPUContext> {
     X(snpe_copy_output_to);
     snpe_copy_output_to_f(ctx_.get(), Output(0)->mutable_data<float>());
 
-    CAFFE_ENFORCE(
-        Output(0)->data<float>(), "nullptr where output should be!\n");
+    CAFFE_ENFORCE(Output(0)->data<float>(), "nullptr where output should be!\n");
     return true;
   }
 
diff --git a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
index 1bbe303ef777d..58e3ccbb8a7b6 100644
--- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
+++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
@@ -11,22 +11,20 @@
 #if TEST_REAL_DATA
 #include "data_chw.h"
 #include "data_hwc.h"
-#define POPULATE_DATA(_n, _s, _l)                                         \
-  do {                                                                    \
-    Blob* _blob = ws.CreateBlob((_n));                                    \
-    auto* _tensor = _blob->GetMutableTensor(CPU);                         \
-    _tensor->Resize((_s));                                                \
-    memcpy(_tensor->mutable_data<float>(), data_##_l, _tensor->nbytes()); \
-  } while (0)
+#define POPULATE_DATA(_n, _s, _l) do {\
+  Blob* _blob = ws.CreateBlob((_n));\
+  auto* _tensor = _blob->GetMutable<TensorCPU>();\
+  _tensor->Resize((_s));\
+  memcpy(_tensor->mutable_data<float>(), data_##_l, _tensor->nbytes());\
+} while(0)
 #else
 // Rough test on static data
-#define POPULATE_DATA(_n, _s, _l)                                 \
-  do {                                                            \
-    Blob* _blob = ws.CreateBlob((_n));                            \
-    auto* _tensor = _blob->GetMutableTensor(CPU);                 \
-    _tensor->Resize((_s));                                        \
-    memset(_tensor->mutable_data<float>(), 1, _tensor->nbytes()); \
-  } while (0)
+#define POPULATE_DATA(_n, _s, _l) do {\
+  Blob* _blob = ws.CreateBlob((_n));\
+  auto* _tensor = _blob->GetMutable<TensorCPU>();\
+  _tensor->Resize((_s));\
+  memset(_tensor->mutable_data<float>(), 1, _tensor->nbytes());\
+} while(0)
 #endif
 
 #include <cmath>
@@ -43,7 +41,7 @@ void AddConstInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
   math::Set<float, CPUContext>(tensor->size(), value,
                                tensor->mutable_data<float>(),
@@ -56,7 +54,7 @@ void AddNoiseInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
@@ -73,7 +71,7 @@ float snpe_run(int iters, Workspace& ws) {
   const int C = 3;
 
   POPULATE_DATA("X_snpe", (caffe2::vector<caffe2::TIndex>{H, W, C}), hwc);
-
+  
   OperatorDef def;
   def.set_name("snpe_test");
   def.set_type("SNPE");
@@ -178,7 +176,7 @@ int main(int argc, char** argv) {
   float avg_diff = total_diff; // Avg difference as percentage (not a great metric)
   printf("Average difference is %f%%\n", avg_diff * 100);
   printf("JS Divergence is %f\n", JS_divergence); // Jensen-Shannon
-  printf("KL Divergence is %f\n", KL_divergence); // Kullback-Leibler
+  printf("KL Divergence is %f\n", KL_divergence); // Kullback–Leibler
   printf("Predicted %d with %f%% confidence\n", max_index, max * 100);
 
   printf ("Caffe2: %f microseconds.\n", t_caffe2);
diff --git a/caffe2/mobile/contrib/ulp2/ulp.cc b/caffe2/mobile/contrib/ulp2/ulp.cc
index 3acd17281fefa..1d8e0e8fe69a5 100644
--- a/caffe2/mobile/contrib/ulp2/ulp.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp.cc
@@ -261,14 +261,14 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
   state->XQs.resize(k2b1bXBits);
   state->YQs.resize(k2b1bXBits);
   for (auto i = 0; i < k2b1bXBits; ++i) {
-    state->XQs[i] = caffe2::make_unique<Tensor>(CPU);
-    state->YQs[i] = caffe2::make_unique<Tensor>(CPU);
+    state->XQs[i] = caffe2::make_unique<TensorCPU>();
+    state->YQs[i] = caffe2::make_unique<TensorCPU>();
   }
-  state->WQ = caffe2::make_unique<Tensor>(CPU);
-  state->WQN = caffe2::make_unique<Tensor>(CPU);
-  state->WQL1Norm = caffe2::make_unique<Tensor>(CPU);
-  state->scratch = caffe2::make_unique<Tensor>(CPU);
-  state->scratchColBuffer = caffe2::make_unique<Tensor>(CPU);
+  state->WQ = caffe2::make_unique<TensorCPU>();
+  state->WQN = caffe2::make_unique<TensorCPU>();
+  state->WQL1Norm = caffe2::make_unique<TensorCPU>();
+  state->scratch = caffe2::make_unique<TensorCPU>();
+  state->scratchColBuffer = caffe2::make_unique<TensorCPU>();
 
   signQuantize(W, state->WQ.get());
   filterNormalization11(*(state->WQ), state->WQN.get());
@@ -290,7 +290,7 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
   };
   if (b) {
     CPUContext context;
-    state->bias = caffe2::make_unique<Tensor>(*b, &context, CPU);
+    state->bias = caffe2::make_unique<TensorCPU>(*b, &context);
   }
   return state;
 }
diff --git a/caffe2/mobile/contrib/ulp2/ulp_neon.cc b/caffe2/mobile/contrib/ulp2/ulp_neon.cc
index c7a4450e7ba31..15ad59a47916e 100644
--- a/caffe2/mobile/contrib/ulp2/ulp_neon.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp_neon.cc
@@ -438,7 +438,7 @@ void run2b1bConvIm2ColGEMM(QConvState* state,
   const size_t QK = KH * KW * divRoundUp(X.dim32(3), 8);
   Y->Resize(X.dim32(0), OH, OW, OC);
   if (!state->WQPacked) {
-    state->WQPacked = caffe2::make_unique<Tensor>(CPU);
+    state->WQPacked = caffe2::make_unique<TensorCPU>();
     qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(state, *(state->WQ), 1, state->WQPacked.get());
     CAFFE_ENFORCE_EQ(state->WQPacked->dim32(0), divRoundUp(OC, kGEMMTileSize));
     CAFFE_ENFORCE_EQ(state->WQPacked->dim32(1), divRoundUp(QK, kGEMMTileDepthBytes));
diff --git a/caffe2/mobile/contrib/ulp2/ulp_test.cc b/caffe2/mobile/contrib/ulp2/ulp_test.cc
index f6705e638ddac..58bc5e7132836 100644
--- a/caffe2/mobile/contrib/ulp2/ulp_test.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc
@@ -63,7 +63,7 @@ int randInt(int a, int b) {
 }
 
 TensorCPU genTensor11(std::vector<TIndex> shape) {
-  Tensor r(CPU);
+  TensorCPU r;
   r.Resize(shape);
 
   std::random_device rd;
@@ -77,7 +77,7 @@ TensorCPU genTensor11(std::vector<TIndex> shape) {
 }
 
 TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
-  Tensor r(CPU);
+  TensorCPU r;
   r.Resize(shape);
 
   std::random_device rd;
@@ -91,7 +91,7 @@ TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
 }
 
 TensorCPU genTensor0123(std::vector<TIndex> shape) {
-  Tensor r(CPU);
+  TensorCPU r;
   r.Resize(shape);
 
   std::random_device rd;
@@ -114,7 +114,7 @@ TEST(ULP, QPadZero) {
   const auto ICQ = 1;
 
   auto X = genTensor11({1, 10, 10, ICQ * 8});
-  Tensor XQ(CPU), XQPad(CPU);
+  TensorCPU XQ, XQPad;
   signQuantize(X, &XQ);
   qpad_zero(args, XQ, &XQPad);
 
@@ -174,7 +174,7 @@ inline void qgemmNT(int M, int N, int K, const uint8_t* A, const uint8_t* B, flo
 void gemmTest(TIndex M, TIndex N, TIndex K) {
   auto X = genTensor11({M, K});
   auto W = genTensor11({N, K});
-  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
+  TensorCPU XQ, WQ, YQ, Y;
   {
     signQuantize(X, &XQ);
     signQuantize(W, &WQ);
@@ -207,7 +207,7 @@ TEST(QConv, ConvTest) {
   int K = 3;
   auto X = genTensor11({1, S, S, IC});
   auto W = genTensor11({OC, K, K, IC});
-  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
+  TensorCPU XQ, WQ, YQ, Y;
   {
     signQuantize(X, &XQ);
     signQuantize(W, &WQ);
@@ -235,16 +235,16 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
   auto X = genTensor0123({N, H, W, IC});
   auto W_ = genTensor11({OC, KH, KW, IC});
   auto bias = genTensorUniform11({OC});
-  Tensor Y(CPU), YQ(CPU), Y2b1b(CPU), YOP(CPU);
+  TensorCPU Y, YQ, Y2b1b, YOP;
 
   {
     std::vector<std::unique_ptr<TensorCPU>> XQs(k2b1bXBits);
     std::vector<std::unique_ptr<TensorCPU>> YQs(k2b1bXBits);
     for (auto i = 0; i < k2b1bXBits; ++i) {
-      XQs[i] = caffe2::make_unique<Tensor>(CPU);
-      YQs[i] = caffe2::make_unique<Tensor>(CPU);
+      XQs[i] = caffe2::make_unique<TensorCPU>();
+      YQs[i] = caffe2::make_unique<TensorCPU>();
     }
-    Tensor WQN(CPU), WQ(CPU);
+    TensorCPU WQN, WQ;
     uniformQuantize2b1b(X, XQs, 0.5, 1.0);
     signQuantize(W_, &WQ);
     filterNormalization11(WQ, &WQN);
@@ -289,17 +289,17 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
     def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r));
     def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t));
     def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b));
-    auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU);
+    auto* Xws = ws.CreateBlob("X")->GetMutable<TensorCPU>();
     Xws->ResizeLike(X);
     Xws->ShareExternalPointer(X.mutable_data<float>(), X.size());
-    auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* Wws = ws.CreateBlob("W")->GetMutable<TensorCPU>();
     Wws->ResizeLike(W_);
     Wws->ShareExternalPointer(W_.mutable_data<float>(), W_.size());
-    auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* bws = ws.CreateBlob("b")->GetMutable<TensorCPU>();
     bws->ResizeLike(bias);
     bws->ShareExternalPointer(bias.mutable_data<float>(), bias.size());
     ws.RunOperatorOnce(def);
-    YOP.CopyFrom(ws.GetBlob("Y")->Get<TensorCPU>());
+    YOP.CopyFrom<CPUContext>(ws.GetBlob("Y")->Get<TensorCPU>());
   }
 
   { conv(args, X, W_, &bias, &Y); }
diff --git a/caffe2/mpi/mpi_gpu_test.cc b/caffe2/mpi/mpi_gpu_test.cc
index d24521e0274dc..087a87575510b 100644
--- a/caffe2/mpi/mpi_gpu_test.cc
+++ b/caffe2/mpi/mpi_gpu_test.cc
@@ -55,6 +55,7 @@ TEST(MPITest, TestMPIBroadcast) {
   arg->set_f(rank);
   int size;
   MPI_Comm_size(MPI_COMM_WORLD, &size);
+
   for (int root = 0; root < size; ++root) {
     net_def.mutable_op(2)->mutable_arg(0)->set_i(root);
     Workspace ws;
@@ -62,8 +63,8 @@ TEST(MPITest, TestMPIBroadcast) {
     EXPECT_NE(nullptr, net.get());
     EXPECT_TRUE(net->Run());
     // Let's test the value.
-    auto& X = ws.GetBlob("X")->Get<Tensor>();
-    Tensor X_cpu(X, CPU);
+    auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
+    TensorCPU X_cpu(X);
     EXPECT_EQ(X.size(), 10);
     for (int i = 0; i < X.size(); ++i) {
       EXPECT_EQ(X_cpu.data<float>()[i], root);
@@ -132,7 +133,7 @@ TEST(MPITest, TestMPIReduce) {
       auto& X = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
       EXPECT_EQ(X.size(), 10);
       int expected_result = size * (size - 1) / 2;
-      Tensor X_cpu(X, CPU);
+      TensorCPU X_cpu(X);
       for (int i = 0; i < X.size(); ++i) {
         EXPECT_EQ(X_cpu.data<float>()[i], expected_result);
       }
@@ -189,7 +190,7 @@ TEST(MPITest, TestMPIAllgather) {
   EXPECT_TRUE(net->Run());
   // Let's test the value.
   auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
-  Tensor X_cpu(X, CPU);
+  TensorCPU X_cpu(X);
   EXPECT_EQ(X.size(), 20);
   for (int i = 0; i < X.size(); ++i) {
     EXPECT_EQ(X_cpu.data<float>()[i], rank);
@@ -198,7 +199,7 @@ TEST(MPITest, TestMPIAllgather) {
   EXPECT_EQ(X_gathered.size(), 20 * size);
   EXPECT_EQ(X_gathered.dim(0), 2 * size);
   EXPECT_EQ(X_gathered.dim(1), 10);
-  Tensor X_gathered_cpu(X_gathered, CPU);
+  TensorCPU X_gathered_cpu(X_gathered);
   for (int i = 0; i < X_gathered.size(); ++i) {
     EXPECT_EQ(X_gathered_cpu.data<float>()[i], i / 20);
   }
@@ -253,14 +254,14 @@ TEST(MPITest, TestMPIAllreduce) {
   // Let's test the value.
   auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
   EXPECT_EQ(X.size(), 10);
-  Tensor X_cpu(X, CPU);
+  TensorCPU X_cpu(X);
   for (int i = 0; i < X.size(); ++i) {
     EXPECT_EQ(X_cpu.data<float>()[i], rank);
   }
   auto& X_reduced = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
   EXPECT_EQ(X_reduced.size(), 10);
   int expected_result = size * (size - 1) / 2;
-  Tensor X_reduced_cpu(X_reduced, CPU);
+  TensorCPU X_reduced_cpu(X_reduced);
   for (int i = 0; i < X_reduced.size(); ++i) {
     EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
   }
@@ -315,7 +316,7 @@ TEST(MPITest, TestInPlaceMPIAllreduce) {
   auto& X_reduced = ws.GetBlob("X")->Get<TensorCUDA>();
   EXPECT_EQ(X_reduced.size(), 10);
   int expected_result = size * (size - 1) / 2;
-  Tensor X_reduced_cpu(X_reduced, CPU);
+  TensorCPU X_reduced_cpu(X_reduced);
   for (int i = 0; i < X_reduced.size(); ++i) {
     EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
   }
diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h
index 7d251f7445d1e..108bf45afdb04 100644
--- a/caffe2/mpi/mpi_ops.h
+++ b/caffe2/mpi/mpi_ops.h
@@ -36,7 +36,8 @@ class MPIBroadcastOp final : public Operator<Context> {
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
     CAFFE_ENFORCE(
-        OperatorBase::OutputIsType<Tensor>(0), "Output is of wrong type.");
+        OperatorBase::OutputIsType<Tensor<Context>>(0),
+        "Output is of wrong type.");
     auto* output = Output(0);
     // Make sure that output is already allocated.
     CAFFE_ENFORCE(
@@ -167,8 +168,8 @@ class MPISendTensorOp final : public Operator<Context> {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
     auto& input = Input(INPUT);
     if (InputSize() == 4) {
-      dst_ = OperatorBase::Input<Tensor>(DST, CPU).template data<int>()[0];
-      tag_ = OperatorBase::Input<Tensor>(TAG, CPU).template data<int>()[0];
+      dst_ = OperatorBase::Input<TensorCPU>(DST).template data<int>()[0];
+      tag_ = OperatorBase::Input<TensorCPU>(TAG).template data<int>()[0];
     }
     if (raw_buffer_) {
       // We need to do a const cast to cope with the fact that, before OpenMPI
@@ -210,8 +211,8 @@ class MPIReceiveTensorOp final : public Operator<Context> {
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
     if (InputSize() == 4) {
-      src_ = OperatorBase::Input<Tensor>(SRC_IN, CPU).template data<int>()[0];
-      tag_ = OperatorBase::Input<Tensor>(TAG_IN, CPU).template data<int>()[0];
+      src_ = OperatorBase::Input<TensorCPU>(SRC_IN).template data<int>()[0];
+      tag_ = OperatorBase::Input<TensorCPU>(TAG_IN).template data<int>()[0];
     }
     MPI_Status status;
     if (raw_buffer_) {
@@ -227,10 +228,10 @@ class MPIReceiveTensorOp final : public Operator<Context> {
     } else {
       CAFFE_NOT_IMPLEMENTED;
     }
-    auto* src_out = OperatorBase::Output<Tensor>(SRC_OUT, CPU);
+    auto* src_out = OperatorBase::Output<TensorCPU>(SRC_OUT);
     src_out->Resize();
     src_out->template mutable_data<int>()[0] = status.MPI_SOURCE;
-    auto* tag_out = OperatorBase::Output<Tensor>(TAG_OUT, CPU);
+    auto* tag_out = OperatorBase::Output<TensorCPU>(TAG_OUT);
     tag_out->Resize();
     tag_out->template mutable_data<int>()[0] = status.MPI_TAG;
     return true;
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc
index df97bcffb2cf6..afeac6d127ba7 100644
--- a/caffe2/observers/profile_observer_gpu.cc
+++ b/caffe2/observers/profile_observer_gpu.cc
@@ -26,10 +26,17 @@ void ProfileOperatorObserver::Dump() const {
   LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type()
             << " op#" << getId() << " ---------";
   for (int i = 0; i < subject_->InputSize(); ++i) {
-    const auto& tensor = subject_->Input<Tensor>(i);
-    const auto& name = subject_->debug_def().input(i);
-    TensorPrinter printer(name);
-    LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
+    if (subject_->InputIsType<TensorCPU>(i)) {
+      const auto& tensor = subject_->Input<TensorCPU>(i);
+      const auto& name = subject_->debug_def().input(i);
+      TensorPrinter printer(name);
+      LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
+    } else if (subject_->InputIsType<TensorCUDA>(i)) {
+      const auto& tensor = subject_->Input<TensorCUDA>(i);
+      const auto& name = subject_->debug_def().input(i);
+      TensorPrinter printer(name);
+      LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
+    }
   }
 
   int a = 0;
@@ -39,13 +46,13 @@ void ProfileOperatorObserver::Dump() const {
   }
 
   for (int o = 0; o < subject_->OutputSize(); ++o) {
-    if (subject_->OutputIsType<Tensor>(o, CPU)) {
-      auto* tensor = subject_->Output<Tensor>(o, CPU);
+    if (subject_->OutputIsType<TensorCPU>(o)) {
+      auto* tensor = subject_->Output<TensorCPU>(o);
       const auto& name = subject_->debug_def().output(o);
       TensorPrinter printer(name);
       LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
-    } else if (subject_->OutputIsType<Tensor>(o, CUDA)) {
-      auto* tensor = subject_->Output<Tensor>(o, CUDA);
+    } else if (subject_->OutputIsType<TensorCUDA>(o)) {
+      auto* tensor = subject_->Output<TensorCUDA>(o);
       const auto& name = subject_->debug_def().output(o);
       TensorPrinter printer(name);
       LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
diff --git a/caffe2/operators/accuracy_op.cc b/caffe2/operators/accuracy_op.cc
index 8c1273eca2099..03733ed89a3b2 100644
--- a/caffe2/operators/accuracy_op.cc
+++ b/caffe2/operators/accuracy_op.cc
@@ -38,7 +38,7 @@ bool AccuracyOp<float, CPUContext>::RunOnDevice() {
     }
   }
   CAFFE_ENFORCE_LE(correct, N);
-  *(Y->template mutable_data<float>()) = static_cast<float>(correct) / N;
+  *(Y->mutable_data<float>()) = static_cast<float>(correct) / N;
 
   return true;
 }
@@ -46,10 +46,10 @@ bool AccuracyOp<float, CPUContext>::RunOnDevice() {
 REGISTER_CPU_OPERATOR(Accuracy, AccuracyOp<float, CPUContext>);
 
 OPERATOR_SCHEMA(Accuracy)
-    .NumInputs(2)
-    .NumOutputs(1)
-    .ScalarType(TensorProto::FLOAT)
-    .SetDoc(R"DOC(
+  .NumInputs(2)
+  .NumOutputs(1)
+  .ScalarType(TensorProto::FLOAT)
+  .SetDoc(R"DOC(
 Accuracy takes two inputs- predictions and labels, and returns a float
 accuracy value for the batch. Predictions are expected in the form of 2-D tensor
 containing a batch of scores for various classes, and labels are expected in the
@@ -57,25 +57,16 @@ containing a batch of scores for various classes, and labels are expected in the
 the score for the label index in the predictions is the highest among all
 classes, it is considered a correct prediction.
 )DOC")
-    .Arg(
-        "top_k",
-        "Count as correct by comparing the true label to the top k scoring "
-        "classes (default 1: only compare to the top scoring class i.e. argmax)")
-    .Input(
-        0,
-        "predictions",
-        "2-D tensor (Tensor<float>) of size "
-        "(num_batches x num_classes) containing scores")
-    .Input(
-        1,
-        "labels",
-        "1-D tensor (Tensor<float>) of size (num_batches) having "
+  .Arg(
+      "top_k",
+      "Count as correct by comparing the true label to the top k scoring "
+      "classes (default 1: only compare to the top scoring class i.e. argmax)")
+  .Input(0, "predictions", "2-D tensor (Tensor<float>) of size "
+         "(num_batches x num_classes) containing scores")
+  .Input(1, "labels", "1-D tensor (Tensor<int>) of size (num_batches) having "
         "the indices of true labels")
-    .Output(
-        0,
-        "accuracy",
-        "1-D tensor (Tensor<float>) of size 1 containing "
-        "accuracy");
+  .Output(0, "accuracy", "1-D tensor (Tensor<float>) of size 1 containing "
+          "accuracy");
 
 SHOULD_NOT_DO_GRADIENT(Accuracy);
 }  // namespace caffe2
diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu
index 5d27707662c74..949a077ec9a80 100644
--- a/caffe2/operators/accuracy_op.cu
+++ b/caffe2/operators/accuracy_op.cu
@@ -54,7 +54,7 @@ bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(label.ndim(), 1);
   CAFFE_ENFORCE_EQ(label.dim32(0), N);
   Y->Resize(vector<TIndex>());
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   math::Set<float, CUDAContext>(1, 0, Ydata, &context_);
   AccuracyKernel<<<
       std::min(CAFFE_MAXIMUM_NUM_BLOCKS, N),
diff --git a/caffe2/operators/affine_channel_op.cc b/caffe2/operators/affine_channel_op.cc
index a19e96f9e1884..823a3cf8fee37 100644
--- a/caffe2/operators/affine_channel_op.cc
+++ b/caffe2/operators/affine_channel_op.cc
@@ -70,7 +70,7 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
       scale_dims.data(),
       dY_data,
       scale_data,
-      dX->template mutable_data<float>(),
+      dX->mutable_data<float>(),
       &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
@@ -85,8 +85,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
         HxW,
         dY_data,
         X_data,
-        dscale->template mutable_data<float>(),
-        dbias->template mutable_data<float>());
+        dscale->mutable_data<float>(),
+        dbias->mutable_data<float>());
   }
   return true;
 }
@@ -104,12 +104,7 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   const float* dY_data = dY.data<float>();
   const float* scale_data = scale.data<float>();
   math::RowwiseMul<float, CPUContext>(
-      rows,
-      cols,
-      dY_data,
-      scale_data,
-      dX->template mutable_data<float>(),
-      &context_);
+      rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
     const float* X_data = X.data<float>();
@@ -125,8 +120,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
         HxW,
         dY_data,
         X_data,
-        dscale->template mutable_data<float>(),
-        dbias->template mutable_data<float>());
+        dscale->mutable_data<float>(),
+        dbias->mutable_data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu
index 6faa01eb97419..f3a9703cd14c3 100644
--- a/caffe2/operators/affine_channel_op.cu
+++ b/caffe2/operators/affine_channel_op.cu
@@ -71,7 +71,7 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
       scale_dims.data(),
       dY_data,
       scale_data,
-      dX->template mutable_data<float>(),
+      dX->mutable_data<float>(),
       &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
@@ -91,8 +91,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
             HxW,
             dY_data,
             X_data,
-            dscale->template mutable_data<float>(),
-            dbias->template mutable_data<float>());
+            dscale->mutable_data<float>(),
+            dbias->mutable_data<float>());
   }
   return true;
 }
@@ -110,12 +110,7 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const float* dY_data = dY.data<float>();
   const float* scale_data = scale.data<float>();
   math::RowwiseMul<float, CUDAContext>(
-      rows,
-      cols,
-      dY_data,
-      scale_data,
-      dX->template mutable_data<float>(),
-      &context_);
+      rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
   if (is_learnable_) {
     const auto& X = Input(1);
     const float* X_data = X.data<float>();
@@ -135,8 +130,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
             HxW,
             dY_data,
             X_data,
-            dscale->template mutable_data<float>(),
-            dbias->template mutable_data<float>());
+            dscale->mutable_data<float>(),
+            dbias->mutable_data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/apmeter_op.cc b/caffe2/operators/apmeter_op.cc
index 4867d86097de7..7965c81cad2e5 100644
--- a/caffe2/operators/apmeter_op.cc
+++ b/caffe2/operators/apmeter_op.cc
@@ -58,7 +58,7 @@ bool APMeterOp<float, CPUContext>::RunOnDevice() {
 
   const auto* Xdata = X.data<float>();
   const auto* labelData = label.data<int>();
-  auto* Ydata = Y->template mutable_data<float>();
+  auto* Ydata = Y->mutable_data<float>();
 
   BufferPredictions(Xdata, labelData, N, D);
 
@@ -116,7 +116,7 @@ per class for the average precision of that class.
     .Input(
         1,
         "labels",
-        "2-D tensor (Tensor<float>) of size (num_samples) "
+        "2-D tensor (Tensor<int>) of size (num_samples) "
         "containing true labels for each sample")
     .Output(
         0,
diff --git a/caffe2/operators/assert_op.h b/caffe2/operators/assert_op.h
index 27e0579d91e06..65bddaca64ca0 100644
--- a/caffe2/operators/assert_op.h
+++ b/caffe2/operators/assert_op.h
@@ -41,7 +41,7 @@ class AssertOp final : public Operator<Context> {
   }
 
  private:
-  Tensor cmp_tensor_{CPU};
+  TensorCPU cmp_tensor_;
   std::string error_msg_;
 };
 
diff --git a/caffe2/operators/atomic_ops.cc b/caffe2/operators/atomic_ops.cc
index 73c4196b6e9b1..31a4dd659f756 100644
--- a/caffe2/operators/atomic_ops.cc
+++ b/caffe2/operators/atomic_ops.cc
@@ -33,8 +33,8 @@ class AtomicFetchAddOp final : public Operator<CPUContext> {
     d->Resize(std::vector<TIndex>());
     auto* aPtr = a.data<int32_t>();
     auto* bPtr = b.data<int32_t>();
-    auto* cPtr = c->template mutable_data<int32_t>();
-    auto* dPtr = d->template mutable_data<int32_t>();
+    auto* cPtr = c->mutable_data<int32_t>();
+    auto* dPtr = d->mutable_data<int32_t>();
     std::lock_guard<std::mutex> lg(*mutex);
     *dPtr = *aPtr;
     *cPtr = *aPtr + *bPtr;
@@ -77,7 +77,7 @@ class CheckAtomicBoolOp final : public Operator<CPUContext> {
   bool RunOnDevice() override {
     auto& ptr = OperatorBase::Input<std::unique_ptr<std::atomic<bool>>>(0);
     Output(0)->Resize(1);
-    *Output(0)->template mutable_data<bool>() = ptr->load();
+    *Output(0)->mutable_data<bool>() = ptr->load();
     return true;
   }
 };
diff --git a/caffe2/operators/batch_gather_ops.cu b/caffe2/operators/batch_gather_ops.cu
index 2d047660491b5..8aa8cb42a3e01 100644
--- a/caffe2/operators/batch_gather_ops.cu
+++ b/caffe2/operators/batch_gather_ops.cu
@@ -31,7 +31,7 @@ __global__ void BatchGatherKernel(
 template <>
 bool BatchGatherOp<CUDAContext>::RunOnDevice() {
   return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
+      this, OperatorBase::Input<TensorCUDA>(INDICES));
 }
 
 template <>
@@ -99,7 +99,7 @@ __global__ void BatchGatherGradientKernel(
 template <>
 bool BatchGatherGradientOp<CUDAContext>::RunOnDevice() {
   return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
+      this, OperatorBase::Input<TensorCUDA>(INDICES));
 }
 
 template <>
@@ -107,7 +107,7 @@ template <typename TInd>
 bool BatchGatherGradientOp<CUDAContext>::DoRunWithType() {
   return DispatchHelper<
       TensorTypes2<float, GenericTensorImplementation>,
-      TInd>::call(this, OperatorBase::Input<Tensor>(DATA, CUDA));
+      TInd>::call(this, OperatorBase::Input<TensorCUDA>(DATA));
 }
 
 template <>
diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h
index 01177441c021d..b9d3491132c8f 100644
--- a/caffe2/operators/batch_gather_ops.h
+++ b/caffe2/operators/batch_gather_ops.h
@@ -15,7 +15,7 @@ class BatchGatherOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(INDICES, CPU));
+        this, OperatorBase::Input<TensorCPU>(INDICES));
   }
 
   template <typename TInd>
@@ -54,7 +54,8 @@ class BatchGatherOp final : public Operator<Context> {
         auto src =
             src_base + idx * block_bytesize + batch * data_batch_bytesize;
         auto dst = out + i * block_bytesize + batch * gathered_batch_bytesize;
-        context_.CopyItemsSameDevice(data.meta(), block_size, src, dst);
+        context_.template CopyItems<Context, Context>(
+            data.meta(), block_size, src, dst);
       }
     }
     return true;
@@ -71,7 +72,7 @@ class BatchGatherGradientOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(INDICES, CPU));
+        this, OperatorBase::Input<TensorCPU>(INDICES));
   }
 
   template <typename TInd>
diff --git a/caffe2/operators/batch_matmul_op.h b/caffe2/operators/batch_matmul_op.h
index 6408f1fa4495d..e594f526a6bf6 100644
--- a/caffe2/operators/batch_matmul_op.h
+++ b/caffe2/operators/batch_matmul_op.h
@@ -20,7 +20,7 @@ class BatchMatMulOp final : public Operator<Context> {
         broadcast_(OperatorBase::GetSingleArgument<int>("broadcast", 0)),
         use_scratch_(OperatorBase::GetSingleArgument<int>("use_scratch", 0)) {
     if (use_scratch_) {
-      scratch_ = std::make_shared<Tensor>(Context::GetDeviceType());
+      scratch_ = std::make_shared<Tensor<Context>>();
     }
   }
 
@@ -282,7 +282,7 @@ class BatchMatMulOp final : public Operator<Context> {
   bool broadcast_;
 
   bool use_scratch_;
-  std::shared_ptr<Tensor> scratch_;
+  std::shared_ptr<Tensor<Context>> scratch_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/batch_matmul_op_gpu_test.cc b/caffe2/operators/batch_matmul_op_gpu_test.cc
index 33a5363b0afc3..e8424f0837d6b 100644
--- a/caffe2/operators/batch_matmul_op_gpu_test.cc
+++ b/caffe2/operators/batch_matmul_op_gpu_test.cc
@@ -30,20 +30,20 @@ class BatchMatMulOpGPUTest : public testing::Test {
       const float value,
       const string& name) {
     Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutableTensor(CUDA);
+    auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
     tensor->Resize(dims);
     math::Set<float, CUDAContext>(
         tensor->size(),
         value,
-        tensor->template mutable_data<float>(),
+        tensor->mutable_data<float>(),
         cuda_context_.get());
   }
 
   void VerifyOutput(const std::vector<TIndex>& dims, const float value) const {
     const Blob* Y_blob = ws_.GetBlob("Y");
     ASSERT_NE(nullptr, Y_blob);
-    const auto& Y = Y_blob->Get<Tensor>();
-    Tensor Y_cpu(Y, CPU);
+    const auto& Y = Y_blob->Get<Tensor<CUDAContext>>();
+    TensorCPU Y_cpu(Y);
     const auto& Y_dims = Y_cpu.dims();
     ASSERT_EQ(dims.size(), Y_dims.size());
     for (std::size_t i = 0; i < dims.size(); ++i) {
diff --git a/caffe2/operators/batch_matmul_op_test.cc b/caffe2/operators/batch_matmul_op_test.cc
index 28fa8c1a90867..0ec1799179839 100644
--- a/caffe2/operators/batch_matmul_op_test.cc
+++ b/caffe2/operators/batch_matmul_op_test.cc
@@ -24,12 +24,12 @@ class BatchMatMulOpTest : public testing::Test {
       const float value,
       const string& name) {
     Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutableTensor(CPU);
+    auto* tensor = blob->GetMutable<TensorCPU>();
     tensor->Resize(dims);
     math::Set<float, CPUContext>(
         tensor->size(),
         value,
-        tensor->template mutable_data<float>(),
+        tensor->mutable_data<float>(),
         cpu_context_.get());
   }
 
diff --git a/caffe2/operators/bbox_transform_op.cc b/caffe2/operators/bbox_transform_op.cc
index 79520face8c09..0d2b5a3a9aa25 100644
--- a/caffe2/operators/bbox_transform_op.cc
+++ b/caffe2/operators/bbox_transform_op.cc
@@ -144,9 +144,7 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
 
   box_out->ResizeLike(delta_in);
   Eigen::Map<ERArrXXf> new_boxes(
-      box_out->template mutable_data<float>(),
-      box_out->dim32(0),
-      box_out->dim32(1));
+      box_out->mutable_data<float>(), box_out->dim32(0), box_out->dim32(1));
 
   // We assume roi_in and delta_in over multiple batches are grouped
   // together in increasing order as generated by GenerateProposalsOp
@@ -189,7 +187,7 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
     auto* roi_batch_splits = Output(1);
     roi_batch_splits->Resize(batch_size);
     Eigen::Map<EArrXf> roi_batch_splits_map(
-        roi_batch_splits->template mutable_data<float>(), batch_size);
+        roi_batch_splits->mutable_data<float>(), batch_size);
     roi_batch_splits_map =
         Eigen::Map<const EArrXi>(num_rois_per_batch.data(), batch_size)
             .cast<float>();
diff --git a/caffe2/operators/boolean_mask_ops.cc b/caffe2/operators/boolean_mask_ops.cc
index 2d1deb0badc5d..b38182b8aa98b 100644
--- a/caffe2/operators/boolean_mask_ops.cc
+++ b/caffe2/operators/boolean_mask_ops.cc
@@ -91,7 +91,8 @@ bool BooleanMaskOp<CPUContext>::RunOnDevice() {
       const auto* src = inPtr + lastStart * innerSizeBytes;
       auto* dst = outPtr + outStart * innerSizeBytes;
       int numItems = i - lastStart;
-      context_.CopyItemsSameDevice(data.meta(), numItems * innerSize, src, dst);
+      context_.template CopyItems<CPUContext, CPUContext>(
+          data.meta(), numItems * innerSize, src, dst);
       outStart += numItems;
       lastStart = -1;
     }
@@ -355,9 +356,9 @@ bool SequenceMaskOp<CPUContext>::RunOnDevice() {
 template <>
 template <class T>
 bool SequenceMaskOp<CPUContext>::DoRunWithType() {
-  const Tensor* input = &Input(0);
-  const Tensor* sequence_lengths = nullptr;
-  const Tensor* window_centers = nullptr;
+  const Tensor<CPUContext>* input = &Input(0);
+  const Tensor<CPUContext>* sequence_lengths = nullptr;
+  const Tensor<CPUContext>* window_centers = nullptr;
 
   if (mode_ == "sequence") {
     sequence_lengths = &Input(1);
@@ -412,7 +413,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
           SequenceFunctor(
               sequence_lengths->data<int>(), sequence_lengths->size()),
           fill_val,
-          output->template mutable_data<T>());
+          output->mutable_data<T>());
     } else {
       MaskWithFunctor(
           left,
@@ -422,7 +423,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
           SequenceFunctor(
               sequence_lengths->data<int>(), sequence_lengths->size()),
           fill_val,
-          output->template mutable_data<T>());
+          output->mutable_data<T>());
     }
   } else if (mode_ == "window") {
     MaskWithFunctor(
@@ -432,7 +433,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         WindowFunctor(window_centers->data<int>(), radius_),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "upper") {
     MaskWithFunctor(
         left,
@@ -441,7 +442,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         UpperFunctor(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "lower") {
     MaskWithFunctor(
         left,
@@ -450,7 +451,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         LowerFunctor(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "upperdiag") {
     MaskWithFunctor(
         left,
@@ -459,7 +460,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         UpperDiagFunctor(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "lowerdiag") {
     MaskWithFunctor(
         left,
@@ -468,7 +469,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
         input->data<T>(),
         LowerDiagFunctor(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else {
     CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
     return false;
diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu
index f62ec513ca2e4..85315768bd85d 100644
--- a/caffe2/operators/boolean_mask_ops.cu
+++ b/caffe2/operators/boolean_mask_ops.cu
@@ -73,7 +73,8 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
 
     // Copy numOfOutput from gpu to cpu
     TIndex numOfOutput;
-    context_.CopyToCPU(1, numOfOutputData, &numOfOutput);
+    context_.Copy<TIndex, CUDAContext, CPUContext>(
+        1, numOfOutputData, &numOfOutput);
 
     indices_.Resize(numOfOutput);
     std::vector<TIndex> dims = src.dims();
@@ -84,7 +85,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
     if (OutputSize() == 2) {
       auto* indicesOut = Output(1);
       indicesOut->Resize(numOfOutput);
-      indicesOut->template mutable_data<TIndex>();
+      indicesOut->mutable_data<TIndex>();
     }
 
     if (numOfOutput > 0) {
@@ -108,8 +109,8 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
   }
 
  private:
-  Tensor indices_{CUDA};
-  Tensor scratch_{CUDA};
+  Tensor<CUDAContext> indices_;
+  Tensor<CUDAContext> scratch_;
 };
 
 REGISTER_CUDA_OPERATOR(BooleanMask, BooleanMaskOp<CUDAContext>);
@@ -296,9 +297,9 @@ bool SequenceMaskOp<CUDAContext>::RunOnDevice() {
 template <>
 template <class T>
 bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
-  const Tensor* input = &Input(0);
-  const Tensor* sequence_lengths = nullptr;
-  const Tensor* window_centers = nullptr;
+  const Tensor<CUDAContext>* input = &Input(0);
+  const Tensor<CUDAContext>* sequence_lengths = nullptr;
+  const Tensor<CUDAContext>* window_centers = nullptr;
 
   if (mode_ == "sequence") {
     sequence_lengths = &Input(1);
@@ -354,7 +355,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
           input->data<T>(),
           sequence_lengths->data<int>(),
           fill_val,
-          output->template mutable_data<T>());
+          output->mutable_data<T>());
     } else {
       sequenceMaskKernel<<<
           CAFFE_GET_BLOCKS(left * right),
@@ -367,7 +368,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
           input->data<T>(),
           sequence_lengths->data<int>(),
           fill_val,
-          output->template mutable_data<T>());
+          output->mutable_data<T>());
     }
   } else if (mode_ == "window") {
     windowMaskKernel<<<
@@ -382,7 +383,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         window_centers->data<int>(),
         radius_,
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "upper") {
     upperMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -394,7 +395,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "lower") {
     lowerMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -406,7 +407,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "upperdiag") {
     upperDiagMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -418,7 +419,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else if (mode_ == "lowerdiag") {
     lowerDiagMaskKernel<<<
         CAFFE_GET_BLOCKS(left * right),
@@ -430,7 +431,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
         batch_dim,
         input->data<T>(),
         fill_val,
-        output->template mutable_data<T>());
+        output->mutable_data<T>());
   } else {
     CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
   }
diff --git a/caffe2/operators/boolean_unmask_ops.cu b/caffe2/operators/boolean_unmask_ops.cu
index 2dfc4a19944be..dcdec9c33df7b 100644
--- a/caffe2/operators/boolean_unmask_ops.cu
+++ b/caffe2/operators/boolean_unmask_ops.cu
@@ -77,9 +77,9 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
       hostValuesData[i] = (char*)value.raw_data();
       hostValueSizesData[i] = value.size();
     }
-    masks_.CopyFrom(hostMasks_);
-    values_.CopyFrom(hostValues_);
-    valueSizes_.CopyFrom(hostValueSizes_);
+    masks_.CopyFrom(hostMasks_, &context_);
+    values_.CopyFrom(hostValues_, &context_);
+    valueSizes_.CopyFrom(hostValueSizes_, &context_);
 
     indices_.Resize(maskSize);
     auto* indicesData = indices_.mutable_data<int>();
@@ -109,14 +109,14 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
   }
 
  private:
-  Tensor indices_{CUDA};
-  Tensor masks_{CUDA};
-  Tensor values_{CUDA};
-  Tensor valueSizes_{CUDA};
-
-  Tensor hostMasks_{CPU};
-  Tensor hostValues_{CPU};
-  Tensor hostValueSizes_{CPU};
+  Tensor<CUDAContext> indices_;
+  Tensor<CUDAContext> masks_;
+  Tensor<CUDAContext> values_;
+  Tensor<CUDAContext> valueSizes_;
+
+  Tensor<CPUContext> hostMasks_;
+  Tensor<CPUContext> hostValues_;
+  Tensor<CPUContext> hostValueSizes_;
 };
 
 REGISTER_CUDA_OPERATOR(BooleanUnmask, BooleanUnmaskOp<CUDAContext>);
diff --git a/caffe2/operators/boolean_unmask_ops_test.cc b/caffe2/operators/boolean_unmask_ops_test.cc
index 2972cee495747..05c588f36aa0a 100644
--- a/caffe2/operators/boolean_unmask_ops_test.cc
+++ b/caffe2/operators/boolean_unmask_ops_test.cc
@@ -16,13 +16,13 @@ static void AddScalarInput(
     Workspace* ws,
     bool isEmpty = false) {
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   if (!isEmpty) {
     tensor->Resize(vector<TIndex>{1});
-    *(tensor->template mutable_data<DataT>()) = value;
+    *(tensor->mutable_data<DataT>()) = value;
   } else {
     tensor->Resize(vector<TIndex>{0});
-    tensor->template mutable_data<DataT>();
+    tensor->mutable_data<DataT>();
   }
   return;
 }
diff --git a/caffe2/operators/box_with_nms_limit_op.cc b/caffe2/operators/box_with_nms_limit_op.cc
index 8c21cbd77a7be..9a3f45f85b85d 100644
--- a/caffe2/operators/box_with_nms_limit_op.cc
+++ b/caffe2/operators/box_with_nms_limit_op.cc
@@ -77,8 +77,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
   out_boxes->Resize(0, box_dim);
   out_classes->Resize(0);
 
-  Tensor* out_keeps = nullptr;
-  Tensor* out_keeps_size = nullptr;
+  TensorCPU* out_keeps = nullptr;
+  TensorCPU* out_keeps_size = nullptr;
   if (OutputSize() > 4) {
     out_keeps = Output(4);
     out_keeps_size = Output(5);
@@ -194,8 +194,7 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
       auto cur_boxes = boxes.block(0, j * box_dim, boxes.rows(), box_dim);
       auto& cur_keep = keeps[j];
       Eigen::Map<EArrXf> cur_out_scores(
-          out_scores->template mutable_data<float>() + cur_start_idx +
-              cur_out_idx,
+          out_scores->mutable_data<float>() + cur_start_idx + cur_out_idx,
           cur_keep.size());
       Eigen::Map<ERArrXXf> cur_out_boxes(
           out_boxes->mutable_data<float>() +
@@ -203,8 +202,7 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
           cur_keep.size(),
           box_dim);
       Eigen::Map<EArrXf> cur_out_classes(
-          out_classes->template mutable_data<float>() + cur_start_idx +
-              cur_out_idx,
+          out_classes->mutable_data<float>() + cur_start_idx + cur_out_idx,
           cur_keep.size());
 
       utils::GetSubArray(
@@ -222,11 +220,9 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
       out_keeps->Extend(total_keep_count, 50, &context_);
 
       Eigen::Map<EArrXi> out_keeps_arr(
-          out_keeps->template mutable_data<int>() + cur_start_idx,
-          total_keep_count);
+          out_keeps->mutable_data<int>() + cur_start_idx, total_keep_count);
       Eigen::Map<EArrXi> cur_out_keeps_size(
-          out_keeps_size->template mutable_data<int>() + b * num_classes,
-          num_classes);
+          out_keeps_size->mutable_data<int>() + b * num_classes, num_classes);
 
       cur_out_idx = 0;
       for (int j = 0; j < num_classes; j++) {
@@ -244,7 +240,7 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
     auto* batch_splits_out = Output(3);
     batch_splits_out->Resize(batch_size);
     Eigen::Map<EArrXf> batch_splits_out_map(
-        batch_splits_out->template mutable_data<float>(), batch_size);
+        batch_splits_out->mutable_data<float>(), batch_size);
     batch_splits_out_map =
         Eigen::Map<const EArrXi>(total_keep_per_batch.data(), batch_size)
             .cast<float>();
diff --git a/caffe2/operators/ceil_op.cu b/caffe2/operators/ceil_op.cu
index 651b0020eddd9..de382ada4186d 100644
--- a/caffe2/operators/ceil_op.cu
+++ b/caffe2/operators/ceil_op.cu
@@ -22,7 +22,7 @@ bool CeilOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->template mutable_data<float>());
+      X.size(), X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/channel_backprop_stats_op.cc b/caffe2/operators/channel_backprop_stats_op.cc
index 90b0b38ceceef..bee287d29cef9 100644
--- a/caffe2/operators/channel_backprop_stats_op.cc
+++ b/caffe2/operators/channel_backprop_stats_op.cc
@@ -26,10 +26,8 @@ bool ChannelBackpropStatsOp<CPUContext>::RunOnDevice() {
   ConstEigenVectorArrayMap<float> mean_arr(Input(SAVED_MEAN).data<float>(), C);
   ConstEigenVectorArrayMap<float> inv_stddev_arr(
       Input(SAVED_INV_STDDEV).data<float>(), C);
-  EigenVectorArrayMap<float> dBias_arr(
-      dBias->template mutable_data<float>(), C);
-  EigenVectorArrayMap<float> dScale_arr(
-      dScale->template mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dBias_arr(dBias->mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dScale_arr(dScale->mutable_data<float>(), C);
 
   dBias_arr.setZero();
   dScale_arr.setZero();
diff --git a/caffe2/operators/channel_backprop_stats_op.cu b/caffe2/operators/channel_backprop_stats_op.cu
index 19999a2360286..3726773843113 100644
--- a/caffe2/operators/channel_backprop_stats_op.cu
+++ b/caffe2/operators/channel_backprop_stats_op.cu
@@ -199,8 +199,8 @@ bool ChannelBackpropStatsOp<CUDAContext>::RunOnDevice() {
           numBlocksPerChannel,
           dBiasScratch_.data<float>(),
           dScaleScratch_.data<float>(),
-          dBias->template mutable_data<float>(),
-          dScale->template mutable_data<float>());
+          dBias->mutable_data<float>(),
+          dScale->mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/channel_backprop_stats_op.h b/caffe2/operators/channel_backprop_stats_op.h
index ce0e08927108b..7678c00d96f87 100644
--- a/caffe2/operators/channel_backprop_stats_op.h
+++ b/caffe2/operators/channel_backprop_stats_op.h
@@ -23,8 +23,8 @@ class ChannelBackpropStatsOp : public Operator<Context> {
   INPUT_TAGS(INPUT, SAVED_MEAN, SAVED_INV_STDDEV, OUTPUT_GRAD);
   OUTPUT_TAGS(SCALE_GRAD, BIAS_GRAD);
 
-  Tensor dBiasScratch_{Context::GetDeviceType()};
-  Tensor dScaleScratch_{Context::GetDeviceType()};
+  Tensor<Context> dBiasScratch_;
+  Tensor<Context> dScaleScratch_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/channel_shuffle_op_gpu.cu b/caffe2/operators/channel_shuffle_op_gpu.cu
index 120947c21af31..ad9a0ab4e79d7 100644
--- a/caffe2/operators/channel_shuffle_op_gpu.cu
+++ b/caffe2/operators/channel_shuffle_op_gpu.cu
@@ -56,7 +56,7 @@ bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), S, C, G, K, X.data<float>(), Y->template mutable_data<float>());
+      X.size(), S, C, G, K, X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
@@ -74,7 +74,7 @@ bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), G, K, X.data<float>(), Y->template mutable_data<float>());
+      X.size(), G, K, X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
@@ -93,13 +93,7 @@ bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      dY.size(),
-      S,
-      C,
-      K,
-      G,
-      dY.data<float>(),
-      dX->template mutable_data<float>());
+      dY.size(), S, C, K, G, dY.data<float>(), dX->mutable_data<float>());
   return true;
 }
 
@@ -117,7 +111,7 @@ bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      dY.size(), K, G, dY.data<float>(), dX->template mutable_data<float>());
+      dY.size(), K, G, dY.data<float>(), dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/channel_stats_op.cc b/caffe2/operators/channel_stats_op.cc
index b9cd19f75909e..442ab48d764de 100644
--- a/caffe2/operators/channel_stats_op.cc
+++ b/caffe2/operators/channel_stats_op.cc
@@ -17,10 +17,8 @@ bool ChannelStatsOp<CPUContext>::RunOnDevice() {
 
   Output(SUM)->Resize(C);
   Output(SUMSQ)->Resize(C);
-  EigenVectorArrayMap<float> sum(
-      Output(SUM)->template mutable_data<float>(), C);
-  EigenVectorArrayMap<float> sumsq(
-      Output(SUMSQ)->template mutable_data<float>(), C);
+  EigenVectorArrayMap<float> sum(Output(SUM)->mutable_data<float>(), C);
+  EigenVectorArrayMap<float> sumsq(Output(SUMSQ)->mutable_data<float>(), C);
 
   sum.setZero();
   sumsq.setZero();
diff --git a/caffe2/operators/channel_stats_op.cu b/caffe2/operators/channel_stats_op.cu
index 7f129ad1ea47e..fff23ffe46102 100644
--- a/caffe2/operators/channel_stats_op.cu
+++ b/caffe2/operators/channel_stats_op.cu
@@ -185,8 +185,8 @@ bool ChannelStatsOp<CUDAContext>::RunOnDevice() {
           numBlocksPerChannel,
           sumScratch_.data<float>(),
           sumsqScratch_.data<float>(),
-          sum->template mutable_data<float>(),
-          sumsq->template mutable_data<float>());
+          sum->mutable_data<float>(),
+          sumsq->mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/channel_stats_op.h b/caffe2/operators/channel_stats_op.h
index 0ccb885285760..eb6b062068c09 100644
--- a/caffe2/operators/channel_stats_op.h
+++ b/caffe2/operators/channel_stats_op.h
@@ -23,8 +23,8 @@ class ChannelStatsOp : public Operator<Context> {
   INPUT_TAGS(INPUT);
   OUTPUT_TAGS(SUM, SUMSQ);
 
-  Tensor sumScratch_{Context::GetDeviceType()};
-  Tensor sumsqScratch_{Context::GetDeviceType()};
+  Tensor<Context> sumScratch_;
+  Tensor<Context> sumsqScratch_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/clip_op.cc b/caffe2/operators/clip_op.cc
index 789a44d61cee4..02e80bd131beb 100644
--- a/caffe2/operators/clip_op.cc
+++ b/caffe2/operators/clip_op.cc
@@ -8,7 +8,7 @@ bool ClipOp<float, CPUContext>::RunOnDevice() {
   auto& X = Input(0);
   auto* Y = Output(0);
   Y->ResizeLike(X);
-  EigenVectorMap<float>(Y->template mutable_data<float>(), Y->size()) =
+  EigenVectorMap<float>(Y->mutable_data<float>(), Y->size()) =
       ConstEigenVectorMap<float>(X.data<float>(), X.size())
           .cwiseMax(min_)
           .cwiseMin(max_);
@@ -25,7 +25,7 @@ bool ClipGradientOp<float, CPUContext>::RunOnDevice() {
   dX->ResizeLike(Y);
   const float* Ydata = Y.data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   for (int i = 0; i < Y.size(); ++i) {
     dXdata[i] = dYdata[i] * (Ydata[i] > min_ && Ydata[i] < max_);
   }
diff --git a/caffe2/operators/clip_op.cu b/caffe2/operators/clip_op.cu
index 167ef21492f50..91b6dca882f39 100644
--- a/caffe2/operators/clip_op.cu
+++ b/caffe2/operators/clip_op.cu
@@ -46,12 +46,9 @@ bool ClipOp<float, CUDAContext>::RunOnDevice() {
   auto* Y = Output(0);
   CAFFE_ENFORCE_GT(X.size(), 0);
   Y->ResizeLike(X);
-  ClipKernel<<<
-      CAFFE_GET_BLOCKS(X.size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      X.size(), min_, max_, X.data<float>(), Y->template mutable_data<float>());
+  ClipKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
+               0, context_.cuda_stream()>>>(
+      X.size(), min_, max_, X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
@@ -63,17 +60,10 @@ bool ClipGradientOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_GT(Y.size(), 0);
   CAFFE_ENFORCE_EQ(dY.size(), Y.size());
   dX->ResizeLike(Y);
-  ClipGradientKernel<<<
-      CAFFE_GET_BLOCKS(Y.size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      Y.size(),
-      min_,
-      max_,
-      Y.data<float>(),
-      dY.data<float>(),
-      dX->template mutable_data<float>());
+  ClipGradientKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS,
+                       0, context_.cuda_stream()>>>(
+      Y.size(), min_, max_, Y.data<float>(), dY.data<float>(),
+      dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc
index 006f6212a31d8..256e2109504ab 100644
--- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc
+++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc
@@ -180,8 +180,8 @@ bool CollectAndDistributeFpnRpnProposalsOp<CPUContext>::RunOnDevice() {
   //   outputs[0].data[...] = rois
   auto* rois_out = Output(0);
   rois_out->Resize(rois.rows(), rois.cols());
-  Eigen::Map<ERArrXXf> rois_out_mat(
-      rois_out->template mutable_data<float>(), rois.rows(), rois.cols());
+  Eigen::Map<ERArrXXf> rois_out_mat(rois_out->mutable_data<float>(),
+                                    rois.rows(), rois.cols());
   rois_out_mat = rois;
 
   // Create new roi blobs for each FPN level
@@ -207,10 +207,9 @@ bool CollectAndDistributeFpnRpnProposalsOp<CPUContext>::RunOnDevice() {
     // Output blob_roi_level
     auto* roi_out = Output(i + 1);
     roi_out->Resize(blob_roi_level.rows(), blob_roi_level.cols());
-    Eigen::Map<ERArrXXf> roi_out_mat(
-        roi_out->template mutable_data<float>(),
-        blob_roi_level.rows(),
-        blob_roi_level.cols());
+    Eigen::Map<ERArrXXf> roi_out_mat(roi_out->mutable_data<float>(),
+                                     blob_roi_level.rows(),
+                                     blob_roi_level.cols());
     roi_out_mat = blob_roi_level;
 
     // Append indices from idx_lvl to rois_idx_restore
@@ -220,9 +219,8 @@ bool CollectAndDistributeFpnRpnProposalsOp<CPUContext>::RunOnDevice() {
   utils::ArgSort(rois_idx_restore);
   auto* rois_idx_restore_out = Output(OutputSize() - 1);
   rois_idx_restore_out->Resize(rois_idx_restore.size());
-  Eigen::Map<EArrXi> rois_idx_restore_out_mat(
-      rois_idx_restore_out->template mutable_data<int>(),
-      rois_idx_restore.size());
+  Eigen::Map<EArrXi> rois_idx_restore_out_mat(rois_idx_restore_out->mutable_data<int>(),
+                                              rois_idx_restore.size());
   rois_idx_restore_out_mat = rois_idx_restore;
 
   return true;
diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h
index 35c170c320e9a..08e87db7af33c 100644
--- a/caffe2/operators/concat_split_op.h
+++ b/caffe2/operators/concat_split_op.h
@@ -81,8 +81,8 @@ class SplitByLengthsOp final : public Operator<Context> {
 
  protected:
   int axis_;
-  Tensor inclusive_scan_buffer_{Context::GetDeviceType()};
-  Tensor inclusive_scan_length_buffer_{Context::GetDeviceType()};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
   // Input: X, optionally split
   // The split tensor is stored in CPU.
 };
@@ -134,7 +134,7 @@ bool SplitOp<Context>::RunOnDevice() {
         0,
         "If you set split with an input blob, do not pass in "
         "split in the argument.");
-    auto& split_tensor = OperatorBase::Input<Tensor>(1, CPU);
+    auto& split_tensor = OperatorBase::Input<TensorCPU>(1);
     CAFFE_ENFORCE_EQ(split_tensor.size(), OutputSize());
     axis_data = split_tensor.template data<int>();
   } else if (split_.size() == 0) {
@@ -199,7 +199,7 @@ bool SplitOp<Context>::RunOnDevice() {
 template <class Context>
 bool SplitByLengthsOp<Context>::RunOnDevice() {
   auto& input = Input(0);
-  auto& length = OperatorBase::Input<Tensor>(1, CPU);
+  auto& length = OperatorBase::Input<TensorCPU>(1);
   auto length_length = length.size();
   CAFFE_ENFORCE_EQ(
       length_length % OutputSize(),
@@ -244,7 +244,7 @@ bool SplitByLengthsOp<Context>::RunOnDevice() {
 template <class Context>
 bool ConcatOp<Context>::RunOnDevice() {
   auto* output = Output(0);
-  Tensor* split = OperatorBase::Output<Tensor>(1, CPU);
+  TensorCPU* split = OperatorBase::Output<TensorCPU>(1);
   split->Resize(vector<TIndex>(1, InputSize()));
   int* axis_data = split->template mutable_data<int>();
   auto& input_zero = Input(0);
diff --git a/caffe2/operators/conditional_op.cc b/caffe2/operators/conditional_op.cc
index e202ea2e9881e..9c53f3f4cb9bd 100644
--- a/caffe2/operators/conditional_op.cc
+++ b/caffe2/operators/conditional_op.cc
@@ -34,10 +34,10 @@ bool ConditionalOp<CPUContext>::RunOnDevice() {
   for (TIndex i = 0; i < condition.size(); i++) {
     auto* dst = outPtr + i * innerSizeBytes;
     if (condPtr[i]) {
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<CPUContext, CPUContext>(
           dataT.meta(), innerSize, ptrT + i * innerSizeBytes, dst);
     } else {
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<CPUContext, CPUContext>(
           dataF.meta(), innerSize, ptrF + i * innerSizeBytes, dst);
     }
   }
diff --git a/caffe2/operators/conv_op.h b/caffe2/operators/conv_op.h
index 019e91248b9d8..efdc30f161a31 100644
--- a/caffe2/operators/conv_op.h
+++ b/caffe2/operators/conv_op.h
@@ -54,10 +54,10 @@ class ConvOp final : public ConvPoolOpBase<Context> {
       const T* bias,
       T* Y);
 
-  Tensor col_buffer_{Context::GetDeviceType()};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
-  Tensor img_shape_device_{Context::GetDeviceType()};
-  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
+  Tensor<Context> col_buffer_;
+  Tensor<Context> bias_multiplier_;
+  Tensor<Context> img_shape_device_;
+  Tensor<Context> col_buffer_shape_device_;
   // Input: X, W, b
   // Output: Y
   INPUT_TAGS(INPUT, FILTER, BIAS);
@@ -83,10 +83,10 @@ class ConvGradientOp final : public ConvPoolOpBase<Context> {
   bool RunOnDeviceWithOrderNHWC() override;
 
  private:
-  Tensor col_buffer_{Context::GetDeviceType()};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
-  Tensor img_shape_device_{Context::GetDeviceType()};
-  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
+  Tensor<Context> col_buffer_;
+  Tensor<Context> bias_multiplier_;
+  Tensor<Context> img_shape_device_;
+  Tensor<Context> col_buffer_shape_device_;
   bool no_bias_;
   // input: X, W, dY
   // output: dW, db, and optionally dX
diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h
index 9975c04cb6ece..f8ad628c0ca72 100644
--- a/caffe2/operators/conv_op_impl.h
+++ b/caffe2/operators/conv_op_impl.h
@@ -19,9 +19,9 @@ namespace caffe2 {
 
 template <typename T, class Context>
 bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), C = X.dim32(1);
   CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
   const int M = filter.dim32(0);
@@ -96,7 +96,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
         N, C, HxW, M, X_data, filter_data, bias_data, Y_data);
   }
 
-  auto f = [&](Tensor* col_buffer) {
+  auto f = [&](Tensor<Context>* col_buffer) {
     col_buffer->Resize(buffer_shape);
     T* col_buffer_data = col_buffer->template mutable_data<T>();
     // Im2Col, followed by gemm.
@@ -180,9 +180,9 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
 // The implementations.
 template <typename T, class Context>
 bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
-  const Tensor& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
 
   CAFFE_ENFORCE_EQ(
@@ -233,7 +233,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
     ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
         output_image_size, &bias_multiplier_);
   }
-  auto f = [&](Tensor* col_buffer) {
+  auto f = [&](Tensor<Context>* col_buffer) {
     col_buffer->Resize(
         vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
     T* col_buffer_data = col_buffer->template mutable_data<T>();
diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc
index b9f54b6d55be7..b0ad152a56f9c 100644
--- a/caffe2/operators/conv_op_shared.cc
+++ b/caffe2/operators/conv_op_shared.cc
@@ -19,16 +19,16 @@ void createSharedBuffer<CPUContext>(Workspace* ws) {
 }
 
 template <>
-void runWithSharedBuffer<CPUContext>(
+void runWithSharedBuffer(
     Workspace* ws,
-    std::function<void(Tensor* buffer)> f) {
+    std::function<void(Tensor<CPUContext>* buffer)> f) {
   auto* mutexBlob = ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU_MUTEX__");
   CAFFE_ENFORCE(mutexBlob, "Must call createSharedBuffer() first");
 
   auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
   std::lock_guard<std::mutex> g(**mutexPtr);
   auto* buffer =
-      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutableTensor(CPU);
+      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutable<TensorCPU>();
   f(buffer);
 }
 }
diff --git a/caffe2/operators/conv_op_shared.h b/caffe2/operators/conv_op_shared.h
index 34ccee5fa2c45..18f7aa898a6b5 100644
--- a/caffe2/operators/conv_op_shared.h
+++ b/caffe2/operators/conv_op_shared.h
@@ -19,7 +19,9 @@ void createSharedBuffer(Workspace* ws);
  * access to shared buffer.
  */
 template <typename Context>
-void runWithSharedBuffer(Workspace* ws, std::function<void(Tensor* buffer)> f);
+void runWithSharedBuffer(
+    Workspace* ws,
+    std::function<void(Tensor<Context>* buffer)> f);
 } // namespace caffe2
 
 #endif // CAFFE2_OPERATORS_CONV_OP_SHARED_H_
diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc
index f80d15a5d9054..eb5a762cbd3e3 100644
--- a/caffe2/operators/conv_op_shared_gpu.cc
+++ b/caffe2/operators/conv_op_shared_gpu.cc
@@ -12,16 +12,16 @@ void createSharedBuffer<CUDAContext>(Workspace* ws) {
 }
 
 template <>
-void runWithSharedBuffer<CUDAContext>(
+void runWithSharedBuffer(
     Workspace* ws,
-    std::function<void(Tensor* buffer)> f) {
+    std::function<void(Tensor<CUDAContext>* buffer)> f) {
   auto* mutexBlob = ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA_MUTEX__");
   CAFFE_ENFORCE(mutexBlob, "Must call createSharedBuffer() first");
 
   auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
   std::lock_guard<std::mutex> g(**mutexPtr);
-  auto* buffer =
-      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")->GetMutableTensor(CUDA);
+  auto* buffer = ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")
+                     ->GetMutable<TensorCUDA>();
   f(buffer);
 }
 }
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index 0cea8f9645a71..723304994c4e9 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -168,7 +168,7 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
   // Returns the input image dimensions for the current storage order type.
-  vector<int> GetDims(const Tensor& input) {
+  vector<int> GetDims(const Tensor<Context>& input) {
     vector<int> dims;
     switch (order_) {
       case StorageOrder::NCHW:
@@ -184,7 +184,7 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
   // Returns the size of the input image for the current storage type.
-  int GetDimsSize(const Tensor& input) {
+  int GetDimsSize(const Tensor<Context>& input) {
     int size = 0;
     switch (order_) {
       case StorageOrder::NCHW:
@@ -214,8 +214,12 @@ class ConvPoolOpBase : public Operator<Context> {
   // Note(jiayq): the templatization of this function is mainly to help
   // implementations that do not use first-class Tensor objects, such as the
   // MKL operator. One can still call this function with dummy
-  // Tensor objects in order to obtain the sizes.
-  void SetOutputSize(const Tensor& input, Tensor* output, int output_channel) {
+  // Tensor<CPUContext> objects in order to obtain the sizes.
+  template <typename AlternativeContext>
+  void SetOutputSize(
+      const Tensor<AlternativeContext>& input,
+      Tensor<AlternativeContext>* output,
+      int output_channel) {
     CAFFE_ENFORCE(input.size() > 0);
     vector<int> output_dims;
     int N = input.dim32(0);
@@ -331,7 +335,7 @@ class ConvPoolOpBase : public Operator<Context> {
         stride_.cbegin(), stride_.cend(), [](const int x) { return x > 1; });
   }
 
-  void SetDeviceTensor(const std::vector<int>& data, Tensor* tensor) {
+  void SetDeviceTensor(const std::vector<int>& data, Tensor<Context>* tensor) {
     bool reset_tensor_device_ = false;
 
     if (tensor->size() != data.size()) {
@@ -354,7 +358,7 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
   template <typename T>
-  void SetBiasMultiplier(const int size, Tensor* bias_multiplier_) {
+  void SetBiasMultiplier(const int size, Tensor<Context>* bias_multiplier_) {
     if (bias_multiplier_->size() != size) {
       // If the helper bias multiplier is not image size, reshape and fill it
       // with one.
@@ -731,9 +735,9 @@ class ConvPoolOpBase : public Operator<Context> {
   }
 
  private:
-  inline void AllocateAndCopy(const vector<int>& vec, Tensor& tensor) {
+  inline void AllocateAndCopy(const vector<int>& vec, Tensor<Context>& tensor) {
     tensor.Resize(vec.size());
-    context_.template CopyFromCPU<int>(
+    context_.template Copy<int, CPUContext, Context>(
         vec.size(), vec.data(), tensor.template mutable_data<int>());
   }
 
diff --git a/caffe2/operators/conv_transpose_op.h b/caffe2/operators/conv_transpose_op.h
index 8204bb02befe4..6dcdbb81b1ced 100644
--- a/caffe2/operators/conv_transpose_op.h
+++ b/caffe2/operators/conv_transpose_op.h
@@ -18,8 +18,8 @@ class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
   bool RunOnDeviceWithOrderNHWC() override;
 
  private:
-  Tensor col_buffer_{Context::GetDeviceType()};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> col_buffer_;
+  Tensor<Context> bias_multiplier_;
   // Input: X, W, b
   // Output: Y
   INPUT_TAGS(INPUT, FILTER, BIAS);
@@ -41,8 +41,8 @@ class ConvTransposeGradientOp final : public ConvTransposeUnpoolBase<Context> {
   bool RunOnDeviceWithOrderNHWC() override;
 
  private:
-  Tensor col_buffer_{Context::GetDeviceType()};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> col_buffer_;
+  Tensor<Context> bias_multiplier_;
   const bool no_bias_;
   // input: X, W, dY
   // output: dW, optionally db and dX
diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h
index 23def95ea9bd1..808433939c785 100644
--- a/caffe2/operators/conv_transpose_op_impl.h
+++ b/caffe2/operators/conv_transpose_op_impl.h
@@ -17,9 +17,9 @@ namespace caffe2 {
 
 template <typename T, class Context>
 bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -59,7 +59,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
   const T* filter_data = filter.template data<T>();
   T* Ydata = Y->template mutable_data<T>();
 
-  auto f = [&](Tensor* col_buffer) {
+  auto f = [&](Tensor<Context>* col_buffer) {
     col_buffer->Resize(
         vector<TIndex>{C, this->kernel_h(), this->kernel_w(), H, W});
     T* col_buffer_data = col_buffer->template mutable_data<T>();
@@ -139,9 +139,9 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
 
 template <typename T, class Context>
 bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
-  const Tensor& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const auto N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -180,7 +180,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
   const T* filter_data = filter.template data<T>();
   T* Ydata = Y->template mutable_data<T>();
 
-  auto f = [&](Tensor* /*col_buffer*/) {
+  auto f = [&](Tensor<Context>* /*col_buffer*/) {
     col_buffer_.Resize(
         vector<TIndex>{H, W, this->kernel_h(), this->kernel_w(), C});
     T* col_buffer_data = col_buffer_.template mutable_data<T>();
diff --git a/caffe2/operators/conv_transpose_op_mobile.h b/caffe2/operators/conv_transpose_op_mobile.h
index 568dd34c1c844..ddfe365e678dc 100644
--- a/caffe2/operators/conv_transpose_op_mobile.h
+++ b/caffe2/operators/conv_transpose_op_mobile.h
@@ -35,7 +35,7 @@ class ConvTransposeMobileOp final : public ConvTransposeUnpoolBase<Context> {
  private:
   // We store a numThreasds per-worker  tiles of Y, and numThreads per-worker threadBuffer for the
   // gemm output, laid out in that order.
-  Tensor threadBuffer_{CPU};
+  TensorCPU threadBuffer_;
 
   // Input: X, W, b
   // Output: Y
diff --git a/caffe2/operators/conv_transpose_op_mobile_impl.h b/caffe2/operators/conv_transpose_op_mobile_impl.h
index c724c907f13c8..d434ec49e3e5b 100644
--- a/caffe2/operators/conv_transpose_op_mobile_impl.h
+++ b/caffe2/operators/conv_transpose_op_mobile_impl.h
@@ -529,9 +529,9 @@ void sumInto(float* acc, std::vector<float*>& toSum, size_t size) {
 
 template <typename T, class Context>
 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
-  Tensor* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -606,7 +606,7 @@ bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
         &context_);
   };
 
-  auto f = [&](Tensor* threadBuffer) {
+  auto f = [&](Tensor<Context>* threadBuffer) {
     threadBuffer->Resize(
         numThreads * threadYBufferSizeAligned +
         numThreads * threadColBufferSize);
diff --git a/caffe2/operators/conv_transpose_op_mobile_test.cc b/caffe2/operators/conv_transpose_op_mobile_test.cc
index da443928a9745..b9282e767d060 100644
--- a/caffe2/operators/conv_transpose_op_mobile_test.cc
+++ b/caffe2/operators/conv_transpose_op_mobile_test.cc
@@ -17,10 +17,11 @@ void AddConstInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
-  math::Set<float, CPUContext>(
-      tensor->size(), value, tensor->template mutable_data<float>(), &context);
+  math::Set<float, CPUContext>(tensor->size(), value,
+                               tensor->mutable_data<float>(),
+                               &context);
 }
 
 void AddNoiseInput(const vector<TIndex>& shape,
@@ -29,15 +30,14 @@ void AddNoiseInput(const vector<TIndex>& shape,
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
-      tensor->size(),
-      0.0f,
-      10.0f,
-      tensor->template mutable_data<float>(),
-      &context);
+    tensor->size(),
+    0.0f, 10.0f,
+    tensor->mutable_data<float>(),
+    &context);
 }
 
 inline float relativeError(float a, float b) {
diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h
index e3e253e150c46..bf2708d22733e 100644
--- a/caffe2/operators/conv_transpose_unpool_op_base.h
+++ b/caffe2/operators/conv_transpose_unpool_op_base.h
@@ -131,7 +131,10 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
     }
   }
   // Sets the output size. The output channel is manually specified.
-  void SetOutputSize(const Tensor& input, Tensor* output, int output_channel) {
+  void SetOutputSize(
+      const Tensor<Context>& input,
+      Tensor<Context>* output,
+      int output_channel) {
     CAFFE_ENFORCE(4 == input.ndim());
     CAFFE_ENFORCE(input.size() > 0);
     int N = input.dim32(0);
diff --git a/caffe2/operators/cosine_embedding_criterion_op.cc b/caffe2/operators/cosine_embedding_criterion_op.cc
index b99f3f33572ea..26f477fe62cd0 100644
--- a/caffe2/operators/cosine_embedding_criterion_op.cc
+++ b/caffe2/operators/cosine_embedding_criterion_op.cc
@@ -18,7 +18,7 @@ bool CosineEmbeddingCriterionOp<CPUContext>::RunOnDevice() {
 
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output_data = output->template mutable_data<float>();
+  float* output_data = output->mutable_data<float>();
   for (int i = 0; i < S.size(); ++i) {
     output_data[i] =
         Ydata[i] == 1 ? (1.f - Sdata[i]) : std::max(0.f, Sdata[i] - margin_);
@@ -38,7 +38,7 @@ bool CosineEmbeddingCriterionGradientOp<CPUContext>::RunOnDevice() {
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
   const float* dOutput_data = dOutput.data<float>();
-  float* dSdata = dS->template mutable_data<float>();
+  float* dSdata = dS->mutable_data<float>();
   for (int i = 0; i < S.size(); ++i) {
     dSdata[i] = dOutput_data[i] *
         (Ydata[i] == 1 ? -1.f : static_cast<float>(Sdata[i] >= margin_));
diff --git a/caffe2/operators/cosine_embedding_criterion_op.cu b/caffe2/operators/cosine_embedding_criterion_op.cu
index e720f95efc683..69a37ff329445 100644
--- a/caffe2/operators/cosine_embedding_criterion_op.cu
+++ b/caffe2/operators/cosine_embedding_criterion_op.cu
@@ -33,8 +33,8 @@ bool CosineEmbeddingCriterionOp<CUDAContext>::RunOnDevice() {
 
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output_data = output->template mutable_data<float>();
-
+  float* output_data = output->mutable_data<float>();
+ 
   CECKernel<<<CAFFE_GET_BLOCKS(S.size()), CAFFE_CUDA_NUM_THREADS,
               0, context_.cuda_stream()>>>(
       S.size(), Sdata, Ydata, margin_, output_data);
@@ -53,7 +53,7 @@ bool CosineEmbeddingCriterionGradientOp<CUDAContext>::RunOnDevice() {
   const float* Sdata = S.data<float>();
   const int* Ydata = Y.data<int>();
   const float* dOutput_data = dOutput.data<float>();
-  float* dSdata = dS->template mutable_data<float>();
+  float* dSdata = dS->mutable_data<float>();
   CECGradientKernel<<<CAFFE_GET_BLOCKS(S.size()), CAFFE_CUDA_NUM_THREADS,
                       0, context_.cuda_stream()>>>(
       S.size(), Sdata, Ydata, dOutput_data, margin_, dSdata);
diff --git a/caffe2/operators/counter_ops.h b/caffe2/operators/counter_ops.h
index cf58b7cd351b2..3b7bf7dd82711 100644
--- a/caffe2/operators/counter_ops.h
+++ b/caffe2/operators/counter_ops.h
@@ -75,7 +75,7 @@ class ResetCounterOp final : public Operator<Context> {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
     auto previous = counterPtr->reset(init_count_);
     if (OutputSize() == 1) {
-      auto* output = Output(0);
+      auto* output = OperatorBase::Output<TensorCPU>(0);
       output->Resize();
       *output->template mutable_data<T>() = previous;
     }
@@ -96,7 +96,7 @@ class CountDownOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = Output(0);
+    auto* output = OperatorBase::Output<TensorCPU>(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<bool>() = counterPtr->countDown();
     return true;
@@ -113,7 +113,7 @@ class CheckCounterDoneOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = Output(0);
+    auto* output = OperatorBase::Output<TensorCPU>(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<bool>() = counterPtr->checkIfDone();
     return true;
@@ -130,7 +130,7 @@ class CountUpOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = Output(0);
+    auto* output = OperatorBase::Output<TensorCPU>(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<T>() = counterPtr->countUp();
     return true;
@@ -147,7 +147,7 @@ class RetrieveCountOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
-    auto* output = Output(0);
+    auto* output = OperatorBase::Output<TensorCPU>(0);
     output->Resize(std::vector<int>{});
     *output->template mutable_data<T>() = counterPtr->retrieve();
     return true;
diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc
index 584b7abd5a183..c288eb7be69d8 100644
--- a/caffe2/operators/cross_entropy_op.cc
+++ b/caffe2/operators/cross_entropy_op.cc
@@ -56,7 +56,7 @@ bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(N);
   const auto* Xdata = X.data<float>();
   const auto* labelData = label.data<int>();
-  auto* Ydata = Y->template mutable_data<float>();
+  auto* Ydata = Y->mutable_data<float>();
   CAFFE_ENFORCE(
       (ConstEigenVectorArrayMap<int>(labelData, N) < D).all() &&
           (ConstEigenVectorArrayMap<int>(labelData, N) >= 0).all(),
@@ -85,7 +85,7 @@ bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -123,7 +123,7 @@ bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -167,7 +167,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -201,7 +201,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -241,12 +241,12 @@ bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(dY.ndim(), 1);
   CAFFE_ENFORCE_EQ(dY.dim32(0), N);
   dX->ResizeLike(X);
-  math::Set<float, CPUContext>(
-      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
+  math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data<float>(),
+                               &context_);
   const float* Xdata = X.data<float>();
   const float* dYdata = dY.data<float>();
   const int* labelData = label.data<int>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   for (int i = 0; i < N; ++i) {
     dXdata[i * D + labelData[i]] =
         - dYdata[i] / std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD());
@@ -263,7 +263,7 @@ bool MakeTwoClassOp<float, CPUContext>::RunOnDevice() {
   TIndex N = X.size();
   Y->Resize(shape);
   const auto* Xdata = X.data<float>();
-  auto* Ydata = Y->template mutable_data<float>();
+  auto* Ydata = Y->mutable_data<float>();
   for (TIndex i = 0; i < N; ++i) {
     DCHECK_GE(Xdata[i], 0.0);
     DCHECK_LE(Xdata[i], 1.0);
@@ -283,7 +283,7 @@ bool MakeTwoClassGradientOp<float, CPUContext>::RunOnDevice() {
   shape.pop_back();
   dX->Resize(shape);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   TIndex N = dX->size();
   // use eigen?
   for (TIndex i = 0; i < N; ++i) {
@@ -311,7 +311,7 @@ bool CrossEntropyOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(vector<TIndex>{N});
   const float* Xdata = X.data<float>();
   const float* labelData = label.data<float>();
-  auto* Ydata = Y->template mutable_data<float>();
+  auto* Ydata = Y->mutable_data<float>();
   CAFFE_ENFORCE(
       (ConstEigenArrayMap<float>(labelData, D, N) <= 1.0f).all() &&
           (ConstEigenArrayMap<float>(labelData, D, N) >= 0.0f).all(),
@@ -350,11 +350,11 @@ bool CrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(dY.dim32(0), N);
   dX->ResizeLike(X);
   math::Set<float, CPUContext>(
-      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
+    dX->size(), 0.f, dX->mutable_data<float>(), &context_);
   const float* Xdata = X.data<float>();
   const float* dYdata = dY.data<float>();
   const float* labelData = label.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   EigenArrayMap<float>(dXdata, D, N) =
       (ConstEigenArrayMap<float>(labelData, D, N) /
        ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()))
diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu
index cab3c9692a42f..70bfbe4e9e3bc 100644
--- a/caffe2/operators/cross_entropy_op.cu
+++ b/caffe2/operators/cross_entropy_op.cu
@@ -43,17 +43,10 @@ bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
       (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
   CAFFE_ENFORCE_EQ(label.dim32(0), N);
   Y->Resize(vector<TIndex>(size_t(1), N));
-  LabelCrossEntropyKernel<<<
-      CAFFE_GET_BLOCKS(N),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N,
-      D,
-      X.data<float>(),
-      label.data<int>(),
-      kLOG_THRESHOLD(),
-      Y->template mutable_data<float>());
+  LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
+                            0, context_.cuda_stream()>>>(
+      N, D, X.data<float>(), label.data<int>(), kLOG_THRESHOLD(),
+      Y->mutable_data<float>());
   return true;
 }
 
@@ -78,19 +71,11 @@ bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(dY.dim32(0), N);
   dX->ResizeLike(X);
   math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
-  LabelCrossEntropyGradientKernel<<<
-      CAFFE_GET_BLOCKS(N),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N,
-      D,
-      X.data<float>(),
-      label.data<int>(),
-      dY.data<float>(),
-      kLOG_THRESHOLD(),
-      dX->template mutable_data<float>());
+      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
+  LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
+                                    0, context_.cuda_stream()>>>(
+      N, D, X.data<float>(), label.data<int>(), dY.data<float>(),
+      kLOG_THRESHOLD(), dX->mutable_data<float>());
   return true;
 }
 
@@ -119,12 +104,9 @@ bool MakeTwoClassOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_LT(X.size(), std::numeric_limits<int>::max() / 2);
   Y->Resize(shape);
   int N = X.size();
-  MakeTwoClassKernel<<<
-      CAFFE_GET_BLOCKS(N),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N, X.data<float>(), Y->template mutable_data<float>());
+  MakeTwoClassKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
+                       0, context_.cuda_stream()>>>(
+      N, X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
@@ -139,12 +121,9 @@ bool MakeTwoClassGradientOp<float, CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_LT(dY.size(), std::numeric_limits<int>::max());
   dX->Resize(shape);
   int N = dX->size();
-  MakeTwoClassGradientKernel<<<
-      CAFFE_GET_BLOCKS(N),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N, dY.data<float>(), dX->template mutable_data<float>());
+  MakeTwoClassGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
+                               0, context_.cuda_stream()>>>(
+      N, dY.data<float>(), dX->mutable_data<float>());
   return true;
 }
 
@@ -255,7 +234,7 @@ bool SigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::RunOnDevice() {
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -293,7 +272,7 @@ bool SigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>::
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -377,7 +356,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CUDAContext>::
     std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
     out->Resize(dims);
   }
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
@@ -407,7 +386,7 @@ bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CUDAContext>::
 
   auto* out = Output(0);
   out->ResizeLike(logits);
-  auto* out_ptr = out->template mutable_data<float>();
+  auto* out_ptr = out->mutable_data<float>();
 
   auto* logits_ptr = logits.data<float>();
   auto* targets_ptr = targets.data<float>();
diff --git a/caffe2/operators/ctc_beam_search_decoder_op.cc b/caffe2/operators/ctc_beam_search_decoder_op.cc
index e299950e9d946..9dd426978b257 100644
--- a/caffe2/operators/ctc_beam_search_decoder_op.cc
+++ b/caffe2/operators/ctc_beam_search_decoder_op.cc
@@ -4,7 +4,8 @@ namespace caffe2 {
 
 namespace {
 
-const float* getTensorDataPtr(const Tensor& tensor, int t, int n) {
+template <class Context>
+const float* getTensorDataPtr(const Tensor<Context>& tensor, int t, int n) {
   const auto& dims = tensor.dims();
   CAFFE_ENFORCE_EQ(dims.size(), 3);
   int offset = (t * dims[1] + n) * dims[2];
diff --git a/caffe2/operators/ctc_greedy_decoder_op.cc b/caffe2/operators/ctc_greedy_decoder_op.cc
index 8a5e0932defd6..1a9c415aac74b 100644
--- a/caffe2/operators/ctc_greedy_decoder_op.cc
+++ b/caffe2/operators/ctc_greedy_decoder_op.cc
@@ -4,7 +4,8 @@ namespace caffe2 {
 
 namespace {
 
-const float* getTensorDataPtr(const Tensor& tensor, int t, int n) {
+template <class Context>
+const float* getTensorDataPtr(const Tensor<Context>& tensor, int t, int n) {
   const auto& dims = tensor.dims();
   CAFFE_ENFORCE_EQ(dims.size(), 3);
   int offset = (t * dims[1] + n) * dims[2];
@@ -33,7 +34,7 @@ bool CTCGreedyDecoderOp<CPUContext>::RunOnDevice() {
 
   vector<int> values_cach;
   output_len->Resize(vector<TIndex>{batch_size});
-  int* output_len_data = output_len->template mutable_data<int>();
+  int* output_len_data = output_len->mutable_data<int>();
 
   for (int32_t i = 0; i < batch_size; ++i) {
     int previous_label = 0, t_dec = 0;
diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc
index cb7e108748a6d..e7291476e6076 100644
--- a/caffe2/operators/dataset_ops.cc
+++ b/caffe2/operators/dataset_ops.cc
@@ -12,7 +12,7 @@
 namespace caffe2 {
 
 CAFFE_KNOWN_TYPE(std::unique_ptr<dataset_ops::TreeCursor>);
-CAFFE_KNOWN_TYPE(dataset_ops::TensorVectorPtr);
+CAFFE_KNOWN_TYPE(dataset_ops::TensorVectorPtr<CPUContext>);
 CAFFE_KNOWN_TYPE(dataset_ops::SharedTensorVectorPtr);
 
 namespace dataset_ops {
@@ -215,7 +215,7 @@ class GetCursorOffsetOp : public Operator<CPUContext> {
   bool RunOnDevice() override {
     auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
     Output(0)->Resize(cursor->offsets.size());
-    auto* output = Output(0)->template mutable_data<int>();
+    auto* output = Output(0)->mutable_data<int>();
     for (size_t i = 0; i < cursor->offsets.size(); ++i) {
       output[i] = cursor->offsets[i];
     }
@@ -314,16 +314,16 @@ class PackRecordsOp : public Operator<CPUContext> {
     Output(0)->Resize(walker.size());
 
     // Output(0)->raw_mutable_data(TypeMeta::Make<SharedTensorVectorPtr>()));
-    auto* dst = Output(0)->template mutable_data<SharedTensorVectorPtr>();
+    auto* dst = Output(0)->mutable_data<SharedTensorVectorPtr>();
 
     for (int batchId = 0; batchId < walker.size(); ++batchId) {
       dst[batchId] = std::make_shared<std::vector<TensorCPU>>();
       dst[batchId]->reserve(walker.fields().size());
 
       for (const auto& field : walker.fields()) {
-        dst[batchId]->emplace_back(field.dim(), CPU);
+        dst[batchId]->emplace_back(field.dim());
         auto& tensor = dst[batchId]->back();
-        context_.CopyItemsSameDevice(
+        context_.template CopyItems<CPUContext, CPUContext>(
             field.meta(),
             tensor.size(),
             field.ptr() /* src */,
@@ -397,7 +397,7 @@ class UnPackRecordsOp : public Operator<CPUContext> {
       for (int j = 0; j < numTensors; ++j) {
         const auto& input = inputs[i]->at(j);
 
-        context_.CopyItemsSameDevice(
+        context_.CopyItems<CPUContext, CPUContext>(
             *metas[j],
             input.size(),
             input.raw_data() /* src */,
@@ -518,7 +518,8 @@ class ReadNextBatchOp : public Operator<CPUContext> {
       if (out->size() == 0) {
         continue;
       }
-      context_.CopyItemsSameDevice(in.meta(), out->size(), src, dst);
+      context_.template CopyItems<CPUContext, CPUContext>(
+          in.meta(), out->size(), src, dst);
     }
     return true;
   }
@@ -559,7 +560,7 @@ class ComputeOffsetOp : public Operator<CPUContext> {
           std::min(limits[lengthFieldIdx], (TOffset)Input(i + 1).dims()[0]);
     }
     out->Resize(limits.at(0) + 1, sizes.size());
-    auto* out_data = out->template mutable_data<int64_t>();
+    auto* out_data = out->mutable_data<int64_t>();
     for (int k = 0; k <= limits.at(0); k++) {
       // advance cursor
       if (cursor->offsets.empty()) {
@@ -608,7 +609,7 @@ class SortAndShuffleOp : public Operator<CPUContext> {
     int num_batch = size / batch_size_;
     auto* out = Output(0);
     out->Resize(size);
-    auto* out_data = out->template mutable_data<int64_t>();
+    auto* out_data = out->mutable_data<int64_t>();
 
     vector<int> shuffle_idx(size);
     iota(shuffle_idx.begin(), shuffle_idx.end(), 0);
@@ -738,7 +739,7 @@ class ReadRandomBatchOp : public Operator<CPUContext> {
         auto size = *(offsetptr + offsetdim[1]) - offset;
         // copy data
         auto src = src_base + offset * block_bytesize;
-        context_.CopyItemsSameDevice(
+        context_.template CopyItems<CPUContext, CPUContext>(
             in.meta(), size * block_size, src, dst + start * block_bytesize);
         start += size;
         idx++;
@@ -778,7 +779,8 @@ class AppendOp final : public Operator<Context> {
     auto oldSize = c->size();
     c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
     auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
-    context_.CopyItemsSameDevice(b.meta(), b.size(), b.raw_data(), dst);
+    context_.template CopyItems<Context, Context>(
+        b.meta(), b.size(), b.raw_data(), dst);
     return true;
   }
 };
@@ -828,7 +830,8 @@ class AtomicAppendOp final : public Operator<Context> {
       auto oldSize = c->size();
       c->Extend(b.dims()[0], kDatasetGrowthPct, &context_);
       auto* dst = (char*)c->raw_mutable_data() + oldSize * b.meta().itemsize();
-      context_.CopyItemsSameDevice(b.meta(), b.size(), b.raw_data(), dst);
+      context_.template CopyItems<Context, Context>(
+          b.meta(), b.size(), b.raw_data(), dst);
     }
     return true;
   }
@@ -841,8 +844,9 @@ class CreateTensorVectorOp final : public Operator<Context> {
   using Operator<Context>::Operator;
 
   bool RunOnDevice() override {
-    auto ptr = make_unique<std::vector<Tensor>>();
-    *OperatorBase::Output<TensorVectorPtr>(TENSOR_VECTOR) = std::move(ptr);
+    auto ptr = make_unique<std::vector<Tensor<Context>>>();
+    *OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR) =
+        std::move(ptr);
     return true;
   }
 
@@ -857,7 +861,8 @@ class TensorVectorSizeOp final : public Operator<Context> {
   USE_SIMPLE_CTOR_DTOR(TensorVectorSizeOp);
 
   bool RunOnDevice() override {
-    auto& vector_ptr = OperatorBase::Input<TensorVectorPtr>(TENSOR_VECTOR);
+    auto& vector_ptr =
+        OperatorBase::Input<TensorVectorPtr<Context>>(TENSOR_VECTOR);
     auto* size = Output(SIZE);
     size->Resize();
     // 32-bit should be enough here
@@ -877,8 +882,8 @@ class ConcatTensorVectorOp final : public Operator<Context> {
   using Operator<Context>::Operator;
 
   bool RunOnDevice() override {
-    const TensorVectorPtr& tensorVector =
-        OperatorBase::Input<TensorVectorPtr>(TENSOR_VECTOR);
+    const TensorVectorPtr<Context>& tensorVector =
+        OperatorBase::Input<TensorVectorPtr<Context>>(TENSOR_VECTOR);
 
     auto* tensor = Output(TENSOR);
     CAFFE_ENFORCE(!tensorVector->empty());
@@ -899,7 +904,7 @@ class ConcatTensorVectorOp final : public Operator<Context> {
     auto* dst = (char*)tensor->raw_mutable_data(tensorVector->at(0).meta());
 
     for (const auto& t : *tensorVector) {
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           t.meta(), t.size(), t.raw_data(), dst + offset);
       offset += t.nbytes();
     }
@@ -942,7 +947,8 @@ class CollectTensorOp final : public Operator<Context> {
 
     for (int i = 0; i < OutputSize(); ++i) {
       // TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
-      TensorVectorPtr& tensorVector = *OperatorBase::Output<TensorVectorPtr>(i);
+      TensorVectorPtr<Context>& tensorVector =
+          *OperatorBase::Output<TensorVectorPtr<Context>>(i);
 
       if (numVisited_ >= numToCollect_) {
         CAFFE_ENFORCE(
@@ -960,11 +966,13 @@ class CollectTensorOp final : public Operator<Context> {
         CAFFE_ENFORCE(numVisited_ >= numToCollect_);
       } else if (pos >= tensorVector->size()) {
         // append
-        tensorVector->emplace_back(Context::GetDeviceType());
-        tensorVector->back().CopyFrom(tensor, &context_);
+        tensorVector->push_back(Tensor<Context>());
+        tensorVector->back().template CopyFrom<Context, Context>(
+            tensor, &context_);
       } else {
         // replace
-        tensorVector->at(pos).CopyFrom(tensor, &context_);
+        tensorVector->at(pos).template CopyFrom<Context, Context>(
+            tensor, &context_);
       }
     }
 
@@ -1428,13 +1436,13 @@ class TreeCursorSerializer : public BlobSerializerBase {
     // serialize offsets as a tensor
     if (cursor->offsets.size() > 0) {
       Blob offsets_blob;
-      auto* offsets = offsets_blob.GetMutableTensor(CPU);
+      auto* offsets = offsets_blob.template GetMutable<Tensor<CPUContext>>();
       offsets->Resize(cursor->offsets.size());
       std::copy(
           cursor->offsets.begin(),
           cursor->offsets.end(),
-          offsets->template mutable_data<TOffset>());
-      TensorSerializer ser;
+          offsets->mutable_data<TOffset>());
+      TensorSerializer<CPUContext> ser;
       ser.Serialize(
           *offsets, name, blob_proto.mutable_tensor(), 0, offsets->size());
     }
@@ -1456,10 +1464,10 @@ class TreeCursorDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override {
     // deserialize the offsets
-    TensorDeserializer deser;
+    TensorDeserializer<CPUContext> deser;
     Blob offset_blob;
     deser.Deserialize(proto, &offset_blob);
-    auto& offsets = offset_blob.template Get<Tensor>();
+    auto& offsets = offset_blob.template Get<Tensor<CPUContext>>();
     auto* offsets_ptr = offsets.data<TOffset>();
 
     // deserialize the field names
diff --git a/caffe2/operators/dataset_ops.h b/caffe2/operators/dataset_ops.h
index 809e570ba3c00..161a82b0d101f 100644
--- a/caffe2/operators/dataset_ops.h
+++ b/caffe2/operators/dataset_ops.h
@@ -191,7 +191,8 @@ class TreeWalker {
 
 using SharedTensorVectorPtr = std::shared_ptr<std::vector<TensorCPU>>;
 
-using TensorVectorPtr = std::unique_ptr<std::vector<Tensor>>;
+template <class Context>
+using TensorVectorPtr = std::unique_ptr<std::vector<Tensor<Context>>>;
 
 class SharedTensorVectorPtrSerializer : public BlobSerializerBase {
  public:
diff --git a/caffe2/operators/deform_conv_op.h b/caffe2/operators/deform_conv_op.h
index a0e4882abdbce..56b4d3228b1db 100644
--- a/caffe2/operators/deform_conv_op.h
+++ b/caffe2/operators/deform_conv_op.h
@@ -70,10 +70,10 @@ class DeformConvOp final : public DeformConvOpBase<T, Context> {
   bool RunOnDeviceWithOrderNCHW() override;
 
  private:
-  Tensor col_buffer_{Context::GetDeviceType()};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
-  Tensor img_shape_device_{Context::GetDeviceType()};
-  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
+  Tensor<Context> col_buffer_;
+  Tensor<Context> bias_multiplier_;
+  Tensor<Context> img_shape_device_;
+  Tensor<Context> col_buffer_shape_device_;
   // Input: X, o, W, b
   // Output: Y
   INPUT_TAGS(INPUT, OFFSET, FILTER, BIAS);
@@ -96,10 +96,10 @@ class DeformConvGradientOp final : public DeformConvOpBase<T, Context> {
   bool RunOnDeviceWithOrderNCHW() override;
 
  private:
-  Tensor col_buffer_{Context::GetDeviceType()};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
-  Tensor img_shape_device_{Context::GetDeviceType()};
-  Tensor col_buffer_shape_device_{Context::GetDeviceType()};
+  Tensor<Context> col_buffer_;
+  Tensor<Context> bias_multiplier_;
+  Tensor<Context> img_shape_device_;
+  Tensor<Context> col_buffer_shape_device_;
   bool no_bias_;
   // input: X, W, dY
   // output: dO, dW, db, and optionally dX
diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h
index 5d84d5905fd9a..072c156cf5bae 100644
--- a/caffe2/operators/deform_conv_op_impl.h
+++ b/caffe2/operators/deform_conv_op_impl.h
@@ -14,10 +14,10 @@ namespace caffe2 {
 
 template <typename T, class Context>
 bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  const Tensor& X = Input(INPUT);
-  const Tensor& offset = Input(OFFSET);
+  const Tensor<Context>& X = Input(INPUT);
+  const Tensor<Context>& offset = Input(OFFSET);
   auto& filter = Input(FILTER);
-  Tensor* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), C = X.dim32(1);
   CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
   const int M = filter.dim32(0);
@@ -133,7 +133,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
     bias_data = Input(BIAS).template data<T>();
   }
 
-  auto f = [&](Tensor* col_buffer) {
+  auto f = [&](Tensor<Context>* col_buffer) {
     col_buffer->Resize(buffer_shape);
     T* col_buffer_data = col_buffer->template mutable_data<T>();
     // Im2col, followed by gemm.
diff --git a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu
index e514e5e2ed6fc..6868a482ff121 100644
--- a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu
+++ b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu
@@ -286,9 +286,9 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CUDAContext> {
   }
 
   bool RunOnDeviceWithOrderNCHW() override {
-    const Tensor& X = Input(0);
+    const Tensor<CUDAContext>& X = Input(0);
     auto& filter = Input(1);
-    Tensor* Y = Output(0);
+    Tensor<CUDAContext>* Y = Output(0);
     const int N = X.dim32(0), C = X.dim32(1);
     CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
     const int M = filter.dim32(0);
diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc
index 448172a5f8699..4e00cd4396726 100644
--- a/caffe2/operators/distance_op.cc
+++ b/caffe2/operators/distance_op.cc
@@ -15,7 +15,7 @@ bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
   int N = X.ndim() > 0 ? X.dim32(0) : 1;
   distance->Resize(N);
   int D = N > 0 ? X.size() / N : 0;
-  float* distance_data = distance->template mutable_data<float>();
+  float* distance_data = distance->mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   for (int i = 0; i < N; ++i) {
@@ -48,7 +48,7 @@ bool L1DistanceOp<float, CPUContext>::RunOnDevice() {
   const float* Y_data = Y.data<float>();
 
   for (int i = 0; i < N; ++i) {
-    (distance->template mutable_data<float>())[i] =
+    (distance->mutable_data<float>())[i] =
         (ConstEigenVectorMap<float>(X_data + i * D, D).array() -
          ConstEigenVectorMap<float>(Y_data + i * D, D).array())
             .abs()
@@ -86,18 +86,14 @@ bool L1DistanceGradientOp<float, CPUContext>::RunOnDevice() {
           (X.data<float>())[offset + j] - (Y.data<float>())[offset + j];
       const float kEps = 1e-12f;
       if (temp < -kEps) {
-        dX->template mutable_data<float>()[offset + j] =
-            -(dDistance.data<float>())[i];
-        dY->template mutable_data<float>()[offset + j] =
-            (dDistance.data<float>())[i];
+        dX->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
+        dY->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
       } else if (temp > kEps) {
-        dX->template mutable_data<float>()[offset + j] =
-            (dDistance.data<float>())[i];
-        dY->template mutable_data<float>()[offset + j] =
-            -(dDistance.data<float>())[i];
+        dX->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
+        dY->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
       } else {
-        dX->template mutable_data<float>()[offset + j] = 0;
-        dY->template mutable_data<float>()[offset + j] = 0;
+        dX->mutable_data<float>()[offset + j] = 0;
+        dY->mutable_data<float>()[offset + j] = 0;
       }
     }
   }
@@ -116,7 +112,7 @@ bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
   const int N = X.ndim() > 0 ? X.dim32(0) : 1;
   const int D = X.size_from_dim(1);
   result->Resize(N);
-  float* result_data = result->template mutable_data<float>();
+  float* result_data = result->mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   float X2, Y2;
@@ -312,7 +308,7 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
   D = std::min(DX, DY);
   restD = std::max(DX, DY) - D;
   result->Resize(N);
-  float* result_data = result->template mutable_data<float>();
+  float* result_data = result->mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   for (int i = 0; i < N; ++i) { // TODO: multithreading
diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu
index d9ecad7f329fe..e1a56399a2f94 100644
--- a/caffe2/operators/distance_op.cu
+++ b/caffe2/operators/distance_op.cu
@@ -55,11 +55,7 @@ bool SquaredL2DistanceOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      N,
-      D,
-      X.data<float>(),
-      Y.data<float>(),
-      distance->template mutable_data<float>());
+      N, D, X.data<float>(), Y.data<float>(), distance->mutable_data<float>());
   return true;
 }
 
@@ -101,27 +97,23 @@ bool SquaredL2DistanceGradientOp<float, CUDAContext>::RunOnDevice() {
       X.size(),
       X.data<float>(),
       Y.data<float>(),
-      dX->template mutable_data<float>(),
+      dX->mutable_data<float>(),
       &context_);
 
-  StripedScaleKernel<float>
-      <<<CAFFE_GET_BLOCKS(N * D),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          N,
-          D,
-          dDistance.data<float>(),
-          dX->data<float>(),
-          dX->template mutable_data<float>());
+  StripedScaleKernel<float><<<
+      CAFFE_GET_BLOCKS(N * D),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      N,
+      D,
+      dDistance.data<float>(),
+      dX->data<float>(),
+      dX->mutable_data<float>());
 
   // The gradient of the other side is basically the negative.
   math::Scale<float, CUDAContext>(
-      X.size(),
-      -1,
-      dX->data<float>(),
-      dY->template mutable_data<float>(),
-      &context_);
+      X.size(), -1, dX->data<float>(), dY->mutable_data<float>(), &context_);
   return true;
 }
 
@@ -170,11 +162,7 @@ bool L1DistanceOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      N,
-      D,
-      X.data<float>(),
-      Y.data<float>(),
-      distance->template mutable_data<float>());
+      N, D, X.data<float>(), Y.data<float>(), distance->mutable_data<float>());
 
   return true;
 }
@@ -240,8 +228,8 @@ bool L1DistanceGradientOp<float, CUDAContext>::RunOnDevice() {
       X.data<float>(),
       Y.data<float>(),
       dDistance.data<float>(),
-      dX->template mutable_data<float>(),
-      dY->template mutable_data<float>());
+      dX->mutable_data<float>(),
+      dY->mutable_data<float>());
 
   return true;
 }
@@ -312,7 +300,7 @@ bool CosineSimilarityOp<float, CUDAContext>::RunOnDevice() {
   const int N = X.ndim() > 0 ? X.dim32(0) : 1;
   const int D = X.size_from_dim(1);
   result->Resize(N);
-  float* result_data = result->template mutable_data<float>();
+  float* result_data = result->mutable_data<float>();
   const float* X_data = X.data<float>();
   const float* Y_data = Y.data<float>();
   // Auxiliary arrays, one allocation of memory
@@ -367,8 +355,8 @@ bool CosineSimilarityGradientOp<float, CUDAContext>::RunOnDevice() {
   const auto* X_data = X.data<float>();
   const auto* Y_data = Y.data<float>();
   const auto* dCos_data = dCos.data<float>();
-  auto* dX_data = dX->template mutable_data<float>();
-  auto* dY_data = dY->template mutable_data<float>();
+  auto* dX_data = dX->mutable_data<float>();
+  auto* dY_data = dY->mutable_data<float>();
 
   // one memory allocation, a few arrays
   aux_.Resize(6 * N);
@@ -466,11 +454,7 @@ bool DotProductOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      N,
-      D,
-      X.data<float>(),
-      Y.data<float>(),
-      result->template mutable_data<float>());
+      N, D, X.data<float>(), Y.data<float>(), result->mutable_data<float>());
 
   return true;
 }
@@ -526,8 +510,8 @@ bool DotProductGradientOp<float, CUDAContext>::RunOnDevice() {
       X.data<float>(),
       Y.data<float>(),
       dDot.data<float>(),
-      dX->template mutable_data<float>(),
-      dY->template mutable_data<float>());
+      dX->mutable_data<float>(),
+      dY->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/distance_op.h b/caffe2/operators/distance_op.h
index be95b58145d1a..aad57e955dac2 100644
--- a/caffe2/operators/distance_op.h
+++ b/caffe2/operators/distance_op.h
@@ -157,7 +157,7 @@ class CosineSimilarityOp : public Operator<Context> {
   OUTPUT_TAGS(COS_OUT);
 
  private:
-  Tensor aux_{Context::GetDeviceType()};
+  Tensor<Context> aux_;
 };
 
 template <typename T, class Context>
@@ -174,7 +174,7 @@ class CosineSimilarityGradientOp final : public Operator<Context> {
   OUTPUT_TAGS(DER_X_OUT, DER_Y_OUT);
 
  private:
-  Tensor aux_{Context::GetDeviceType()};
+  Tensor<Context> aux_;
 };
 
 template <typename T, class Context>
diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc
index be5b8224aa40c..bd9178e9f4ae2 100644
--- a/caffe2/operators/dropout_op.cc
+++ b/caffe2/operators/dropout_op.cc
@@ -9,8 +9,8 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(X.dims());
   if (is_test_) {
     if (Y != &X) {
-      context_.CopyFromCPU<float>(
-          X.size(), X.data<float>(), Y->template mutable_data<float>());
+      context_.Copy<float, CPUContext, CPUContext>(
+          X.size(), X.data<float>(), Y->mutable_data<float>());
     }
     return true;
   } else {
@@ -19,10 +19,10 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
     // generate probability depending on 1-ratio.
     std::bernoulli_distribution dist(1. - ratio_);
     const float* Xdata = X.data<float>();
-    float* Ydata = Y->template mutable_data<float>();
+    float* Ydata = Y->mutable_data<float>();
     auto mask = Output(1);
     mask->Resize(X.dims());
-    bool* mask_data = mask->template mutable_data<bool>();
+    bool* mask_data = mask->mutable_data<bool>();
     auto& gen = context_.RandGenerator();
     for (int i = 0; i < X.size(); ++i) {
       mask_data[i] = dist(gen);
@@ -39,8 +39,8 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
   dX->Resize(dY.dims());
   if (is_test_) {
     if (dX != &dY) {
-      context_.CopyFromCPU<float>(
-          dY.size(), dY.data<float>(), dX->template mutable_data<float>());
+      context_.Copy<float, CPUContext, CPUContext>(
+          dY.size(), dY.data<float>(), dX->mutable_data<float>());
     }
     return true;
   } else {
@@ -48,7 +48,7 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
     CAFFE_ENFORCE_EQ(dY.size(), mask.size());
     const float* dYdata = dY.data<float>();
     const bool* mask_data = mask.data<bool>();
-    float* dXdata = dX->template mutable_data<float>();
+    float* dXdata = dX->mutable_data<float>();
     float scale = 1. / (1. - ratio_);
     for (int i = 0; i < dY.size(); ++i) {
       dXdata[i] = dYdata[i] * mask_data[i] * scale;
@@ -144,9 +144,7 @@ mask: [[False False False  True  True]
 </details>
 
 )DOC")
-    .Arg(
-        "ratio",
-        "*(type: float; default: 0.5)* Probability of an element to be zeroed.")
+    .Arg("ratio", "*(type: float; default: 0.5)* Probability of an element to be zeroed.")
     .ArgIsTest(
         "*(type: int; default: 0)* If zero (train mode), perform dropout. If non-zero"
         "(test mode), Y = X.")
@@ -156,7 +154,7 @@ mask: [[False False False  True  True]
         1,
         "mask",
         "*(type: Tensor`<bool>`)* The output mask containing boolean values for"
-        "each element, signifying which elements are dropped out. If `is_test` is"
+        "each element, signifying which elements are dropped out. If `is_test` is" 
         "nonzero, this output is not filled.")
     .InheritOnnxSchema("Dropout");
 
diff --git a/caffe2/operators/dropout_op.cu b/caffe2/operators/dropout_op.cu
index 6489ada5927ba..745840e82affc 100644
--- a/caffe2/operators/dropout_op.cu
+++ b/caffe2/operators/dropout_op.cu
@@ -25,15 +25,15 @@ bool DropoutOp<float, CUDAContext>::RunOnDevice() {
   Y->Resize(X.dims());
   if (is_test_) {
     if (Y != &X) {
-      context_.CopySameDevice<float>(
-          X.size(), X.data<float>(), Y->template mutable_data<float>());
+      context_.Copy<float, CUDAContext, CUDAContext>(
+          X.size(), X.data<float>(), Y->mutable_data<float>());
     }
     return true;
   } else {
     // We do a simple trick here: since curand cannot generate random
     // boolean numbers, we will generate into dY and write the result to
     // mask.
-    float* Ydata = Y->template mutable_data<float>();
+    float* Ydata = Y->mutable_data<float>();
     auto* mask = Output(1);
     mask->Resize(X.dims());
     CAFFE_ENFORCE(X.data<float>() != Ydata, "In-place GPU dropout is broken");
@@ -44,11 +44,7 @@ bool DropoutOp<float, CUDAContext>::RunOnDevice() {
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
-        X.size(),
-        ratio_,
-        X.data<float>(),
-        Ydata,
-        mask->template mutable_data<bool>());
+        X.size(), ratio_, X.data<float>(), Ydata, mask->mutable_data<bool>());
     return true;
   }
 }
@@ -73,8 +69,8 @@ bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
   dX->Resize(dY.dims());
   if (is_test_) {
     if (dX != &dY) {
-      context_.CopySameDevice<float>(
-          dY.size(), dY.data<float>(), dX->template mutable_data<float>());
+      context_.Copy<float, CUDAContext, CUDAContext>(
+          dY.size(), dY.data<float>(), dX->mutable_data<float>());
     }
     return true;
   } else {
@@ -90,7 +86,7 @@ bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
         dY.data<float>(),
         mask.data<bool>(),
         scale,
-        dX->template mutable_data<float>());
+        dX->mutable_data<float>());
     return true;
   }
 }
diff --git a/caffe2/operators/dropout_op_cudnn.cc b/caffe2/operators/dropout_op_cudnn.cc
index 8e66f03aa6d14..906c2563aa0e0 100644
--- a/caffe2/operators/dropout_op_cudnn.cc
+++ b/caffe2/operators/dropout_op_cudnn.cc
@@ -141,7 +141,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
   // now actually run the computation
   if (is_test_) {
     if (Y != &X) {
-      context_.CopySameDevice<T>(
+      context_.Copy<T, CUDAContext, CUDAContext>(
           X.size(), X.template data<T>(), Y->template mutable_data<T>());
     }
     return true;
@@ -150,7 +150,8 @@ bool CuDNNDropoutOp::DoRunWithType() {
     // Reshape tensor descriptors if necessary
     if (X.dims() != cudnn_input_dims_ && !is_test_) {
       CAFFE_ENFORCE(scratch_blob_);
-      Tensor* states = scratch_blob_->GetMutableTensor(CUDA);
+      Tensor<CUDAContext>* states =
+          scratch_blob_->GetMutable<Tensor<CUDAContext>>();
       cudnn_input_dims_ = X.dims();
       CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
           data_desc_,
@@ -171,7 +172,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
       if (!states_initialized_) {
         // set the dropout descriptor (note: need to allocate the states data
         // before acquiring the mutex)
-        uint8_t* states_data = states->template mutable_data<uint8_t>();
+        uint8_t* states_data = states->mutable_data<uint8_t>();
         {
           // Need to protect  as clashes with NCCL
           std::lock_guard<std::mutex> lk(CUDAContext::mutex());
@@ -194,7 +195,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
         X.template data<T>(),
         data_desc_,
         Y->template mutable_data<T>(),
-        mask->template mutable_data<uint8_t>(),
+        mask->mutable_data<uint8_t>(),
         reserve_space_size_in_bytes_));
   }
   return true;
@@ -218,7 +219,7 @@ template <typename T, typename M>
 bool CuDNNDropoutGradientOp::DoRunWithType() {
   const auto& dY = Input(0);
   const auto& mask = Input(1);
-  const Tensor& states = scratch_blob_->Get<Tensor>();
+  const Tensor<CUDAContext>& states = scratch_blob_->Get<Tensor<CUDAContext>>();
   auto* dX = Output(0);
 
   auto size_prod = 1;
diff --git a/caffe2/operators/elementwise_linear_op.cc b/caffe2/operators/elementwise_linear_op.cc
index d68bfbc5a0eb9..e935136905fba 100644
--- a/caffe2/operators/elementwise_linear_op.cc
+++ b/caffe2/operators/elementwise_linear_op.cc
@@ -23,7 +23,7 @@ bool ElementwiseLinearOp<float, CPUContext>::RunOnDevice(){
   const float* X_data = X.data<float>();
   const float* a_data = a.data<float>();
   const float* b_data = b.data<float>();
-  float* Y_data = Y->template mutable_data<float>();
+  float* Y_data = Y->mutable_data<float>();
 
   int p = 0;
   for (int n = 0; n < N; ++n) {
@@ -48,7 +48,7 @@ bool ElementwiseLinearGradientOp<float, CPUContext>::RunOnDevice(){
   CAFFE_ENFORCE_EQ(a.ndim(), 1, a.ndim());
   CAFFE_ENFORCE_EQ(a.dim(0), D, a.ndim());
 
-  auto* g_X = Output(0);
+  auto *g_X = Output(0);
   auto *g_a = Output(1);
   auto *g_b = Output(2);
   g_X->ResizeLike(X);
@@ -58,9 +58,9 @@ bool ElementwiseLinearGradientOp<float, CPUContext>::RunOnDevice(){
   const float* g_o_data = g_o.data<float>();
   const float* X_data = X.data<float>();
   const float* a_data = a.data<float>();
-  float* g_X_data = g_X->template mutable_data<float>();
-  float* g_a_data = g_a->template mutable_data<float>();
-  float* g_b_data = g_b->template mutable_data<float>();
+  float* g_X_data = g_X->mutable_data<float>();
+  float* g_a_data = g_a->mutable_data<float>();
+  float* g_b_data = g_b->mutable_data<float>();
 
   math::Set<float, CPUContext>(g_a->size(), 0.f, g_a_data, &context_);
   math::Set<float, CPUContext>(g_b->size(), 0.f, g_b_data, &context_);
diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu
index efbf52a86a18f..e4cd235eeffa3 100644
--- a/caffe2/operators/elementwise_linear_op.cu
+++ b/caffe2/operators/elementwise_linear_op.cu
@@ -67,17 +67,10 @@ bool ElementwiseLinearOp<float, CUDAContext>::RunOnDevice(){
 
   Y->ResizeLike(X);
 
-  ElementwiseLinearKernel<<<
-      CAFFE_GET_BLOCKS(N * D),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N,
-      D,
-      X.data<float>(),
-      a.data<float>(),
-      b.data<float>(),
-      Y->template mutable_data<float>());
+  ElementwiseLinearKernel<<<CAFFE_GET_BLOCKS(N * D), CAFFE_CUDA_NUM_THREADS,
+                          0, context_.cuda_stream()>>>(
+    N, D, X.data<float>(), a.data<float>(), b.data<float>(),
+    Y->mutable_data<float>());
   return true;
 }
 
@@ -95,15 +88,15 @@ bool ElementwiseLinearGradientOp<float, CUDAContext>::RunOnDevice(){
   CAFFE_ENFORCE_EQ(a.ndim(), 1, a.ndim());
   CAFFE_ENFORCE_EQ(a.dim(0), D, a.ndim());
 
-  auto* g_X = Output(0);
+  auto *g_X = Output(0);
   auto *g_a = Output(1);
   auto *g_b = Output(2);
   g_X->ResizeLike(X);
   g_a->ResizeLike(a);
   g_b->ResizeLike(a);
 
-  float* g_a_data = g_a->template mutable_data<float>();
-  float* g_b_data = g_b->template mutable_data<float>();
+  float* g_a_data = g_a->mutable_data<float>();
+  float* g_b_data = g_b->mutable_data<float>();
 
   ElementwiseLinearGradientKernel<<<
       D,
@@ -115,7 +108,7 @@ bool ElementwiseLinearGradientOp<float, CUDAContext>::RunOnDevice(){
       g_o.data<float>(),
       X.data<float>(),
       a.data<float>(),
-      g_X->template mutable_data<float>(),
+      g_X->mutable_data<float>(),
       g_a_data,
       g_b_data);
   return true;
diff --git a/caffe2/operators/elementwise_logical_ops.cc b/caffe2/operators/elementwise_logical_ops.cc
index 5ddd4570356e9..9da98a83b78ae 100644
--- a/caffe2/operators/elementwise_logical_ops.cc
+++ b/caffe2/operators/elementwise_logical_ops.cc
@@ -12,8 +12,8 @@ OPERATOR_SCHEMA(Where)
     .AllowInplace({{1, 2}})
     .IdenticalTypeAndShapeOfInput(1)
     .SetDoc(R"DOC(
-Operator Where takes three input data (Tensor, Tensor, Tensor) and
-produces one output data (Tensor) where z = c ? x : y is applied elementwise.
+Operator Where takes three input data (Tensor<bool>, Tensor<T>, Tensor<T>) and
+produces one output data (Tensor<T>) where z = c ? x : y is applied elementwise.
 )DOC")
     .Input(0, "C", "input tensor containing booleans")
     .Input(1, "X", "input tensor")
diff --git a/caffe2/operators/elementwise_logical_ops.h b/caffe2/operators/elementwise_logical_ops.h
index a90e3332d861d..99b84c5830397 100644
--- a/caffe2/operators/elementwise_logical_ops.h
+++ b/caffe2/operators/elementwise_logical_ops.h
@@ -53,13 +53,13 @@ class WhereOp final : public Operator<Context> {
       for (int i = 0; i < select.size(); i++) {
         size_t offset = i * block_size;
         if (select_data[i]) {
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               output->meta(),
               block_size,
               left_data + offset,
               output_data + offset);
         } else {
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               output->meta(),
               block_size,
               right_data + offset,
diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h
index 9afb154d9bddc..6b3151ecc4990 100644
--- a/caffe2/operators/elementwise_op_test.h
+++ b/caffe2/operators/elementwise_op_test.h
@@ -19,7 +19,7 @@ void FillTensor(
     const std::vector<caffe2::TIndex>& shape,
     const std::vector<I_Type>& values) {
   auto* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(Context::GetDeviceType());
+  auto* tensor = blob->GetMutable<caffe2::Tensor<Context>>();
   tensor->Resize(shape);
   auto* mutable_data = tensor->template mutable_data<O_Type>();
   const O_Type* data = reinterpret_cast<const O_Type*>(values.data());
@@ -59,7 +59,8 @@ void elementwiseAnd() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{true, false, false, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -79,7 +80,8 @@ void elementwiseAnd() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{
         true, false, false, false, true, false, false, false};
@@ -105,7 +107,8 @@ void elementwiseOr() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{true, true, true, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -125,7 +128,8 @@ void elementwiseOr() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{true, true, true, false, true, true, true, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -150,7 +154,8 @@ void elementwiseXor() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{false, true, true, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -170,7 +175,8 @@ void elementwiseXor() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{
         false, true, true, false, false, true, true, false};
@@ -195,7 +201,8 @@ void elementwiseNot() {
   EXPECT_TRUE(op->Run());
   auto* blob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, blob);
-  caffe2::Tensor Y(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+  caffe2::CPUContext context;
+  caffe2::TensorCPU Y(blob->Get<caffe2::Tensor<Context>>(), &context);
   EXPECT_EQ(Y.size(), N);
   std::vector<bool> result{false, true};
   for (size_t i = 0; i < Y.size(); ++i) {
@@ -217,7 +224,8 @@ void elementwiseEQ() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{false, true, false, true};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -234,7 +242,8 @@ void elementwiseEQ() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), N);
     std::vector<bool> result{true, true, false, false};
     for (size_t i = 0; i < Z.size(); ++i) {
@@ -253,7 +262,8 @@ void elementwiseEQ() {
     EXPECT_TRUE(op->Run());
     auto* blob = ws.GetBlob("Z");
     EXPECT_NE(nullptr, blob);
-    caffe2::Tensor Z(blob->Get<caffe2::Tensor>(), caffe2::CPU);
+    caffe2::CPUContext context;
+    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>(), &context);
     EXPECT_EQ(Z.size(), M * N);
     std::vector<bool> result{
         true, false, false, true, false, true, true, false};
diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu
index 2bd91f191229f..1dee0d6272470 100644
--- a/caffe2/operators/elementwise_ops.cu
+++ b/caffe2/operators/elementwise_ops.cu
@@ -88,7 +88,7 @@ void device_reduce(
     const T* d_in,
     T* d_out,
     int N,
-    Tensor* buffer,
+    Tensor<CUDAContext>* buffer,
     CUDAContext* context) {
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
@@ -114,7 +114,7 @@ void device_reduce<float16>(
     const float16* in,
     float16* out,
     int N,
-    Tensor* buffer,
+    Tensor<CUDAContext>* buffer,
     CUDAContext* context) {
 #if defined(__HIPCC__) && !ROCBLAS_FP16
   CAFFE_THROW("HIP rocblas doesn't fully support fp16 device_reduce yet.");
@@ -127,7 +127,7 @@ void device_reduce<float16>(
     math::Set<float16, CUDAContext>(
         N,
         convert::To<float, float16>(1.),
-        buffer->template mutable_data<float16>(),
+        buffer->mutable_data<float16>(),
         context);
   }
 
diff --git a/caffe2/operators/elementwise_ops.h b/caffe2/operators/elementwise_ops.h
index 747d2fe0285c2..aec5ea458fff4 100644
--- a/caffe2/operators/elementwise_ops.h
+++ b/caffe2/operators/elementwise_ops.h
@@ -512,8 +512,8 @@ class SumReduceLikeOp final : public Operator<Context> {
   int axis_;
   string axis_str_;
   string order_;
-  Tensor ones_{Context::GetDeviceType()};
-  Tensor sum_buffer_{Context::GetDeviceType()};
+  Tensor<Context> ones_;
+  Tensor<Context> sum_buffer_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/elementwise_ops_utils.cc b/caffe2/operators/elementwise_ops_utils.cc
index 6a5b136831c48..3d906c9a0f708 100644
--- a/caffe2/operators/elementwise_ops_utils.cc
+++ b/caffe2/operators/elementwise_ops_utils.cc
@@ -3,45 +3,6 @@
 namespace caffe2 {
 namespace elementwise_ops_utils {
 
-std::tuple<size_t, size_t, size_t>
-ComputeLegacyBroadcastSizes(const Tensor& A, const Tensor& B, int axis) {
-  CAFFE_ENFORCE_GE(
-      A.ndim(),
-      B.ndim(),
-      "If you are doing broadcasting, input1 should have "
-      "a smaller or equal number of dimensions.");
-  if (axis == -1) {
-    axis = A.ndim() - B.ndim();
-  }
-  CAFFE_ENFORCE(
-      axis >= 0 && axis <= A.ndim() - B.ndim(),
-      "Broadcast axis should be in the range of"
-      "[0, A.ndim() - B.ndim()], but axis = ",
-      axis);
-
-  int b_dim_start = 0;
-  while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) {
-    ++b_dim_start;
-  }
-  int b_dim_end = B.ndim() - 1;
-  while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) {
-    --b_dim_end;
-  }
-  size_t pre = 1, n = 1, post = 1;
-  for (int i = 0; i < axis + b_dim_start; ++i) {
-    pre *= A.dim(i);
-  }
-  for (int i = b_dim_start; i <= b_dim_end; ++i) {
-    CAFFE_ENFORCE_EQ(
-        A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch.");
-    n *= B.dim(i);
-  }
-  for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) {
-    post *= A.dim(i);
-  }
-  return std::make_tuple(pre, n, post);
-}
-
 std::vector<int> ComputeBinaryBroadcastForwardDims(
     const std::vector<int>& A_dims,
     const std::vector<int>& B_dims) {
diff --git a/caffe2/operators/elementwise_ops_utils.h b/caffe2/operators/elementwise_ops_utils.h
index f8ff47cdf4ced..dd37b12076e30 100644
--- a/caffe2/operators/elementwise_ops_utils.h
+++ b/caffe2/operators/elementwise_ops_utils.h
@@ -10,8 +10,48 @@
 namespace caffe2 {
 namespace elementwise_ops_utils {
 
-std::tuple<size_t, size_t, size_t>
-ComputeLegacyBroadcastSizes(const Tensor& A, const Tensor& B, int axis);
+template <typename Context>
+std::tuple<size_t, size_t, size_t> ComputeLegacyBroadcastSizes(
+    const Tensor<Context>& A,
+    const Tensor<Context>& B,
+    int axis) {
+  CAFFE_ENFORCE_GE(
+      A.ndim(),
+      B.ndim(),
+      "If you are doing broadcasting, input1 should have "
+      "a smaller or equal number of dimensions.");
+  if (axis == -1) {
+    axis = A.ndim() - B.ndim();
+  }
+  CAFFE_ENFORCE(
+      axis >= 0 && axis <= A.ndim() - B.ndim(),
+      "Broadcast axis should be in the range of"
+      "[0, A.ndim() - B.ndim()], but axis = ",
+      axis);
+
+  int b_dim_start = 0;
+  while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) {
+    ++b_dim_start;
+  }
+  int b_dim_end = B.ndim() - 1;
+  while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) {
+    --b_dim_end;
+  }
+  size_t pre = 1, n = 1, post = 1;
+  for (int i = 0; i < axis + b_dim_start; ++i) {
+    pre *= A.dim(i);
+  }
+  for (int i = b_dim_start; i <= b_dim_end; ++i) {
+    CAFFE_ENFORCE_EQ(
+        A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch.");
+    n *= B.dim(i);
+  }
+  for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) {
+    post *= A.dim(i);
+  }
+  return std::make_tuple(pre, n, post);
+}
+
 std::vector<int> ComputeBinaryBroadcastForwardDims(
     const std::vector<int>& A_dims,
     const std::vector<int>& B_dims);
diff --git a/caffe2/operators/enforce_finite_op.cu b/caffe2/operators/enforce_finite_op.cu
index 38f1669a40af3..b909d70cb43cc 100644
--- a/caffe2/operators/enforce_finite_op.cu
+++ b/caffe2/operators/enforce_finite_op.cu
@@ -7,7 +7,7 @@ namespace caffe2 {
 template <>
 template <typename T>
 bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
-  buffer_.CopyFrom(Input(0), &context_);
+  buffer_.CopyFrom<CUDAContext, CUDAContext>(Input(0), &context_);
   EnforceOnCPU<T>(buffer_);
   return true;
 }
diff --git a/caffe2/operators/enforce_finite_op.h b/caffe2/operators/enforce_finite_op.h
index a1f63ecb7bccc..d8e5a15a3ac0a 100644
--- a/caffe2/operators/enforce_finite_op.h
+++ b/caffe2/operators/enforce_finite_op.h
@@ -23,10 +23,10 @@ class EnforceFiniteOp final : public Operator<Context> {
   bool DoRunWithType();
 
  private:
-  Tensor buffer_{Context::GetDeviceType()};
+  Tensor<CPUContext> buffer_;
 
   template <typename T>
-  void EnforceOnCPU(const Tensor& input) {
+  void EnforceOnCPU(const Tensor<CPUContext>& input) {
     const T* input_data = input.template data<T>();
     auto size = input.size();
 
diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h
index 46b820f316893..8130f42ad026c 100644
--- a/caffe2/operators/ensure_cpu_output_op.h
+++ b/caffe2/operators/ensure_cpu_output_op.h
@@ -15,9 +15,9 @@ class EnsureCPUOutputOp : public Operator<Context> {
       : Operator<Context>(operator_def, ws) {}
 
   bool RunOnDevice() override {
-    if (OperatorBase::InputIsType<Tensor>(0, CPU)) {
+    if (OperatorBase::InputIsType<TensorCPU>(0)) {
       return CopyWithContext<CPUContext>();
-    } else if (OperatorBase::InputIsType<Tensor>(0, Context::GetDeviceType())) {
+    } else if (OperatorBase::InputIsType<Tensor<Context>>(0)) {
       // CUDA Context will go this branch
       return CopyWithContext<Context>();
     } else {
@@ -32,10 +32,10 @@ class EnsureCPUOutputOp : public Operator<Context> {
   template <class InputContext>
   bool CopyWithContext() {
     // Output is always on CPU
-    auto* output = OperatorBase::Output<Tensor>(0, CPU);
-    auto& input = OperatorBase::Input<Tensor>(0, InputContext::GetDeviceType());
+    auto* output = OperatorBase::Output<TensorCPU>(0);
+    auto& input = OperatorBase::Input<Tensor<InputContext>>(0);
     output->ResizeLike(input);
-    context_.CopyItemsToCPU(
+    context_.template CopyItems<InputContext, CPUContext>(
         input.meta(),
         input.size(),
         input.raw_data(),
diff --git a/caffe2/operators/expand_op.h b/caffe2/operators/expand_op.h
index 8337862630390..9f5406fe62447 100644
--- a/caffe2/operators/expand_op.h
+++ b/caffe2/operators/expand_op.h
@@ -26,7 +26,7 @@ class ExpandOp final : public Operator<Context> {
     const auto& X = Input(0);
     const auto& Y_shape_tensor = Input(1);
     std::vector<int64_t> shape_dims(Y_shape_tensor.size());
-    context_.template CopyToCPU<int64_t>(
+    context_.template Copy<int64_t, Context, CPUContext>(
         Y_shape_tensor.size(),
         Y_shape_tensor.template data<int64_t>(),
         shape_dims.data());
diff --git a/caffe2/operators/feature_maps_ops.h b/caffe2/operators/feature_maps_ops.h
index a9da8a7ebd3f8..7c9b7abeff03b 100644
--- a/caffe2/operators/feature_maps_ops.h
+++ b/caffe2/operators/feature_maps_ops.h
@@ -198,7 +198,7 @@ class MergeSingleListFeatureTensorsOp : public Operator<Context> {
           ++outLengthsData[exampleIndex];
           outKeysData[keysOffset] = featureIDs_[inputIndex];
           outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValues.meta(),
               inLengthsData[exampleIndex],
               &inValues.template data<T>()[inValuesOffset_[inputIndex]],
@@ -268,7 +268,7 @@ class MergeSingleListOrMapFeatureTensorsGradientOp : public Operator<Context> {
             Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
         if (inPresenceData[exampleIndex]) {
           T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValuesValuesGrad.meta(),
               inLengthsData[exampleIndex],
               &inValuesValuesGradData[inValuesValuesOffset],
@@ -367,12 +367,12 @@ class MergeSingleMapFeatureTensorsOp : public Operator<Context> {
           ++outLengthsData[exampleIndex];
           outKeysData[keysOffset] = featureIDs_[inputIndex];
           outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inKeys.meta(),
               inLengthsData[exampleIndex],
               &inKeys.template data<K>()[inValuesOffset_[inputIndex]],
               &outValuesKeysData[valuesOffset]);
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValues.meta(),
               inLengthsData[exampleIndex],
               &inValues.template data<V>()[inValuesOffset_[inputIndex]],
@@ -510,7 +510,7 @@ class MergeMultiScalarFeatureTensorsGradientOp : public Operator<Context> {
             Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
         if (inLengthsData[exampleIndex] > 0) {
           T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValuesGrad.meta(),
               inLengthsData[exampleIndex],
               &inValuesGradData[inValuesOffset],
@@ -597,7 +597,7 @@ class MergeMultiListFeatureTensorsOp : public Operator<Context> {
           outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
           outValuesLengthsData[outKeysOffset] =
               inValuesLengthsData[inKeysOffset_[inputIndex]];
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValuesValues.meta(),
               inValuesLengthsData[inKeysOffset_[inputIndex]],
               &inValuesValues
@@ -703,13 +703,13 @@ class MergeMultiMapFeatureTensorsOp : public Operator<Context> {
           outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
           outValuesLengthsData[outKeysOffset] =
               inValuesLengthsData[inKeysOffset_[inputIndex]];
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValuesKeys.meta(),
               inValuesLengthsData[inKeysOffset_[inputIndex]],
               &inValuesKeys
                    .template data<K>()[inValuesValuesOffset_[inputIndex]],
               &outValuesKeysData[outValuesValuesOffset]);
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValuesValues.meta(),
               inValuesLengthsData[inKeysOffset_[inputIndex]],
               &inValuesValues
@@ -791,7 +791,7 @@ class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator<Context> {
         }
         if (valuesLengthCopy > 0) {
           T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               inValuesValuesGrad.meta(),
               valuesLengthCopy,
               &inValuesValuesGradData[inValuesValuesOffset],
diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc
index ff3eac217390a..021df2ec0abe5 100644
--- a/caffe2/operators/filler_op.cc
+++ b/caffe2/operators/filler_op.cc
@@ -3,8 +3,9 @@
 namespace caffe2 {
 
 template <>
-bool RangeFillOp<float, CPUContext>::Fill(Tensor* output) {
-  float* data = output->template mutable_data<float>();
+bool RangeFillOp<float, CPUContext>::Fill(
+    TensorCPU* output) {
+  float* data = output->mutable_data<float>();
   for (int i = 0; i < output->size(); ++i) {
     data[i] = i;
   }
@@ -13,7 +14,7 @@ bool RangeFillOp<float, CPUContext>::Fill(Tensor* output) {
 
 template <>
 template <typename T>
-bool DiagonalFillOp<CPUContext>::FillWithType(Tensor* output) {
+bool DiagonalFillOp<CPUContext>::FillWithType(TensorCPU* output) {
   VerifyOutputShape(output);
   T value = OperatorBase::GetSingleArgument<T>("value", 0);
   auto* data = output->template mutable_data<T>();
diff --git a/caffe2/operators/filler_op.cu b/caffe2/operators/filler_op.cu
index 65918cc04b5d5..9df195a918b91 100644
--- a/caffe2/operators/filler_op.cu
+++ b/caffe2/operators/filler_op.cu
@@ -25,19 +25,19 @@ __global__ void FillDiagonalKernel(
 }
 
 template <>
-bool RangeFillOp<float, CUDAContext>::Fill(Tensor* output) {
+bool RangeFillOp<float, CUDAContext>::Fill(TensorCUDA* output) {
   int N = output->size();
   FillRangeKernel<<<
       CAFFE_GET_BLOCKS(N),
       CAFFE_CUDA_NUM_THREADS,
       0,
-      context_.cuda_stream()>>>(N, output->template mutable_data<float>());
+      context_.cuda_stream()>>>(N, output->mutable_data<float>());
   return true;
 }
 
 template <>
 template <typename T>
-bool DiagonalFillOp<CUDAContext>::FillWithType(Tensor* output) {
+bool DiagonalFillOp<CUDAContext>::FillWithType(TensorCUDA* output) {
   VerifyOutputShape(output);
   auto* data = output->template mutable_data<T>();
   int size = output->size();
diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h
index 659b4906cc4d4..c144b70378273 100644
--- a/caffe2/operators/filler_op.h
+++ b/caffe2/operators/filler_op.h
@@ -56,7 +56,7 @@ class FillerOp : public Operator<Context> {
       auto shape = vector<TIndex>{};
       if (input_as_shape_) {
         // Shape input must be in CPU context
-        auto& input = OperatorBase::Input<Tensor>(0, CPU);
+        auto& input = OperatorBase::Input<Tensor<CPUContext>>(0);
         CAFFE_ENFORCE_EQ(
             input.ndim(),
             1,
@@ -76,7 +76,7 @@ class FillerOp : public Operator<Context> {
     return Fill(output);
   }
 
-  virtual bool Fill(Tensor* output) = 0;
+  virtual bool Fill(Tensor<Context>* output) = 0;
 
  protected:
   vector<TIndex> shape_;
@@ -105,7 +105,7 @@ class UniformFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     T min = min_;
     T max = max_;
     if (InputSize() == 3) {
@@ -163,7 +163,7 @@ class UniqueUniformFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     return (this->*body_)(output);
   }
 
@@ -179,7 +179,7 @@ class UniqueUniformFillOp final : public FillerOp<Context> {
   }
 
   template <typename T>
-  bool FillWithType(Tensor* output) {
+  bool FillWithType(Tensor<Context>* output) {
     T min = OperatorBase::GetSingleArgument<T>("min", 0);
     T max = OperatorBase::GetSingleArgument<T>("max", 0);
 
@@ -201,7 +201,7 @@ class UniqueUniformFillOp final : public FillerOp<Context> {
     return true;
   }
 
-  bool (UniqueUniformFillOp::*body_)(Tensor* output);
+  bool (UniqueUniformFillOp::*body_)(Tensor<Context>* output);
 };
 
 template <class Context>
@@ -268,12 +268,12 @@ class ConstantFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     return (this->*body_)(output);
   }
 
   template <typename T>
-  bool FillWithType(Tensor* output) {
+  bool FillWithType(Tensor<Context>* output) {
     T value = OperatorBase::GetSingleArgument<T>("value", 0);
     auto* data = output->template mutable_data<T>();
     if (output->size()) {
@@ -282,7 +282,7 @@ class ConstantFillOp final : public FillerOp<Context> {
     return true;
   }
 
-  bool FillWithString(Tensor* output) {
+  bool FillWithString(Tensor<Context>* output) {
     auto value = OperatorBase::GetSingleArgument<std::string>("value", "");
     auto* data = output->template mutable_data<std::string>();
     for (int i = 0; i < output->size(); ++i) {
@@ -292,7 +292,7 @@ class ConstantFillOp final : public FillerOp<Context> {
   }
 
  private:
-  bool (ConstantFillOp::*body_)(Tensor* output);
+  bool (ConstantFillOp::*body_)(Tensor<Context>* output);
 };
 
 template <class Context>
@@ -355,19 +355,19 @@ class DiagonalFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     return (this->*body_)(output);
   }
 
   template <typename T>
-  bool FillWithType(Tensor* output);
+  bool FillWithType(Tensor<Context>* output);
 
  private:
-  void VerifyOutputShape(Tensor* output) {
+  void VerifyOutputShape(Tensor<Context>* output) {
     CAFFE_ENFORCE(output->ndim() >= 2, "Input shape must be >= 2D");
   }
 
-  TIndex GetStepSize(Tensor* output) {
+  TIndex GetStepSize(Tensor<Context>* output) {
     TIndex step;
     if (output->ndim() == 2) {
       step = output->dim(1) + 1;
@@ -393,7 +393,7 @@ class DiagonalFillOp final : public FillerOp<Context> {
     return step;
   }
 
-  bool (DiagonalFillOp::*body_)(Tensor* output);
+  bool (DiagonalFillOp::*body_)(Tensor<Context>* output);
 };
 
 template <typename T, class Context>
@@ -407,7 +407,7 @@ class GaussianFillOp final : public FillerOp<Context> {
     DCHECK_GT(std_, 0) << "Standard deviation should be nonnegative.";
   }
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     math::RandGaussian<T, Context>(
         output->size(),
         mean_,
@@ -429,7 +429,7 @@ class XavierFillOp final : public FillerOp<Context> {
   XavierFillOp(const OperatorDef& operator_def, Workspace* ws)
       : FillerOp<Context>(operator_def, ws) {}
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     const int fan_in = output->size() / output->dim32(0);
     T scale = std::sqrt(T(3) / fan_in);
     math::RandUniform<T, Context>(
@@ -449,7 +449,7 @@ class MSRAFillOp final : public FillerOp<Context> {
   MSRAFillOp(const OperatorDef& operator_def, Workspace* ws)
       : FillerOp<Context>(operator_def, ws) {}
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     const int fan_out = output->size() / output->dim32(1);
     T scale = std::sqrt(T(2) / fan_out);
     math::RandGaussian<T, Context>(
@@ -472,7 +472,7 @@ class RangeFillOp final : public FillerOp<Context> {
   RangeFillOp(const OperatorDef& operator_def, Workspace* ws)
       : FillerOp<Context>(operator_def, ws) {}
 
-  bool Fill(Tensor* output) override;
+  bool Fill(Tensor<Context>* output) override;
 };
 
 template <class Context>
diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu
index da6061ef03315..32bceda79acc8 100644
--- a/caffe2/operators/find_op.cu
+++ b/caffe2/operators/find_op.cu
@@ -38,7 +38,7 @@ bool FindOp<CUDAContext>::DoRunWithType() {
 
   const T* idx_data = idx.data<T>();
   const T* needles_data = needles.data<T>();
-  int* res_data = res_indices->template mutable_data<int>();
+  int* res_data = res_indices->mutable_data<int>();
 
   FindKernel<
       T><<<needles.size(), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
diff --git a/caffe2/operators/flatten_op.h b/caffe2/operators/flatten_op.h
index 43851ce3aa955..a250cd4c272b8 100644
--- a/caffe2/operators/flatten_op.h
+++ b/caffe2/operators/flatten_op.h
@@ -20,7 +20,7 @@ class FlattenOp : public Operator<Context> {
     CAFFE_ENFORCE_GE(
         input.dims().size(), axis_, "The rank of the tensor must be >= axis.");
     output->Resize(input.size_to_dim(axis_), input.size_from_dim(axis_));
-    context_.CopyItemsSameDevice(
+    context_.template CopyItems<Context, Context>(
         input.meta(),
         input.size(),
         input.raw_data(),
diff --git a/caffe2/operators/floor_op.cu b/caffe2/operators/floor_op.cu
index 41723d84da2b6..a1bd383e1821f 100644
--- a/caffe2/operators/floor_op.cu
+++ b/caffe2/operators/floor_op.cu
@@ -22,7 +22,7 @@ bool FloorOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->template mutable_data<float>());
+      X.size(), X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/fully_connected_op.h b/caffe2/operators/fully_connected_op.h
index eca665750cf05..068acfec19c30 100644
--- a/caffe2/operators/fully_connected_op.h
+++ b/caffe2/operators/fully_connected_op.h
@@ -144,8 +144,7 @@ class FullyConnectedOp final : public Operator<Context> {
   // A local vector to cache the output shape so we don't need to recreate
   // a vector object every time we run Run().
   vector<TIndex> Y_shape_cache_;
-  Tensor bias_multiplier_{Context::GetDeviceType()};
-  ;
+  Tensor<Context> bias_multiplier_;
 
   bool float16_compute_;
 };
@@ -313,7 +312,7 @@ class FullyConnectedGradientOp : public Operator<Context> {
  protected:
   size_t axis_{1};
   size_t axis_w_{1};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
   bool float16_compute_;
 };
 
diff --git a/caffe2/operators/gather_fused_8bit_rowwise_op.h b/caffe2/operators/gather_fused_8bit_rowwise_op.h
index 3b6f549fe624c..621ea335a4993 100644
--- a/caffe2/operators/gather_fused_8bit_rowwise_op.h
+++ b/caffe2/operators/gather_fused_8bit_rowwise_op.h
@@ -14,7 +14,7 @@ class GatherFused8BitRowwiseOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(INDICES, CPU));
+        this, OperatorBase::Input<TensorCPU>(INDICES));
   }
 
   template <typename Index>
diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h
index adc308ecdd325..81f4fa53d5599 100644
--- a/caffe2/operators/gather_ranges_to_dense_op.h
+++ b/caffe2/operators/gather_ranges_to_dense_op.h
@@ -30,7 +30,7 @@ class GatherRangesToDenseOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(RANGES, CPU));
+        this, OperatorBase::Input<TensorCPU>(RANGES));
   }
 
   template <typename Index>
@@ -88,7 +88,7 @@ class GatherRangesToDenseOp final : public Operator<Context> {
             j);
 
         if (InputSize() == 2) {
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               data.meta(),
               rangeLength,
               rawData + rangeStart * itemsize,
diff --git a/caffe2/operators/generate_proposals_op.cc b/caffe2/operators/generate_proposals_op.cc
index 2b1039b35a846..0b4f3a6a9d755 100644
--- a/caffe2/operators/generate_proposals_op.cc
+++ b/caffe2/operators/generate_proposals_op.cc
@@ -290,8 +290,8 @@ bool GenerateProposalsOp<CPUContext>::RunOnDevice() {
   }
   out_rois->Extend(roi_counts, 50, &context_);
   out_rois_probs->Extend(roi_counts, 50, &context_);
-  float* out_rois_ptr = out_rois->template mutable_data<float>();
-  float* out_rois_probs_ptr = out_rois_probs->template mutable_data<float>();
+  float* out_rois_ptr = out_rois->mutable_data<float>();
+  float* out_rois_probs_ptr = out_rois_probs->mutable_data<float>();
   for (int i = 0; i < num_images; i++) {
     const ERArrXXf& im_i_boxes = im_boxes[i];
     const EArrXf& im_i_probs = im_probs[i];
diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc
index a090842205b7f..af9214379becd 100644
--- a/caffe2/operators/generate_proposals_op_test.cc
+++ b/caffe2/operators/generate_proposals_op_test.cc
@@ -18,10 +18,10 @@ static void AddConstInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
   math::Set<float, CPUContext>(
-      tensor->size(), value, tensor->template mutable_data<float>(), &context);
+      tensor->size(), value, tensor->mutable_data<float>(), &context);
   return;
 }
 
@@ -34,10 +34,10 @@ static void AddLinSpacedInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
   EigenVectorMap<float> tensor_vec(
-      tensor->template mutable_data<float>(), tensor->size());
+      tensor->mutable_data<float>(), tensor->size());
   tensor_vec.setLinSpaced(min_val, max_val);
 
   return;
@@ -51,10 +51,10 @@ static void AddInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
   EigenVectorMap<float> tensor_vec(
-      tensor->template mutable_data<float>(), tensor->size());
+      tensor->mutable_data<float>(), tensor->size());
   tensor_vec.array() = utils::AsEArrXt(values);
 
   return;
@@ -79,7 +79,7 @@ TEST(GenerateProposalsTest, TestComputeAllAnchors) {
       79, -68, 8, 115, 103, -160, -40, 207, 151, -6, 32, 85, 79, -52, 8, 131,
       103, -144, -40, 223, 151;
 
-  Tensor anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()}, CPU);
+  TensorCPU anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()});
   Eigen::Map<ERMatXf>(
       anchors_tensor.mutable_data<float>(), anchors.rows(), anchors.cols()) =
       anchors;
@@ -143,7 +143,7 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) {
     all_anchors_gt(i, 4) = angles[i % angles.size()];
   }
 
-  Tensor anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()}, CPU);
+  TensorCPU anchors_tensor(vector<TIndex>{anchors.rows(), anchors.cols()});
   Eigen::Map<ERMatXf>(
       anchors_tensor.mutable_data<float>(), anchors.rows(), anchors.cols()) =
       anchors;
diff --git a/caffe2/operators/given_tensor_fill_op.h b/caffe2/operators/given_tensor_fill_op.h
index df0c27642337b..bf2119d0f5a43 100644
--- a/caffe2/operators/given_tensor_fill_op.h
+++ b/caffe2/operators/given_tensor_fill_op.h
@@ -51,7 +51,7 @@ class GivenTensorFillOp final : public FillerOp<Context> {
     }
   }
 
-  bool Fill(Tensor* output) override {
+  bool Fill(Tensor<Context>* output) override {
     return (this->*body_)(output);
   }
 
@@ -69,20 +69,20 @@ class GivenTensorFillOp final : public FillerOp<Context> {
   }
 
   template <typename Type>
-  bool FillWithType(Tensor* output) {
+  bool FillWithType(Tensor<Context>* output) {
     DCHECK_EQ(output->size(), values_.size())
         << "output size: " << output->size()
         << " given size: " << values_.size();
     auto* data = output->template mutable_data<Type>();
     const Type* values_data = values_.template data<Type>();
     if (output->size()) {
-      context_.CopyItemsFromCPU(
-          TypeMeta::Make<Type>(), output->size(), values_data, data);
+      context_.template Copy<Type, CPUContext, Context>(
+          output->size(), values_data, data);
     }
     return true;
   }
 
-  bool (GivenTensorFillOp::*body_)(Tensor* output);
-  Tensor values_{CPU};
+  bool (GivenTensorFillOp::*body_)(Tensor<Context>* output);
+  TensorCPU values_;
 };
 } // namespace caffe2
diff --git a/caffe2/operators/group_norm_op.h b/caffe2/operators/group_norm_op.h
index 8ff11353aa4f8..a65f57c5fb98c 100644
--- a/caffe2/operators/group_norm_op.h
+++ b/caffe2/operators/group_norm_op.h
@@ -152,8 +152,8 @@ class GroupNormGradientOp final : public Operator<Context> {
   const int group_;
   const StorageOrder order_;
 
-  Tensor ds_{Context::GetDeviceType()};
-  Tensor db_{Context::GetDeviceType()};
+  Tensor<Context> ds_;
+  Tensor<Context> db_;
 
   // Input: dY, X, gamma, beta, mu, inv_sig
   // Output: dX, dgamma, dbeta
diff --git a/caffe2/operators/gru_unit_op.h b/caffe2/operators/gru_unit_op.h
index c6a85ac5fb61c..ecbaac2c3c15c 100644
--- a/caffe2/operators/gru_unit_op.h
+++ b/caffe2/operators/gru_unit_op.h
@@ -143,9 +143,8 @@ class GRUUnitOp : public Operator<Context> {
       seqLengths = Input(SEQ_LENGTHS).template data<int32_t>();
     }
 
-    const auto t = static_cast<OperatorBase*>(this)
-                       ->Input<Tensor>(TIMESTEP, CPU)
-                       .template data<int32_t>()[0];
+    const auto t = static_cast<OperatorBase*>(this)->
+      Input<Tensor<CPUContext>>(TIMESTEP).template data<int32_t>()[0];
     Output(HIDDEN_T)->ResizeLike(Input(HIDDEN_T_M_1));
     auto* H = Output(HIDDEN_T)->template mutable_data<T>();
 
@@ -195,9 +194,8 @@ class GRUUnitGradientOp : public Operator<Context> {
     CAFFE_ENFORCE_EQ(3 * D, G);
     const auto* H_prev = Input(HIDDEN_T_M_1).template data<T>();
     const auto* X = Input(GATES).template data<T>();
-    const auto t = static_cast<OperatorBase*>(this)
-                       ->Input<Tensor>(TIMESTEP, CPU)
-                       .template data<int32_t>()[0];
+    const auto t = static_cast<OperatorBase*>(this)->
+      Input<Tensor<CPUContext>>(TIMESTEP).template data<int32_t>()[0];
     const auto* H = Input(HIDDEN_T).template data<T>();
     const auto* H_diff = Input(HIDDEN_T_GRAD).template data<T>();
 
diff --git a/caffe2/operators/h_softmax_op.cc b/caffe2/operators/h_softmax_op.cc
index ff65ba1797c98..1a8689d9c76ff 100644
--- a/caffe2/operators/h_softmax_op.cc
+++ b/caffe2/operators/h_softmax_op.cc
@@ -36,8 +36,8 @@ float HSoftmaxOp<float, CPUContext>::RunForwardSingle(const float* X,
     scale_.mutable_data<float>(), &context_);
 
   // Put the intermediate result X - max(X) into Y
-  context_.template CopyFromCPU<float>(
-      dim_out, fc_output_data, softmax_output_data);
+  context_.template Copy<float, CPUContext, CPUContext>(dim_out, fc_output_data,
+    softmax_output_data);
   // Subtract the scale
   math::Gemv<float, CPUContext>(CblasNoTrans, dim_out, 1, -1,
     sum_multiplier_.data<float>(), scale_.data<float>(), 1, softmax_output_data,
@@ -86,14 +86,14 @@ bool HSoftmaxOp<float, CPUContext>::RunOnDevice() {
   int N = W.dim32(0);
   CAFFE_ENFORCE_EQ(N, b.dim32(0));
   Y->Resize(M);
-  auto* Ydata = Y->template mutable_data<float>();
+  auto* Ydata = Y->mutable_data<float>();
   math::Set<float, CPUContext>(M, 0.f, Ydata, &context_);
   const auto* labeldata = label.data<int>();
 
   auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
   int int_output_size = getIntermediateOutputSize(labeldata, M, hierarchy);
   intermediate_output->Resize(int_output_size);
-  float* int_output_data = intermediate_output->template mutable_data<float>();
+  float * int_output_data = intermediate_output->mutable_data<float>();
   int int_output_offset = 0;
 
   if (bias_multiplier_.size() != M) {
@@ -151,7 +151,7 @@ void HSoftmaxGradientOp<float, CPUContext>::RunBackwardSingle(const float* X,
   }
 
   float* dX_softmax = dint_output + int_output_offset - dim_out;
-  context_.CopyFromCPU<float>(dim_out, dX_entropy, dX_softmax);
+  context_.Copy<float, CPUContext, CPUContext>(dim_out, dX_entropy, dX_softmax);
 
   math::Dot<float, CPUContext>(dim_out, X_entropy, dX_entropy, scaledata,
     &context_);
@@ -205,10 +205,10 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
   db->ResizeLike(b);
   dX_intermediate_output->ResizeLike(intermediate_output);
 
-  float* dX_data = dX->template mutable_data<float>();
-  float* dW_data = dW->template mutable_data<float>();
-  float* db_data = db->template mutable_data<float>();
-  float* dOutput_data = dX_intermediate_output->template mutable_data<float>();
+  float* dX_data = dX->mutable_data<float>();
+  float* dW_data = dW->mutable_data<float>();
+  float* db_data = db->mutable_data<float>();
+  float* dOutput_data = dX_intermediate_output->mutable_data<float>();
 
   math::Set<float, CPUContext>(X.size(), 0.f, dX_data, &context_);
   math::Set<float, CPUContext>(W.size(), 0.f, dW_data, &context_);
@@ -257,7 +257,7 @@ bool HSoftmaxSearchOp<float, CPUContext>::pruning(
     float parent_score,
     float beam) {
   int w_length = src_node.children_size() + src_node.word_ids_size();
-  Tensor intermediate_data{CPU};
+  Tensor<CPUContext> intermediate_data;
   intermediate_data.Resize(2 * w_length);
   float* int_output_data = intermediate_data.template mutable_data<float>();
   int int_output_offset = 0;
@@ -398,10 +398,8 @@ bool HSoftmaxSearchOp<float, CPUContext>::RunOnDevice() {
         [&](std::pair<string, float> a, std::pair<string, float> b) {
           return a.second < b.second;
         });
-    auto* y_name_data =
-        Y_names->template mutable_data<string>() + sample * top_n_;
-    auto* y_score_data =
-        Y_scores->template mutable_data<float>() + sample * top_n_;
+    auto* y_name_data = Y_names->mutable_data<string>() + sample * top_n_;
+    auto* y_score_data = Y_scores->mutable_data<float>() + sample * top_n_;
     for (int i = 0; i < top_n_; i++) {
       if (i < info.size()) {
         y_name_data[i] = info[i].first;
@@ -545,18 +543,18 @@ REGISTER_CPU_OPERATOR(
     HuffmanTreeHierarchyOp<int64_t, CPUContext>);
 
 OPERATOR_SCHEMA(HSoftmax)
-    .NumInputs(4)
-    .NumOutputs(2)
-    .SetDoc(R"DOC(
+  .NumInputs(4)
+  .NumOutputs(2)
+  .SetDoc(R"DOC(
 Hierarchical softmax is an operator which approximates the softmax operator
 while giving significant training speed gains and reasonably comparable
 performance. In this operator, instead of calculating the probabilities of all
 the classes, we calculate the probability of each step in the path from root to
 the target word in the hierarchy.
 
-The operator takes a 2-D tensor (Tensor) containing a batch of layers, a
+The operator takes a 2-D tensor (Tensor<float>) containing a batch of layers, a
 set of parameters represented by the weight matrix and bias terms, and a 1-D
-tensor (Tensor) holding labels, or the indices of the target class. The
+tensor (Tensor<int>) holding labels, or the indices of the target class. The
 hierarchy has to be specified as an argument to the operator.
 
 The operator returns a 1-D tensor holding the computed log probability of the
@@ -564,28 +562,20 @@ target class and a 2-D tensor of intermediate outputs (from the weight matrix
 and softmax from each step in the path from root to target class) which will be
 used by the gradient operator to compute gradients for all samples in the batch.
 )DOC")
-    .Arg(
-        "hierarchy",
-        "Serialized HierarchyProto string containing list of "
-        "vocabulary words and their paths from root of hierarchy to the leaf")
-    .Input(0, "X", "Input data from previous layer")
-    .Input(
-        1,
-        "W",
-        "2D blob containing 'stacked' fully connected weight "
-        "matrices. Each node in the hierarchy contributes one FC weight matrix if "
-        "it has children nodes. Dimension is N*D, D is input dimension of data (X), "
-        "N is sum of all output dimensions, or total number of nodes (excl root)")
-    .Input(2, "b", "1D blob with N parameters")
-    .Input(3, "labels", "int word_id of the target word")
-    .Output(0, "Y", "1-D of log probability outputs, one per sample")
-    .Output(
-        1,
-        "intermediate_output",
-        "Extra blob to store the intermediate "
-        "FC and softmax outputs for each node in the hierarchical path of a word. "
-        "The outputs from samples are stored in consecutive blocks in the forward "
-        "pass and are used in reverse order in the backward gradientOp pass");
+  .Arg("hierarchy", "Serialized HierarchyProto string containing list of "
+  "vocabulary words and their paths from root of hierarchy to the leaf")
+  .Input(0, "X", "Input data from previous layer")
+  .Input(1, "W", "2D blob containing 'stacked' fully connected weight "
+  "matrices. Each node in the hierarchy contributes one FC weight matrix if "
+  "it has children nodes. Dimension is N*D, D is input dimension of data (X), "
+  "N is sum of all output dimensions, or total number of nodes (excl root)")
+  .Input(2, "b", "1D blob with N parameters")
+  .Input(3, "labels", "int word_id of the target word")
+  .Output(0, "Y", "1-D of log probability outputs, one per sample")
+  .Output(1, "intermediate_output", "Extra blob to store the intermediate "
+  "FC and softmax outputs for each node in the hierarchical path of a word. "
+  "The outputs from samples are stored in consecutive blocks in the forward "
+  "pass and are used in reverse order in the backward gradientOp pass");
 
 OPERATOR_SCHEMA(HSoftmaxGradient).NumInputs(6).NumOutputs(4);
 
diff --git a/caffe2/operators/h_softmax_op.h b/caffe2/operators/h_softmax_op.h
index 10ee600d89776..423f5b7a4f2e3 100644
--- a/caffe2/operators/h_softmax_op.h
+++ b/caffe2/operators/h_softmax_op.h
@@ -25,9 +25,9 @@ class HSoftmaxOpBase : public Operator<Context> {
 
  protected:
   std::unordered_map<int, PathProto> hierarchy_all_map_;
-  Tensor scale_{Context::GetDeviceType()};
-  Tensor sum_multiplier_{Context::GetDeviceType()};
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> scale_;
+  Tensor<Context> sum_multiplier_;
+  Tensor<Context> bias_multiplier_;
   static constexpr T kLOG_THRESHOLD() {
     return 1e-20f;
   }
diff --git a/caffe2/operators/half_float_ops.cu b/caffe2/operators/half_float_ops.cu
index 111ff3ca9fe2f..fb1cd16db4044 100644
--- a/caffe2/operators/half_float_ops.cu
+++ b/caffe2/operators/half_float_ops.cu
@@ -31,7 +31,7 @@ bool FloatToHalfOp<CUDAContext>::RunOnDevice() {
       context_.cuda_stream()>>>(
       X.size(),
       X.data<float>(),
-      reinterpret_cast<half*>(Y->template mutable_data<float16>()));
+      reinterpret_cast<half*>(Y->mutable_data<float16>()));
   return true;
 }
 
@@ -47,7 +47,7 @@ bool HalfToFloatOp<CUDAContext>::RunOnDevice() {
       context_.cuda_stream()>>>(
       X.size(),
       reinterpret_cast<const half*>(X.data<float16>()),
-      Y->template mutable_data<float>());
+      Y->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/if_op.h b/caffe2/operators/if_op.h
index cff2a620ef469..355dc31d8e550 100644
--- a/caffe2/operators/if_op.h
+++ b/caffe2/operators/if_op.h
@@ -32,7 +32,7 @@ class IfOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     CAFFE_ENFORCE(
-        this->template InputIsType<Tensor>(0, Context::GetDeviceType()),
+        this->template InputIsType<Tensor<Context>>(0),
         "Invalid condition in If operator: tensor expected");
 
     const auto& condition = Input(0);
diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc
index b9a8b1b46e272..dd04c87b75872 100644
--- a/caffe2/operators/index_ops.cc
+++ b/caffe2/operators/index_ops.cc
@@ -93,7 +93,8 @@ struct Index: IndexBase {
     return true;
   }
 
-  bool Store(Tensor* out) {
+  template<typename Ctx>
+  bool Store(Tensor<Ctx>* out) {
     std::lock_guard<std::mutex> lock(dictMutex_);
     out->Resize(nextId_ - 1);
     auto outData = out->template mutable_data<T>();
@@ -150,10 +151,7 @@ class IndexGetOp: public Operator<CPUContext> {
     const auto& keys = Input(1);
     auto* values = Output(0);
     values->ResizeLike(keys);
-    dict->Get(
-        keys.data<T>(),
-        values->template mutable_data<TIndexValue>(),
-        keys.size());
+    dict->Get(keys.data<T>(), values->mutable_data<TIndexValue>(), keys.size());
     return true;
   }
 };
@@ -228,7 +226,7 @@ class IndexSizeOp : public Operator<CPUContext> {
     auto& base = OperatorBase::Input<std::unique_ptr<IndexBase>>(0);
     auto* out = Output(0);
     out->Resize(std::vector<TIndex>{});
-    *out->template mutable_data<TIndexValue>() = base->Size();
+    *out->mutable_data<TIndexValue>() = base->Size();
     return true;
   }
 };
@@ -353,7 +351,7 @@ class IndexSerializer : public BlobSerializerBase {
       SerializationAcceptor acceptor) override {
     auto& base = blob.template Get<std::unique_ptr<IndexBase>>();
     Blob tensor_blob;
-    auto* tensor_out = tensor_blob.GetMutableTensor(CPU);
+    auto* tensor_out = tensor_blob.template GetMutable<Tensor<CPUContext>>();
 
     if (base->Type().Match<std::string>()) {
       doStore<std::string>(base, tensor_out);
@@ -369,7 +367,7 @@ class IndexSerializer : public BlobSerializerBase {
         tensor_out->size() <= std::numeric_limits<int32_t>::max(),
         "Index too large to be serialized.");
     BlobProto blob_proto;
-    TensorSerializer ser;
+    TensorSerializer<CPUContext> ser;
     ser.Serialize(
         *tensor_out, name, blob_proto.mutable_tensor(), 0, tensor_out->size());
     blob_proto.set_name(name);
@@ -384,7 +382,9 @@ class IndexSerializer : public BlobSerializerBase {
 
  private:
   template <typename T>
-  void doStore(const std::unique_ptr<IndexBase>& base, Tensor* tensor_out) {
+  void doStore(
+      const std::unique_ptr<IndexBase>& base,
+      Tensor<CPUContext>* tensor_out) {
     auto* dict = dynamic_cast_if_rtti<Index<T>*>(base.get());
     CAFFE_ENFORCE(dict, "Wrong dictionary type.");
     dict->Store(tensor_out);
@@ -394,7 +394,7 @@ class IndexSerializer : public BlobSerializerBase {
 class IndexDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override {
-    TensorDeserializer deser;
+    TensorDeserializer<CPUContext> deser;
     Blob tensor_blob;
     deser.Deserialize(proto, &tensor_blob);
 
@@ -403,7 +403,7 @@ class IndexDeserializer : public BlobDeserializerBase {
     bool isFrozen{false};
     is >> maxElements >> isFrozen;
 
-    auto& tensor_in = tensor_blob.template Get<Tensor>();
+    auto& tensor_in = tensor_blob.template Get<Tensor<CPUContext>>();
     auto* base = blob->template GetMutable<std::unique_ptr<IndexBase>>();
 
     if (tensor_in.IsType<std::string>()) {
@@ -426,7 +426,7 @@ class IndexDeserializer : public BlobDeserializerBase {
   void doLoad(
       std::unique_ptr<IndexBase>* base,
       int64_t maxElements,
-      const Tensor& tensor_in) {
+      const Tensor<CPUContext>& tensor_in) {
     base->reset(new Index<T>(maxElements));
     auto* dict = dynamic_cast_if_rtti<Index<T>*>(base->get());
     dict->Load(tensor_in.data<T>(), tensor_in.size());
diff --git a/caffe2/operators/instance_norm_op.cu b/caffe2/operators/instance_norm_op.cu
index 8796684c6f237..87532066278b2 100644
--- a/caffe2/operators/instance_norm_op.cu
+++ b/caffe2/operators/instance_norm_op.cu
@@ -206,9 +206,9 @@ bool InstanceNormOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const auto input_data = input.data<float>();
   const auto scale_data = scale.data<float>();
   const auto bias_data = bias.data<float>();
-  auto output_data = output->template mutable_data<float>();
-  auto mean_data = mean->template mutable_data<float>();
-  auto inv_stdev_data = inv_stdev->template mutable_data<float>();
+  auto output_data = output->mutable_data<float>();
+  auto mean_data = mean->mutable_data<float>();
+  auto inv_stdev_data = inv_stdev->mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
@@ -283,9 +283,9 @@ bool InstanceNormOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const auto input_data = input.data<float>();
   const auto scale_data = scale.data<float>();
   const auto bias_data = bias.data<float>();
-  auto output_data = output->template mutable_data<float>();
-  auto mean_data = mean->template mutable_data<float>();
-  auto inv_stdev_data = inv_stdev->template mutable_data<float>();
+  auto output_data = output->mutable_data<float>();
+  auto mean_data = mean->mutable_data<float>();
+  auto inv_stdev_data = inv_stdev->mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
@@ -370,9 +370,9 @@ bool InstanceNormGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const auto bias_data = bias.data<float>();
   const auto output_grad_data = output_grad.data<float>();
 
-  auto input_grad_data = input_grad->template mutable_data<float>();
-  auto scale_grad_data = scale_grad->template mutable_data<float>();
-  auto bias_grad_data = bias_grad->template mutable_data<float>();
+  auto input_grad_data = input_grad->mutable_data<float>();
+  auto scale_grad_data = scale_grad->mutable_data<float>();
+  auto bias_grad_data = bias_grad->mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
@@ -501,9 +501,9 @@ bool InstanceNormGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const auto bias_data = bias.data<float>();
   const auto output_grad_data = output_grad.data<float>();
 
-  auto input_grad_data = input_grad->template mutable_data<float>();
-  auto scale_grad_data = scale_grad->template mutable_data<float>();
-  auto bias_grad_data = bias_grad->template mutable_data<float>();
+  auto input_grad_data = input_grad->mutable_data<float>();
+  auto scale_grad_data = scale_grad->mutable_data<float>();
+  auto bias_grad_data = bias_grad->mutable_data<float>();
 
   const auto dim = H * W;
   const auto N_stride = C * H * W;
diff --git a/caffe2/operators/instance_norm_op.h b/caffe2/operators/instance_norm_op.h
index 7435c7c8b43c0..90f11b3d49c08 100644
--- a/caffe2/operators/instance_norm_op.h
+++ b/caffe2/operators/instance_norm_op.h
@@ -40,8 +40,8 @@ class InstanceNormOp : public Operator<Context> {
   StorageOrder order_;
 
   // temp results that get passed to the gradient, but are otherwise stored here
-  Tensor mean_{Context::GetDeviceType()};
-  Tensor inv_stdev_{Context::GetDeviceType()};
+  Tensor<Context> mean_;
+  Tensor<Context> inv_stdev_;
 
   INPUT_TAGS(INPUT, SCALE, BIAS);
   OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
@@ -81,8 +81,8 @@ class InstanceNormGradientOp : public Operator<Context> {
 
   // temp results that could get passed through to this gradient, but if not,
   // are stored here
-  Tensor mean_{Context::GetDeviceType()};
-  Tensor inv_stdev_{Context::GetDeviceType()};
+  Tensor<Context> mean_;
+  Tensor<Context> inv_stdev_;
 
   INPUT_TAGS(INPUT, SCALE, BIAS, OUTPUT_GRAD, MEAN, INV_STDEV);
   OUTPUT_TAGS(INPUT_GRAD, SCALE_GRAD, BIAS_GRAD);
diff --git a/caffe2/operators/integral_image_op.cu b/caffe2/operators/integral_image_op.cu
index d8fa0b8f4dcc6..872d29bd0dddb 100644
--- a/caffe2/operators/integral_image_op.cu
+++ b/caffe2/operators/integral_image_op.cu
@@ -144,7 +144,7 @@ bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
       cols_out,
       chans,
       X.data<float>(),
-      Y->template mutable_data<float>());
+      Y->mutable_data<float>());
   // Integral image over columns of the integral image over rows
   const int col_pass_size = X.dim32(0) * chans * cols_out;
   ColPassKernel<<<
@@ -152,11 +152,7 @@ bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      col_pass_size,
-      rows_out,
-      cols_out,
-      chans,
-      Y->template mutable_data<float>());
+      col_pass_size, rows_out, cols_out, chans, Y->mutable_data<float>());
   return true;
 }
 
@@ -165,8 +161,8 @@ bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
   auto& X = Input(0); // Original input to "forward" op
   auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
-  auto* dX = Output(0); // Gradient of net w.r.t. input to
-                        // "forward" op (aka "gradInput")
+  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
+                        // (aka "gradInput")
 
   dX->ResizeLike(X);
   // Row pass reduces shape of dY from (N, C, H + 1, W + 1)
@@ -203,7 +199,7 @@ bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
       cols_out,
       chans,
       row_pass_buffer_.data<float>(),
-      dX->template mutable_data<float>());
+      dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/integral_image_op.h b/caffe2/operators/integral_image_op.h
index 16bf99b33db04..b8920d677de83 100644
--- a/caffe2/operators/integral_image_op.h
+++ b/caffe2/operators/integral_image_op.h
@@ -28,7 +28,7 @@ class IntegralImageGradientOp final : public Operator<Context> {
   bool RunOnDevice() override;
 
  protected:
-  Tensor row_pass_buffer_{Context::GetDeviceType()};
+  Tensor<Context> row_pass_buffer_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/jsd_op.cc b/caffe2/operators/jsd_op.cc
index 890fd96b6556d..44648838f4a2b 100644
--- a/caffe2/operators/jsd_op.cc
+++ b/caffe2/operators/jsd_op.cc
@@ -35,7 +35,7 @@ bool BernoulliJSDOp<float, CPUContext>::RunOnDevice() {
   L->ResizeLike(X);
   auto* x_data = X.data<float>();
   auto* t_data = T.data<float>();
-  auto* l_data = L->template mutable_data<float>();
+  auto* l_data = L->mutable_data<float>();
   for (int i = 0; i < N; i++) {
     auto p_mdl = x_data[i];
     auto p_emp = t_data[i];
@@ -57,7 +57,7 @@ bool BernoulliJSDGradientOp<float, CPUContext>::RunOnDevice() {
   auto* go_data = go.data<float>();
   auto* x_data = X.data<float>();
   auto* t_data = T.data<float>();
-  auto* gi_data = gi->template mutable_data<float>();
+  auto* gi_data = gi->mutable_data<float>();
   for (int i = 0; i < N; i++) {
     auto p_mdl = x_data[i];
     auto p_emp = t_data[i];
diff --git a/caffe2/operators/last_n_window_collector.cc b/caffe2/operators/last_n_window_collector.cc
index 25f06cf751dd4..c9d1a777538d1 100644
--- a/caffe2/operators/last_n_window_collector.cc
+++ b/caffe2/operators/last_n_window_collector.cc
@@ -94,7 +94,7 @@ class LastNWindowCollectorOp : public Operator<Context> {
 
     if (num_entries > numToCollect_) {
       // just copy the last N rows
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           input.meta(),
           num_to_copy * block_size,
           input_data + (num_entries - numToCollect_) * block_bytesize,
@@ -105,13 +105,13 @@ class LastNWindowCollectorOp : public Operator<Context> {
     auto start = *next_data;
     auto first_chunk_size =
         std::min<size_t>(num_to_copy + start, numToCollect_) - start;
-    context_.CopyItemsSameDevice(
+    context_.template CopyItems<Context, Context>(
         input.meta(),
         first_chunk_size * block_size,
         input_data,
         output_data + start * block_bytesize);
 
-    context_.CopyItemsSameDevice(
+    context_.template CopyItems<Context, Context>(
         input.meta(),
         (num_to_copy - first_chunk_size) * block_size,
         input_data + first_chunk_size * block_bytesize,
diff --git a/caffe2/operators/layer_norm_op.cu b/caffe2/operators/layer_norm_op.cu
index 2a909739c6f4c..bcec393b2ad95 100644
--- a/caffe2/operators/layer_norm_op.cu
+++ b/caffe2/operators/layer_norm_op.cu
@@ -45,7 +45,7 @@ void allocScratchAndReduce(
     float* output,
     int num_segments,
     int* seg_indices,
-    Tensor* scratch,
+    Tensor<CUDAContext>* scratch,
     cudaStream_t stream) {
   size_t temp_storage_bytes;
   cub::DeviceSegmentedReduce::Sum(
@@ -63,8 +63,8 @@ void allocScratchAndReduce(
   scratch->Resize(vector<size_t>{temp_storage_floats});
 
   cub::DeviceSegmentedReduce::Sum(
-      scratch->template mutable_data<float>(), // To retrieve required temporary
-                                               // storage size
+      scratch->mutable_data<float>(), // To retrieve required temporary storage
+                                      // size
       temp_storage_bytes, // size_t &temp_storage_bytes
       input, // InputIteratorT d_i
       output, // OutputIteratorT d_out
@@ -72,7 +72,7 @@ void allocScratchAndReduce(
       seg_indices, // int *d_begin_offsets
       seg_indices + 1, // int *d_end_offsets
       stream // cudaStream_t stream=0
-  );
+      );
 }
 
 } //  namespace
@@ -107,7 +107,7 @@ bool LayerNormOp<CUDAContext>::DoRunWithType<float>() {
       std::bind1st(std::multiplies<int>(), right));
 
   seg_indices_.Resize(vector<size_t>{segs.size()});
-  context_.CopyBytesFromCPU(
+  context_.CopyBytes<CPUContext, CUDAContext>(
       sizeof(int) * segs.size(),
       static_cast<void*>(segs.data()),
       static_cast<void*>(seg_indices_.mutable_data<int>()));
@@ -237,7 +237,7 @@ __global__ void gradientMegaKernel(
   }
 }
 
-#define PRINT(X, N, D) printTensor >> (X, N, D)
+#define PRINT(X, N, D) printTensor<<<1, 1, 0, context_.cuda_stream()>>>(X, N, D)
 
 } // namespace
 
@@ -272,7 +272,7 @@ bool LayerNormGradientOp<CUDAContext>::DoRunWithType<float>() {
       std::bind1st(std::multiplies<int>(), right));
 
   seg_indices_.Resize(vector<size_t>{segs.size()});
-  context_.CopyBytesFromCPU(
+  context_.CopyBytes<CPUContext, CUDAContext>(
       sizeof(int) * segs.size(),
       static_cast<void*>(segs.data()),
       static_cast<void*>(seg_indices_.mutable_data<int>()));
diff --git a/caffe2/operators/layer_norm_op.h b/caffe2/operators/layer_norm_op.h
index b6d032a8211aa..da74f83398812 100644
--- a/caffe2/operators/layer_norm_op.h
+++ b/caffe2/operators/layer_norm_op.h
@@ -28,8 +28,8 @@ class LayerNormOp : public Operator<Context> {
   int axis_;
   float epsilon_;
 
-  Tensor scratch_{Context::GetDeviceType()};
-  Tensor seg_indices_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
+  Tensor<Context> seg_indices_;
 };
 
 template <class Context>
@@ -53,11 +53,11 @@ class LayerNormGradientOp : public Operator<Context> {
   int axis_;
   float epsilon_;
 
-  Tensor scratch_{Context::GetDeviceType()};
-  Tensor gscratch_{Context::GetDeviceType()};
-  Tensor seg_indices_{Context::GetDeviceType()};
-  Tensor dstdev_{Context::GetDeviceType()};
-  Tensor dmean_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
+  Tensor<Context> gscratch_;
+  Tensor<Context> seg_indices_;
+  Tensor<Context> dstdev_;
+  Tensor<Context> dmean_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/leaky_relu_op.cc b/caffe2/operators/leaky_relu_op.cc
index 280630e7fbae7..dcf62084a1207 100644
--- a/caffe2/operators/leaky_relu_op.cc
+++ b/caffe2/operators/leaky_relu_op.cc
@@ -104,7 +104,7 @@ print("Y:\n", workspace.FetchBlob("Y"))
 )DOC")
     .Input(0, "X", "Input tensor of data to be operated on.")
     .Output(0, "Y", "Output tensor, calculated as described above.");
-
+    
 OPERATOR_SCHEMA(LeakyReluGradient)
     .NumInputs(2)
     .NumOutputs(1)
diff --git a/caffe2/operators/leaky_relu_op.cu b/caffe2/operators/leaky_relu_op.cu
index 95429e6d63793..ece07b786a510 100644
--- a/caffe2/operators/leaky_relu_op.cu
+++ b/caffe2/operators/leaky_relu_op.cu
@@ -36,7 +36,7 @@ bool LeakyReluOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), alpha_, X.data<float>(), Y->template mutable_data<float>());
+      X.size(), alpha_, X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
@@ -56,7 +56,7 @@ bool LeakyReluGradientOp<float, CUDAContext>::RunOnDevice() {
       alpha_,
       Y.data<float>(),
       dY.data<float>(),
-      dX->template mutable_data<float>());
+      dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/lengths_pad_op.h b/caffe2/operators/lengths_pad_op.h
index 9f65c39a262a7..e89f4fbbfc5f7 100644
--- a/caffe2/operators/lengths_pad_op.h
+++ b/caffe2/operators/lengths_pad_op.h
@@ -33,14 +33,13 @@ class LengthsPadOp : public Operator<Context> {
 
     // Context::CopyFrom and math::Sum need the same context to avoid race
     // conditions
-    // why? CPUContext is not used in Sum
-    lengths_host_.CopyFrom(lengths);
+    CPUContext cpuContext;
+    lengths_host_.CopyFrom(lengths, &cpuContext);
 
     auto lengths_size = lengths_host_.size();
-    auto* lengths_data = lengths_host_.template data<int32_t>();
+    auto* lengths_data = lengths_host_.data<int32_t>();
 
     int32_t total_length = 0;
-    CPUContext cpuContext;
     math::Sum<int32_t, CPUContext>(
         lengths_size, lengths_data, &total_length, &cpuContext);
 
@@ -66,7 +65,7 @@ class LengthsPadOp : public Operator<Context> {
           i,
           " is larger than target length");
 
-      context_.template CopySameDevice<T>(
+      context_.template Copy<T, Context, Context>(
           block_size * length, src_data, out_data);
 
       out_data += block_size * target_length_;
@@ -80,7 +79,7 @@ class LengthsPadOp : public Operator<Context> {
  private:
   double padding_value_;
   int target_length_;
-  Tensor lengths_host_{CPU};
+  TensorCPU lengths_host_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/lengths_tile_op.h b/caffe2/operators/lengths_tile_op.h
index 9d2b7a6f07122..fb525bd9b972f 100644
--- a/caffe2/operators/lengths_tile_op.h
+++ b/caffe2/operators/lengths_tile_op.h
@@ -23,13 +23,12 @@ class LengthsTileOp : public Operator<Context> {
 
     // Context::CopyFrom and math::Sum need the same context to avoid race
     // conditions
-    // why? CPUContext is not used in Sum
-    lengths_host_.CopyFrom(lengths);
+    CPUContext cpuContext;
+    lengths_host_.CopyFrom(lengths, &cpuContext);
     auto lengths_size = lengths_host_.size();
     auto* lengths_data = lengths_host_.data<int32_t>();
 
     int32_t total_length = 0;
-    CPUContext cpuContext;
     math::Sum<int32_t, CPUContext>(
         lengths_size, lengths_data, &total_length, &cpuContext);
 
@@ -45,7 +44,7 @@ class LengthsTileOp : public Operator<Context> {
       auto length = lengths_data[i];
       CAFFE_ENFORCE_GE(length, 0);
       for (int32_t j = 0; j < length; ++j) {
-        context_.CopyBytesSameDevice(block_bytesize, src, out);
+        context_.template CopyBytes<Context, Context>(block_bytesize, src, out);
         out += block_bytesize;
       }
       src += block_bytesize;
@@ -56,7 +55,7 @@ class LengthsTileOp : public Operator<Context> {
   INPUT_TAGS(DATA, LENGTHS);
 
  private:
-  Tensor lengths_host_{CPU};
+  TensorCPU lengths_host_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/listwise_l2r_op.cc b/caffe2/operators/listwise_l2r_op.cc
index d52d3bcce63b7..3940dfb2b1670 100644
--- a/caffe2/operators/listwise_l2r_op.cc
+++ b/caffe2/operators/listwise_l2r_op.cc
@@ -77,9 +77,9 @@ template <>
 float LambdaRankNdcgOp<float, CPUContext>::LambdaRankNdcgSession(
     int start_index,
     int end_index,
-    const Tensor& y,
-    const Tensor& r,
-    Tensor** dy) {
+    const Tensor<CPUContext>& y,
+    const Tensor<CPUContext>& r,
+    Tensor<CPUContext>** dy) {
   CAFFE_ENFORCE(start_index >= 0);
   CAFFE_ENFORCE(start_index < y.size());
   const auto* y_data = y.template data<float>();
diff --git a/caffe2/operators/listwise_l2r_op.h b/caffe2/operators/listwise_l2r_op.h
index 9564222d473cc..ae1aca2c16436 100644
--- a/caffe2/operators/listwise_l2r_op.h
+++ b/caffe2/operators/listwise_l2r_op.h
@@ -29,16 +29,16 @@ class LambdaRankNdcgOp final : public Operator<Context> {
   float LambdaRankNdcgSession(
       int start_index,
       int end_index,
-      const Tensor& y,
-      const Tensor& r,
-      Tensor** dy);
+      const Tensor<CPUContext>& y,
+      const Tensor<CPUContext>& r,
+      Tensor<CPUContext>** dy);
   bool use_ndcg_as_loss_;
-  Tensor gain_{Context::GetDeviceType()};
-  Tensor discount_{Context::GetDeviceType()};
-  Tensor rank_idx_{Context::GetDeviceType()};
-  Tensor ideal_idx_{Context::GetDeviceType()};
-  Tensor lambda_{Context::GetDeviceType()};
-  Tensor inv_log_i_{Context::GetDeviceType()};
+  Tensor<Context> gain_;
+  Tensor<Context> discount_;
+  Tensor<Context> rank_idx_;
+  Tensor<Context> ideal_idx_;
+  Tensor<Context> lambda_;
+  Tensor<Context> inv_log_i_;
 };
 
 template <typename T, class Context>
diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h
index de8380e45e98c..4b21fb2660d18 100644
--- a/caffe2/operators/load_save_op.h
+++ b/caffe2/operators/load_save_op.h
@@ -536,7 +536,7 @@ class CheckpointOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     int64_t iter =
-        OperatorBase::Input<Tensor>(0, CPU).template data<int64_t>()[0];
+        OperatorBase::Input<TensorCPU>(0).template data<int64_t>()[0];
     if (iter % every_ == 0) {
       GetMutableArgument("db", true, &save_op_def_)
           ->set_s(FormatString(db_pattern_, iter));
diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc
index 1cba60e86d978..334570306c4f9 100644
--- a/caffe2/operators/local_response_normalization_op.cc
+++ b/caffe2/operators/local_response_normalization_op.cc
@@ -15,7 +15,7 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
   const int image_size = C * H * W;
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
 
   if (OutputSize() > 1) {
     scale_ = Output(1);
@@ -25,10 +25,11 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->template mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
   math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
-  Tensor padded_square(vector<TIndex>{C + size_ - 1, H, W}, CPU);
-  float* padded_square_data = padded_square.template mutable_data<float>();
+  TensorCPU padded_square(
+      vector<TIndex>{C + size_ - 1, H, W});
+  float* padded_square_data = padded_square.mutable_data<float>();
   math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                                &context_);
   const float alpha_over_size = alpha_ / size_;
@@ -47,7 +48,7 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     for (int c = 1; c < C; ++c) {
       float* this_scale_slice = scale_data + n * image_size + c * H * W;
       // copy previous scale
-      context_.CopyFromCPU<float>(
+      context_.Copy<float, CPUContext, CPUContext>(
           H * W, this_scale_slice - H * W, this_scale_slice);
       // add head
       math::Axpy<float, CPUContext>(
@@ -79,7 +80,7 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   const int num_rows = N * H * W;
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
 
   if (OutputSize() > 1) {
     scale_ = Output(1);
@@ -89,10 +90,10 @@ bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->template mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
 
-  Tensor padded_square(vector<TIndex>(1, C + size_ - 1), CPU);
-  float* padded_square_data = padded_square.template mutable_data<float>();
+  TensorCPU padded_square(vector<TIndex>(1, C + size_ - 1));
+  float* padded_square_data = padded_square.mutable_data<float>();
   math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                                &context_);
   const float alpha_over_size = alpha_ / size_;
@@ -142,12 +143,13 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     scale_ = &local_scale_tensor_;
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->template mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
 
-  Tensor padded_ratio(vector<TIndex>{C + size_ - 1, H, W}, CPU);
-  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
+  TensorCPU padded_ratio(
+      vector<TIndex>{C + size_ - 1, H, W});
+  float* padded_ratio_data = padded_ratio.mutable_data<float>();
   // Compute scale(copied from LRNOp) - reusing padded_ratio
   math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
   math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
@@ -168,7 +170,7 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
     for (int c = 1; c < C; ++c) {
       float* this_scale_slice = scale_data + n * image_size + c * H * W;
       // copy previous scale
-      context_.CopyFromCPU<float>(
+      context_.Copy<float, CPUContext, CPUContext>(
           H * W, this_scale_slice - H * W, this_scale_slice);
       // add head
       math::Axpy<float, CPUContext>(
@@ -183,8 +185,9 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
 
   math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                                &context_);
-  Tensor accum_ratio(vector<TIndex>{H, W}, CPU);
-  float* accum_ratio_data = accum_ratio.template mutable_data<float>();
+  TensorCPU accum_ratio(vector<TIndex>{H, W});
+  float* accum_ratio_data = accum_ratio.mutable_data<float>();
+
 
   const float cache_ratio = 2. * alpha_ * beta_ / size_;
   const int inverse_pre_pad = size_ - (size_ + 1) / 2;
@@ -243,9 +246,9 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
     scale_ = &local_scale_tensor_;
   }
   scale_->ResizeLike(X);
-  Tensor padded_ratio(vector<TIndex>(1, C + size_ - 1), CPU);
-  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
-  float* scale_data = scale_->template mutable_data<float>();
+  TensorCPU padded_ratio(vector<TIndex>(1, C + size_ - 1));
+  float* padded_ratio_data = padded_ratio.mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
   // Compute scale(copied from LRNOp) - reusing padded_ratio
   math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
   math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
@@ -275,7 +278,7 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   const float* Ydata = Y.data<float>();
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   for (int n = 0; n < num_rows; ++n) {
     const int offset = n * C;
     for (int c = 0; c < C; ++c) {
diff --git a/caffe2/operators/local_response_normalization_op.cu b/caffe2/operators/local_response_normalization_op.cu
index edcd8e878e774..a6a8f5011e33e 100644
--- a/caffe2/operators/local_response_normalization_op.cu
+++ b/caffe2/operators/local_response_normalization_op.cu
@@ -186,7 +186,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const int W = X.dim32(3);
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   if (OutputSize() > 1) {
     scale_ = Output(1);
   } else {
@@ -195,7 +195,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->template mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
 
   int n_threads = N * H * W;
   LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
@@ -219,7 +219,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const int C = X.dim32(3);
   const float* Xdata = X.data<float>();
   Y->ResizeLike(X);
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   if (OutputSize() > 1) {
     scale_ = Output(1);
   } else {
@@ -228,7 +228,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
     }
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->template mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
 
   int n_threads = X.size();
   LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
@@ -263,13 +263,13 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
     scale_ = &local_scale_tensor_;
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->template mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
   int n_threads = N * H * W;
   LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                         0, context_.cuda_stream()>>>(
       n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
 
   LRNComputeDiffNCHW<float><<<CAFFE_GET_BLOCKS(n_threads),
                               CAFFE_CUDA_NUM_THREADS,
@@ -301,30 +301,19 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   }
   scale_->ResizeLike(X);
 
-  float* scale_data = scale_->template mutable_data<float>();
+  float* scale_data = scale_->mutable_data<float>();
   int n_threads = X.size();
   LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                         0, context_.cuda_stream()>>>(
       n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
 
-  LRNComputeDiffNHWC<float>
-      <<<CAFFE_GET_BLOCKS(X.size()),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          X.size(),
-          X.data<float>(),
-          Y.data<float>(),
-          scale_data,
-          dY.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          size_,
-          -beta_,
-          2.f * alpha_ * beta_ / size_,
-          dX->template mutable_data<float>());
+  LRNComputeDiffNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
+                              CAFFE_CUDA_NUM_THREADS, 0,
+                              context_.cuda_stream()>>>(
+      X.size(), X.data<float>(), Y.data<float>(), scale_data,
+      dY.data<float>(),
+      X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3), size_, -beta_,
+      2.f * alpha_ * beta_ / size_, dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/local_response_normalization_op.h b/caffe2/operators/local_response_normalization_op.h
index 9b7f8da8095ef..79f388caea23f 100644
--- a/caffe2/operators/local_response_normalization_op.h
+++ b/caffe2/operators/local_response_normalization_op.h
@@ -66,8 +66,8 @@ class LRNOp final : public LRNOpBase<T, Context> {
  protected:
   // Input: X; Output: Y, scale.
   OUTPUT_TAGS(OUTPUT, SCALE);
-  Tensor* scale_ = nullptr;
-  Tensor local_scale_tensor_{Context::GetDeviceType()};
+  Tensor<Context>* scale_ = nullptr;
+  Tensor<Context> local_scale_tensor_;
 };
 
 template <typename T, class Context>
@@ -83,8 +83,8 @@ class LRNGradientOp final : public LRNOpBase<T, Context> {
  protected:
   // Input: X, Y, scale, dY; Output: dX
   INPUT_TAGS(INPUT, OUTPUT, SCALE, OUTPUT_GRAD);
-  Tensor* scale_ = nullptr;
-  Tensor local_scale_tensor_{Context::GetDeviceType()};
+  Tensor<Context>* scale_ = nullptr;
+  Tensor<Context> local_scale_tensor_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/locally_connected_op.h b/caffe2/operators/locally_connected_op.h
index cf5bf63e6893e..7b4bbd4de87db 100644
--- a/caffe2/operators/locally_connected_op.h
+++ b/caffe2/operators/locally_connected_op.h
@@ -37,9 +37,9 @@ class LocallyConnectedOp final : public ConvPoolOpBase<Context> {
       const T* filter_data,
       const T* bias_data,
       T* Y_data,
-      Tensor* column_buffer,
-      Tensor* column_transposed_buffer,
-      Tensor* output_buffer);
+      Tensor<Context>* column_buffer,
+      Tensor<Context>* column_transposed_buffer,
+      Tensor<Context>* output_buffer);
 
   void RunOnDeviceWithOrderNHWCImpl(
       const lc_op_util::ShapeParams& shape,
@@ -47,16 +47,16 @@ class LocallyConnectedOp final : public ConvPoolOpBase<Context> {
       const T* filter_data,
       const T* bias_data,
       T* Y_data,
-      Tensor* column_buffer,
-      Tensor* column_transposed_buffer,
-      Tensor* Y_transposed_buffer);
+      Tensor<Context>* column_buffer,
+      Tensor<Context>* column_transposed_buffer,
+      Tensor<Context>* Y_transposed_buffer);
 
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
 
   // Buffer.
-  Tensor column_buffer_{Context::GetDeviceType()};
-  Tensor column_transposed_buffer_{Context::GetDeviceType()};
-  Tensor Y_transposed_buffer_{Context::GetDeviceType()};
+  Tensor<Context> column_buffer_;
+  Tensor<Context> column_transposed_buffer_;
+  Tensor<Context> Y_transposed_buffer_;
 
   // Input: X, W, b
   // Output: Y
@@ -93,9 +93,9 @@ class LocallyConnectedGradientOp final : public ConvPoolOpBase<Context> {
       T* dfilter_data,
       T* dX_data,
       T* dbias_data,
-      Tensor* column_buffer,
-      Tensor* column_transposed_buffer,
-      Tensor* dY_transposed_buffer);
+      Tensor<Context>* column_buffer,
+      Tensor<Context>* column_transposed_buffer,
+      Tensor<Context>* dY_transposed_buffer);
 
   void RunOnDeviceWithOrderNHWCImpl(
       const lc_op_util::ShapeParams& shape,
@@ -105,18 +105,18 @@ class LocallyConnectedGradientOp final : public ConvPoolOpBase<Context> {
       T* dfilter_data,
       T* dX_data,
       T* dbias_data,
-      Tensor* column_buffer,
-      Tensor* column_transposed_buffer,
-      Tensor* dY_transposed_buffer);
+      Tensor<Context>* column_buffer,
+      Tensor<Context>* column_transposed_buffer,
+      Tensor<Context>* dY_transposed_buffer);
 
   const bool no_bias_;
 
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
 
   // Buffer.
-  Tensor column_buffer_{Context::GetDeviceType()};
-  Tensor column_transposed_buffer_{Context::GetDeviceType()};
-  Tensor dY_transposed_buffer_{Context::GetDeviceType()};
+  Tensor<Context> column_buffer_;
+  Tensor<Context> column_transposed_buffer_;
+  Tensor<Context> dY_transposed_buffer_;
 
   // input: X, W, dY
   // output: dW, db, and optionally dX
diff --git a/caffe2/operators/locally_connected_op_impl.h b/caffe2/operators/locally_connected_op_impl.h
index 76d7228c342ac..4d7762fccbbb3 100644
--- a/caffe2/operators/locally_connected_op_impl.h
+++ b/caffe2/operators/locally_connected_op_impl.h
@@ -189,9 +189,9 @@ void LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNCHWImpl(
     const T* filter_data,
     const T* bias_data,
     T* Y_data,
-    Tensor* column_buffer,
-    Tensor* column_transposed_buffer,
-    Tensor* Y_transposed_buffer) {
+    Tensor<Context>* column_buffer,
+    Tensor<Context>* column_transposed_buffer,
+    Tensor<Context>* Y_transposed_buffer) {
   const int input_stride = shape.C / group_ * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
@@ -292,9 +292,9 @@ void LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNHWCImpl(
     const T* filter_data,
     const T* bias_data,
     T* Y_data,
-    Tensor* column_buffer,
-    Tensor* column_transposed_buffer,
-    Tensor* Y_transposed_buffer) {
+    Tensor<Context>* column_buffer,
+    Tensor<Context>* column_transposed_buffer,
+    Tensor<Context>* Y_transposed_buffer) {
   const int input_stride = shape.C * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
@@ -550,9 +550,9 @@ void LocallyConnectedGradientOp<T, Context>::RunOnDeviceWithOrderNCHWImpl(
     T* dfilter_data,
     T* dX_data,
     T* dbias_data,
-    Tensor* column_buffer,
-    Tensor* column_transposed_buffer,
-    Tensor* dY_transposed_buffer) {
+    Tensor<Context>* column_buffer,
+    Tensor<Context>* column_transposed_buffer,
+    Tensor<Context>* dY_transposed_buffer) {
   const int input_stride = shape.C * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
@@ -726,9 +726,9 @@ void LocallyConnectedGradientOp<T, Context>::RunOnDeviceWithOrderNHWCImpl(
     T* dfilter_data,
     T* dX_data,
     T* dbias_data,
-    Tensor* column_buffer,
-    Tensor* column_transposed_buffer,
-    Tensor* dY_transposed_buffer) {
+    Tensor<Context>* column_buffer,
+    Tensor<Context>* column_transposed_buffer,
+    Tensor<Context>* dY_transposed_buffer) {
   const int input_stride = shape.C * shape.input_image_size;
   const int column_stride = shape.kernel_size * shape.output_image_size;
   column_buffer->Resize(shape.column_dims);
diff --git a/caffe2/operators/logit_op.cu b/caffe2/operators/logit_op.cu
index c431e5b519eec..d2e8351fcac95 100644
--- a/caffe2/operators/logit_op.cu
+++ b/caffe2/operators/logit_op.cu
@@ -54,11 +54,7 @@ bool LogitGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      n,
-      X.data<float>(),
-      dY.data<float>(),
-      eps_,
-      dX->template mutable_data<float>());
+      n, X.data<float>(), dY.data<float>(), eps_, dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc
index f877786648350..c3795a0bb216e 100644
--- a/caffe2/operators/lp_pool_op.cc
+++ b/caffe2/operators/lp_pool_op.cc
@@ -17,7 +17,7 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
   const auto inv_p = 1.0 / p;
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
   // The main loop
   int channels = X.dim32(1);
@@ -67,7 +67,7 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
   const auto inv_p = 1.0 / p;
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
   // The main loop
   int pooled_height = Y->dim32(1);
@@ -115,11 +115,11 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
   // TODO(Yangqing): Add shape checks.
   dX->ResizeLike(X);
   math::Set<float, CPUContext>(
-      X.size(), 0, dX->template mutable_data<float>(), &context_);
+      X.size(), 0, dX->mutable_data<float>(), &context_);
   const float* dYdata = dY.data<float>();
   const float* Xdata = X.data<float>();
   const float* Ydata = Y.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
 
   int channels = X.dim32(1);
   CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
@@ -171,9 +171,9 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
   // TODO(Yangqing): Add shape checks.
   dX->ResizeLike(X);
   math::Set<float, CPUContext>(
-      X.size(), 0, dX->template mutable_data<float>(), &context_);
+      X.size(), 0, dX->mutable_data<float>(), &context_);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   const float* Xdata = X.data<float>();
   const float* Ydata = Y.data<float>();
   // The main loop
diff --git a/caffe2/operators/lp_pool_op.cu b/caffe2/operators/lp_pool_op.cu
index 1e5b66c5db2bb..53f6110294426 100644
--- a/caffe2/operators/lp_pool_op.cu
+++ b/caffe2/operators/lp_pool_op.cu
@@ -242,27 +242,27 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNCHW() {
   auto* Y = Output(0);
   ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
   int output_size = Y->size();
-  LpPoolForwardNCHW<float>
-      <<<CAFFE_GET_BLOCKS(output_size),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          Y->dim32(2),
-          Y->dim32(3),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          Y->template mutable_data<float>(),
-          OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolForwardNCHW<float><<<
+      CAFFE_GET_BLOCKS(output_size),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      output_size,
+      X.data<float>(),
+      X.dim32(0),
+      X.dim32(1),
+      X.dim32(2),
+      X.dim32(3),
+      Y->dim32(2),
+      Y->dim32(3),
+      kernel_h(),
+      kernel_w(),
+      stride_h(),
+      stride_w(),
+      pad_t(),
+      pad_l(),
+      Y->mutable_data<float>(),
+      OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
@@ -272,27 +272,27 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNHWC() {
   auto* Y = Output(0);
   ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
   int output_size = Y->size();
-  LpPoolForwardNHWC<float>
-      <<<CAFFE_GET_BLOCKS(output_size),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          Y->dim32(1),
-          Y->dim32(2),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          Y->template mutable_data<float>(),
-          OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolForwardNHWC<float><<<
+      CAFFE_GET_BLOCKS(output_size),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      output_size,
+      X.data<float>(),
+      X.dim32(0),
+      X.dim32(1),
+      X.dim32(2),
+      X.dim32(3),
+      Y->dim32(1),
+      Y->dim32(2),
+      kernel_h(),
+      kernel_w(),
+      stride_h(),
+      stride_w(),
+      pad_t(),
+      pad_l(),
+      Y->mutable_data<float>(),
+      OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
@@ -306,29 +306,29 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
   auto* dX = Output(0);
   dX->ResizeLike(X);
   ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
-  LpPoolBackwardNCHW<float>
-      <<<CAFFE_GET_BLOCKS(X.size()),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          Y.data<float>(),
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          dY.dim32(2),
-          dY.dim32(3),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          dX->template mutable_data<float>(),
-          OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolBackwardNCHW<float><<<
+      CAFFE_GET_BLOCKS(X.size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      X.size(),
+      dY.data<float>(),
+      Y.data<float>(),
+      X.data<float>(),
+      X.dim32(0),
+      X.dim32(1),
+      X.dim32(2),
+      X.dim32(3),
+      dY.dim32(2),
+      dY.dim32(3),
+      kernel_h(),
+      kernel_w(),
+      stride_h(),
+      stride_w(),
+      pad_t(),
+      pad_l(),
+      dX->mutable_data<float>(),
+      OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
@@ -342,29 +342,29 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
   auto* dX = Output(0);
   dX->ResizeLike(X);
   ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
-  LpPoolBackwardNHWC<float>
-      <<<CAFFE_GET_BLOCKS(X.size()),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          X.size(),
-          dY.data<float>(),
-          Y.data<float>(),
-          X.data<float>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          dY.dim32(1),
-          dY.dim32(2),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          dX->template mutable_data<float>(),
-          OperatorBase::GetSingleArgument<float>("p", 2.0));
+  LpPoolBackwardNHWC<float><<<
+      CAFFE_GET_BLOCKS(X.size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      X.size(),
+      dY.data<float>(),
+      Y.data<float>(),
+      X.data<float>(),
+      X.dim32(0),
+      X.dim32(1),
+      X.dim32(2),
+      X.dim32(3),
+      dY.dim32(1),
+      dY.dim32(2),
+      kernel_h(),
+      kernel_w(),
+      stride_h(),
+      stride_w(),
+      pad_t(),
+      pad_l(),
+      dX->mutable_data<float>(),
+      OperatorBase::GetSingleArgument<float>("p", 2.0));
   return true;
 }
 
diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc
index 6af404d115358..f79d51ad51c44 100644
--- a/caffe2/operators/lpnorm_op.cc
+++ b/caffe2/operators/lpnorm_op.cc
@@ -15,12 +15,12 @@ bool LpNormOp<float, CPUContext>::RunOnDevice() {
   const float size = average_ ? (float)X.size() : 1.0f;
   CAFFE_ENFORCE_GT(size, 0);
   if (p_ == 1) {
-    *(norm->template mutable_data<float>()) =
+    *(norm->mutable_data<float>()) =
         (ConstEigenVectorMap<float>(X_data, X.size()).array()).abs().sum() /
         size;
     // L1(x) = sum(|x|), L1_average(x) = sum(\x\) / x.size()
   } else if (p_ == 2) {
-    *(norm->template mutable_data<float>()) =
+    *(norm->mutable_data<float>()) =
         (ConstEigenVectorMap<float>(X_data, X.size()).array()).square().sum() /
         size;
     // L2(x) = (sum(|x|^2)), L2_average(x) = sum(|x|^2) / x.size()
@@ -43,17 +43,15 @@ bool LpNormGradientOp<float, CPUContext>::RunOnDevice() {
     for (int i = 0; i < X.size(); ++i) {
       float temp = (X.data<float>())[i];
       if (temp < -kEps) {
-        dX->template mutable_data<float>()[i] =
-            -(dnorm.data<float>())[0] / size;
+        dX->mutable_data<float>()[i] = -(dnorm.data<float>())[0] / size;
       } else if (temp > kEps) {
-        dX->template mutable_data<float>()[i] = (dnorm.data<float>())[0] / size;
+        dX->mutable_data<float>()[i] = (dnorm.data<float>())[0] / size;
       } else {
-        dX->template mutable_data<float>()[i] = 0;
+        dX->mutable_data<float>()[i] = 0;
       }
     }
   } else if (p_ == 2) {
-    EigenVectorMap<float>(dX->template mutable_data<float>(), X.size())
-        .array() =
+    EigenVectorMap<float>(dX->mutable_data<float>(), X.size()).array() =
         ConstEigenVectorMap<float>(X.data<float>(), X.size()).array() * 2.0f *
         ((dnorm.data<float>())[0] / size);
   }
diff --git a/caffe2/operators/lstm_unit_op.h b/caffe2/operators/lstm_unit_op.h
index 31ab070f8d26a..73afcbe08684a 100644
--- a/caffe2/operators/lstm_unit_op.h
+++ b/caffe2/operators/lstm_unit_op.h
@@ -176,7 +176,7 @@ class LSTMUnitOp : public Operator<Context> {
     }
 
     const auto t = static_cast<OperatorBase*>(this)
-                       ->Input<Tensor>(TIMESTEP, CPU)
+                       ->Input<Tensor<CPUContext>>(TIMESTEP)
                        .template data<int32_t>()[0];
     Output(CELL_T)->ResizeLike(Input(CELL_T_M_1));
     auto* C = Output(CELL_T)->template mutable_data<T>();
@@ -253,7 +253,7 @@ class LSTMUnitGradientOp : public Operator<Context> {
     const auto* C_prev = Input(CELL_T_M_1).template data<T>();
     const auto* X = Input(GATES).template data<T>();
     const auto t = static_cast<OperatorBase*>(this)
-                       ->Input<Tensor>(TIMESTEP, CPU)
+                       ->Input<Tensor<CPUContext>>(TIMESTEP)
                        .template data<int32_t>()[0];
     const auto* C = Input(CELL_T).template data<T>();
     const auto* H = Input(HIDDEN_T).template data<T>();
diff --git a/caffe2/operators/map_ops.h b/caffe2/operators/map_ops.h
index 5a436d7f2502a..8d1a18f8fc135 100644
--- a/caffe2/operators/map_ops.h
+++ b/caffe2/operators/map_ops.h
@@ -201,9 +201,9 @@ class MapSerializer : public BlobSerializerBase {
     CAFFE_ENFORCE(blob.IsType<MapType>());
     const MapType& map_data = blob.template Get<MapType>();
     TIndex sz = map_data.size();
-    Tensor key_tensor(CPU);
+    Tensor<CPUContext> key_tensor;
     key_tensor.Resize(sz);
-    Tensor value_tensor(CPU);
+    Tensor<CPUContext> value_tensor;
     value_tensor.Resize(sz);
     auto* key_data = key_tensor.mutable_data<KEY_T>();
     auto* value_data = value_tensor.mutable_data<VALUE_T>();
@@ -215,7 +215,7 @@ class MapSerializer : public BlobSerializerBase {
     }
 
     TensorProtos tensor_protos;
-    TensorSerializer ser;
+    TensorSerializer<CPUContext> ser;
     ser.Serialize(
         key_tensor, name, tensor_protos.add_protos(), 0, key_tensor.size());
     ser.Serialize(
@@ -239,8 +239,8 @@ class MapDeserializer : public BlobDeserializerBase {
     CAFFE_ENFORCE(
         tensor_protos.ParseFromString(proto.content()),
         "Fail to parse TensorProtos");
-    TensorDeserializer deser;
-    Tensor key_tensor(CPU), value_tensor(CPU);
+    TensorDeserializer<CPUContext> deser;
+    Tensor<CPUContext> key_tensor, value_tensor;
     deser.Deserialize(tensor_protos.protos(0), &key_tensor);
     deser.Deserialize(tensor_protos.protos(1), &value_tensor);
     auto* key_data = key_tensor.data<KEY_T>();
diff --git a/caffe2/operators/margin_ranking_criterion_op.cc b/caffe2/operators/margin_ranking_criterion_op.cc
index 30b4f2731af5f..b699c4bb7b756 100644
--- a/caffe2/operators/margin_ranking_criterion_op.cc
+++ b/caffe2/operators/margin_ranking_criterion_op.cc
@@ -23,7 +23,7 @@ bool MarginRankingCriterionOp<CPUContext>::RunOnDevice() {
   const float* X1data = X1.data<float>();
   const float* X2data = X2.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output = loss->template mutable_data<float>();
+  float* output = loss->mutable_data<float>();
   for (int i = 0; i < X1.size(); ++i) {
     output[i] = std::max(-Ydata[i] * (X1data[i] - X2data[i]) + margin_, 0.f);
   }
@@ -47,8 +47,8 @@ bool MarginRankingCriterionGradientOp<CPUContext>::RunOnDevice() {
   const int* Ydata = Y.data<int>();
   const float* dLoss_data = dLoss.data<float>();
 
-  float* dX1_data = dX1->template mutable_data<float>();
-  float* dX2_data = dX2->template mutable_data<float>();
+  float* dX1_data = dX1->mutable_data<float>();
+  float* dX2_data = dX2->mutable_data<float>();
   for (int i = 0; i < X1.size(); ++i) {
     auto dist = -Ydata[i] * (X1data[i] - X2data[i]) + margin_;
     if (dist < 0.f) {
@@ -72,9 +72,9 @@ OPERATOR_SCHEMA(MarginRankingCriterion)
     .NumInputs(3)
     .NumOutputs(1)
     .SetDoc(R"DOC(
-MarginRankingCriterion takes two input data X1 (Tensor),
-X2 (Tensor), and label Y (Tensor) to produce the
-loss (Tensor) where the loss function,
+MarginRankingCriterion takes two input data X1 (Tensor<float>),
+X2 (Tensor<float>), and label Y (Tensor<int>) to produce the
+loss (Tensor<float>) where the loss function,
 loss(X1, X2, Y) = max(0, -Y * (X1 - X2) + margin), is applied to
 the tensor elementwise.
 
diff --git a/caffe2/operators/margin_ranking_criterion_op.cu b/caffe2/operators/margin_ranking_criterion_op.cu
index 5593a1db8cd49..b01513a40b4f7 100644
--- a/caffe2/operators/margin_ranking_criterion_op.cu
+++ b/caffe2/operators/margin_ranking_criterion_op.cu
@@ -45,7 +45,7 @@ bool MarginRankingCriterionOp<CUDAContext>::RunOnDevice() {
   const float* X1data = X1.data<float>();
   const float* X2data = X2.data<float>();
   const int* Ydata = Y.data<int>();
-  float* output_data = loss->template mutable_data<float>();
+  float* output_data = loss->mutable_data<float>();
 
   MRCKernel<<<CAFFE_GET_BLOCKS(X1.size()), CAFFE_CUDA_NUM_THREADS,
               0, context_.cuda_stream()>>>(
@@ -70,8 +70,8 @@ bool MarginRankingCriterionGradientOp<CUDAContext>::RunOnDevice() {
   const int* Ydata = Y.data<int>();
   const float* dOutput_data = dOutput.data<float>();
 
-  float* dX1_data = dX1->template mutable_data<float>();
-  float* dX2_data = dX2->template mutable_data<float>();
+  float* dX1_data = dX1->mutable_data<float>();
+  float* dX2_data = dX2->mutable_data<float>();
   MRCGradientKernel<<<CAFFE_GET_BLOCKS(X1.size()), CAFFE_CUDA_NUM_THREADS,
                       0, context_.cuda_stream()>>>(
       X1.size(), Ydata, X1data, X2data,
diff --git a/caffe2/operators/max_pool_with_index.cu b/caffe2/operators/max_pool_with_index.cu
index eee10c488f5da..5ac3c58bb5f89 100644
--- a/caffe2/operators/max_pool_with_index.cu
+++ b/caffe2/operators/max_pool_with_index.cu
@@ -115,27 +115,27 @@ bool MaxPoolWithIndexOp::DoRunWithType() {
   int output_size = Y->size();
   mask->Resize(output_size);
 
-  MaxPoolForward<T>
-      <<<CAFFE_GET_BLOCKS(output_size),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          output_size,
-          X.data<T>(),
-          X.dim32(0),
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          Y->dim32(2),
-          Y->dim32(3),
-          kernel_h(),
-          kernel_w(),
-          stride_h(),
-          stride_w(),
-          pad_t(),
-          pad_l(),
-          Y->template mutable_data<T>(),
-          mask->template mutable_data<int>());
+  MaxPoolForward<T><<<
+      CAFFE_GET_BLOCKS(output_size),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      output_size,
+      X.data<T>(),
+      X.dim32(0),
+      X.dim32(1),
+      X.dim32(2),
+      X.dim32(3),
+      Y->dim32(2),
+      Y->dim32(3),
+      kernel_h(),
+      kernel_w(),
+      stride_h(),
+      stride_w(),
+      pad_t(),
+      pad_l(),
+      Y->mutable_data<T>(),
+      mask->mutable_data<int>());
   return true;
 }
 
diff --git a/caffe2/operators/mem_query_op.cu b/caffe2/operators/mem_query_op.cu
index e8351376f23e2..767be8af2c385 100644
--- a/caffe2/operators/mem_query_op.cu
+++ b/caffe2/operators/mem_query_op.cu
@@ -19,14 +19,12 @@ class GetGPUMemoryUsageOp final : public Operator<CUDAContext> {
 
     auto* stats = Output(0);
     stats->Resize(2, total_by_gpu.size());
-    context_.CopyFromCPU<long>(
-        total_by_gpu.size(),
-        total_by_gpu.data(),
-        stats->template mutable_data<long>());
-    context_.CopyFromCPU<long>(
+    context_.Copy<long, CPUContext, CUDAContext>(
+        total_by_gpu.size(), total_by_gpu.data(), stats->mutable_data<long>());
+    context_.Copy<long, CPUContext, CUDAContext>(
         max_by_gpu.size(),
         max_by_gpu.data(),
-        stats->template mutable_data<long>() + total_by_gpu.size());
+        stats->mutable_data<long>() + total_by_gpu.size());
     return true;
   }
 };
diff --git a/caffe2/operators/multi_class_accuracy_op.cc b/caffe2/operators/multi_class_accuracy_op.cc
index 581c034c2b8bd..9eda6fbe2c1b0 100644
--- a/caffe2/operators/multi_class_accuracy_op.cc
+++ b/caffe2/operators/multi_class_accuracy_op.cc
@@ -20,8 +20,8 @@ bool MultiClassAccuracyOp<float, CPUContext>::RunOnDevice() {
 
   const auto* Xdata = X.data<float>();
   const auto* labeldata = label.data<int>();
-  auto* accuracies = Y0->template mutable_data<float>();
-  auto* amounts = Y1->template mutable_data<int>();
+  auto* accuracies = Y0->mutable_data<float>();
+  auto* amounts = Y1->mutable_data<int>();
   std::fill(accuracies, accuracies + D, 0);
   std::fill(amounts, amounts + D, 0);
 
diff --git a/caffe2/operators/multi_class_accuracy_op.cu b/caffe2/operators/multi_class_accuracy_op.cu
index 275005be50889..f5672146fd418 100644
--- a/caffe2/operators/multi_class_accuracy_op.cu
+++ b/caffe2/operators/multi_class_accuracy_op.cu
@@ -51,8 +51,8 @@ bool MultiClassAccuracyOp<float, CUDAContext>::RunOnDevice() {
 
   const float* Xdata = X.data<float>();
   const int* labeldata = label.data<int>();
-  float* accuracies = Y0->template mutable_data<float>();
-  int* amounts = Y1->template mutable_data<int>();
+  float* accuracies = Y0->mutable_data<float>();
+  int* amounts = Y1->mutable_data<int>();
   math::Set<float, CUDAContext>(D, 0.0, accuracies, &context_);
   math::Set<int, CUDAContext>(D, 0, amounts, &context_);
 
diff --git a/caffe2/operators/norm_planar_yuv_op.cc b/caffe2/operators/norm_planar_yuv_op.cc
index c0f997484ca2a..ea3ccc222dc96 100644
--- a/caffe2/operators/norm_planar_yuv_op.cc
+++ b/caffe2/operators/norm_planar_yuv_op.cc
@@ -28,7 +28,7 @@ class NormalizePlanarYUVOp : public Operator<CPUContext> {
     CAFFE_ENFORCE(C == M.dim(1));
     CAFFE_ENFORCE(C == S.dim(1));
     const auto* Xdata = X.data<float>();
-    auto* Zdata = Z->template mutable_data<float>();
+    auto* Zdata = Z->mutable_data<float>();
 
     int offset = H * W;
     for (auto n = 0; n < N; n++) { // realistically N will always be 1
diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu
index 8a8adb6dbe313..dcffe02f650ab 100644
--- a/caffe2/operators/normalize_ops.cu
+++ b/caffe2/operators/normalize_ops.cu
@@ -112,12 +112,7 @@ bool NormalizeGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      M,
-      N,
-      SF,
-      X.data<float>(),
-      dY.data<float>(),
-      dX->template mutable_data<float>());
+      M, N, SF, X.data<float>(), dY.data<float>(), dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/numpy_tile_op.h b/caffe2/operators/numpy_tile_op.h
index 2413652e32771..66a5cc42393c3 100644
--- a/caffe2/operators/numpy_tile_op.h
+++ b/caffe2/operators/numpy_tile_op.h
@@ -37,7 +37,7 @@ class NumpyTileOp : public Operator<Context> {
     // Alternate inputs and outputs between two buffers. Repeatedly apply the
     // Tile kernel along each axis. Then copy out the resulting data into the
     // output tensor.
-    Tensor *src = &buffer, *dst = output;
+    Tensor<Context> *src = &buffer, *dst = output;
     src->CopyFrom(input);
     vector<TIndex> output_dims(input.dims());
     for (size_t i = 0; i < repeats.size(); ++i) {
@@ -98,14 +98,15 @@ class NumpyTileOp : public Operator<Context> {
       char* output_data) {
     for (auto i = 0; i < outer_dim; ++i) {
       for (auto t = 0; t < num_tiles; ++t) {
-        context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data);
+        context_.template CopyItems<Context, Context>(
+            meta, inner_dim, input_data, output_data);
         output_data += inner_dim * item_size;
       }
       input_data += inner_dim * item_size;
     }
   }
 
-  Tensor buffer{Context::GetDeviceType()};
+  Tensor<Context> buffer;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index dda273dd47cc3..bb8a1dbc77441 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -92,8 +92,8 @@ template <>
 void OneHotOp<CPUContext>::DoOneHotOp(
     TIndex batch_size,
     TIndex index_size,
-    const Tensor& indices,
-    Tensor* one_hots) {
+    const Tensor<CPUContext>& indices,
+    Tensor<CPUContext>* one_hots) {
   const TIndex* indices_ptr = indices.template data<TIndex>();
   float* one_hots_ptr = one_hots->template mutable_data<float>();
   memset(one_hots_ptr, 0, one_hots->nbytes());
@@ -187,7 +187,7 @@ class SegmentOneHotOp : public Operator<CPUContext> {
     auto* indices_ptr = indices.data<int64_t>();
     auto* one_hots = Output(0);
     one_hots->Resize(batch_size, index_size);
-    auto* one_hots_ptr = one_hots->template mutable_data<float>();
+    auto* one_hots_ptr = one_hots->mutable_data<float>();
     if (one_hots->size() == 0) {
       return true;
     }
diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu
index e1b6e18daf870..9cca0a5b2ffeb 100644
--- a/caffe2/operators/one_hot_ops.cu
+++ b/caffe2/operators/one_hot_ops.cu
@@ -19,9 +19,9 @@ template <>
 void OneHotOp<CUDAContext>::DoOneHotOp(
     TIndex batch_size,
     TIndex index_size,
-    const Tensor& indices,
-    Tensor* output) {
-  float* output_ptr = output->template mutable_data<float>();
+    const Tensor<CUDAContext>& indices,
+    Tensor<CUDAContext>* output) {
+  float* output_ptr = output->mutable_data<float>();
   math::Set<float, CUDAContext>(output->size(), 0., output_ptr, &context_);
   OneHotOpKernel<<<
       CAFFE_GET_BLOCKS(batch_size),
diff --git a/caffe2/operators/one_hot_ops.h b/caffe2/operators/one_hot_ops.h
index f8d8c3262be11..1b48b69326f3e 100644
--- a/caffe2/operators/one_hot_ops.h
+++ b/caffe2/operators/one_hot_ops.h
@@ -24,7 +24,7 @@ class OneHotOp final : public Operator<Context> {
         "indices input must be 1D tensor of data type TIndex");
 
     // Index size input must be in CPU context
-    auto& index_size_tensor = OperatorBase::Input<Tensor>(1, CPU);
+    auto& index_size_tensor = OperatorBase::Input<Tensor<CPUContext>>(1);
     CAFFE_ENFORCE_EQ(
         index_size_tensor.size(),
         1,
@@ -47,8 +47,8 @@ class OneHotOp final : public Operator<Context> {
   void DoOneHotOp(
       TIndex batch_size,
       TIndex index_size,
-      const Tensor& indices,
-      Tensor* output);
+      const Tensor<Context>& indices,
+      Tensor<Context>* output);
 };
 
 template <class Context>
diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h
index 874bf075458ee..d6a72bf9b1e5f 100644
--- a/caffe2/operators/onnx_while_op.h
+++ b/caffe2/operators/onnx_while_op.h
@@ -140,15 +140,16 @@ class ONNXWhileOp final : public Operator<Context> {
         for (int i = 0; i < num_loop_carried_deps; ++i) {
           Blob* b = cur_ws->GetBlob(
               scope_->net()->external_output()[i + 1]);
-          const Tensor& t = b->template Get<Tensor>();
+          const Tensor<Context>& t = b->template Get<Tensor<Context>>();
           scope_->lcd_tensor(i)->CopyFrom(t);
         }
         // Copy out scan_outputs
         for (int i = 0; i < num_scan_outputs; ++i) {
           int net_output_idx = i + 1 + num_loop_carried_deps;
-          const Tensor& scan_output =
-              cur_ws->GetBlob(scope_->net()->external_output()[net_output_idx])
-                  ->template Get<Tensor>();
+          const Tensor<Context>& scan_output =
+              cur_ws->GetBlob(
+                  scope_->net()->external_output()[net_output_idx])
+                  ->template Get<Tensor<Context>>();
           auto* scan_output_target = Output(i + num_loop_carried_deps);
           if (itr == 0) {
             auto dims = scan_output.dims();
@@ -213,23 +214,22 @@ class ONNXWhileOp final : public Operator<Context> {
       lcd_tensors_.clear();
       for (int i = 2; i < body_net_def.external_input_size(); ++i) {
         Blob* b = loop_ws_->CreateBlob(body_net_def.external_input(i));
-        Tensor* t = b->GetMutableTensor(Context::GetDeviceType());
+        Tensor<Context>* t = b->template GetMutable<Tensor<Context>>();
         lcd_tensors_.push_back(t);
       }
       // First output is the iteration variable
       auto* iteration_var_blob = loop_ws_->CreateBlob(
           body_net_def.external_input(0));
       iteration_var_ =
-          iteration_var_blob->GetMutableTensor(Context::GetDeviceType());
+          iteration_var_blob->template GetMutable<Tensor<Context>>();
 
-      input_condition_var_ =
-          loop_ws_->CreateBlob(body_net_def.external_input(1))
-              ->GetMutableTensor(Context::GetDeviceType());
+      input_condition_var_ = loop_ws_->CreateBlob(
+          body_net_def.external_input(1))
+          ->template GetMutable<Tensor<Context>>();
 
       auto* condition_var_blob =
           loop_ws_->CreateBlob(body_net_def.external_output(0));
-      condition_var_ =
-          condition_var_blob->GetMutableTensor(Context::GetDeviceType());
+      condition_var_ = condition_var_blob->template GetMutable<Tensor<Context>>();
       condition_var_->Resize(1);
       condition_var_->template mutable_data<bool>();
 
@@ -254,7 +254,7 @@ class ONNXWhileOp final : public Operator<Context> {
       return *iteration_var_ptr;
     }
 
-    Tensor* lcd_tensor(int idx) {
+    Tensor<Context>* lcd_tensor(int idx) {
       return lcd_tensors_[idx];
     }
 
@@ -284,11 +284,11 @@ class ONNXWhileOp final : public Operator<Context> {
     Workspace *loop_ws_;
 
     NetBase* body_net_; // owned by a workspace
-    Tensor* iteration_var_;
-    Tensor* input_condition_var_;
-    Tensor* condition_var_;
+    Tensor<Context>* iteration_var_;
+    Tensor<Context>* input_condition_var_;
+    Tensor<Context>* condition_var_;
 
-    std::vector<Tensor*> lcd_tensors_;
+    std::vector<Tensor<Context>*> lcd_tensors_;
   };
 
   NetDef body_net_def_;
diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc
index 3cdf252c8c0ed..3030f45babde5 100644
--- a/caffe2/operators/onnxifi_op.cc
+++ b/caffe2/operators/onnxifi_op.cc
@@ -15,7 +15,7 @@ void BlobToTensorDescriptor(
   // Memory type
   // We only allow weights to be CPU tensor for now
   CAFFE_ENFORCE(
-      blob->template IsType<Tensor>(CPU),
+      blob->template IsType<TensorCPU>(),
       "Initialization blob ",
       name,
       " needs to be TensorCPU");
diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h
index e66ba7ea7383c..62d9cdbdaef07 100644
--- a/caffe2/operators/operator_fallback_gpu.h
+++ b/caffe2/operators/operator_fallback_gpu.h
@@ -62,8 +62,8 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
   bool RunOnDevice() override {
     bool need_sync = false;
     for (int i = 0; i < InputSize(); ++i) {
-      if (OperatorBase::InputIsType<Tensor>(i, CUDA)) {
-        local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom(
+      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
+        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
             Input(i), &context_);
         need_sync = true;
       } else {
@@ -93,10 +93,11 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
         continue;
       }
       CAFFE_ENFORCE(
-          local_output_blobs_[i]->template IsType<Tensor>(CPU),
+          local_output_blobs_[i]->template IsType<TensorCPU>(),
           "GPU fallback op currently does not support non-TensorCPU "
           "output type who needs copying.");
-      Output(i)->CopyFrom(local_output_blobs_[i]->template Get<TensorCPU>());
+      Output(i)->CopyFrom(
+          local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
     }
     return true;
   }
diff --git a/caffe2/operators/operator_fallback_gpu_test.cc b/caffe2/operators/operator_fallback_gpu_test.cc
index e562858c073ec..eb6c225478cb1 100644
--- a/caffe2/operators/operator_fallback_gpu_test.cc
+++ b/caffe2/operators/operator_fallback_gpu_test.cc
@@ -37,11 +37,11 @@ TEST(OperatorFallbackTest, IncrementByOneOp) {
       "IncrementByOne", "", vector<string>{"X"},
       vector<string>{"X"});
   Workspace ws;
-  Tensor source_tensor(vector<TIndex>{2, 3}, CPU);
+  TensorCPU source_tensor(vector<TIndex>{2, 3});
   for (int i = 0; i < 6; ++i) {
     source_tensor.mutable_data<float>()[i] = i;
   }
-  ws.CreateBlob("X")->GetMutableTensor(CPU)->CopyFrom(source_tensor);
+  ws.CreateBlob("X")->GetMutable<TensorCPU>()->CopyFrom(source_tensor);
   unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
   EXPECT_TRUE(op.get() != nullptr);
   EXPECT_TRUE(op->Run());
@@ -61,16 +61,16 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) {
       vector<string>{"X"});
   op_def.mutable_device_option()->set_device_type(CUDA);
   Workspace ws;
-  Tensor source_tensor(vector<TIndex>{2, 3}, CPU);
+  TensorCPU source_tensor(vector<TIndex>{2, 3});
   for (int i = 0; i < 6; ++i) {
     source_tensor.mutable_data<float>()[i] = i;
   }
-  ws.CreateBlob("X")->GetMutableTensor(CUDA)->CopyFrom(source_tensor);
+  ws.CreateBlob("X")->GetMutable<TensorCUDA>()->CopyFrom(source_tensor);
   unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
   EXPECT_TRUE(op.get() != nullptr);
   EXPECT_TRUE(op->Run());
   const TensorCUDA& output = ws.GetBlob("X")->Get<TensorCUDA>();
-  Tensor output_cpu(output, CPU);
+  TensorCPU output_cpu(output);
   EXPECT_EQ(output.ndim(), 2);
   EXPECT_EQ(output.dim(0), 2);
   EXPECT_EQ(output.dim(1), 3);
diff --git a/caffe2/operators/order_switch_ops.cc b/caffe2/operators/order_switch_ops.cc
index 7e862eb39ff11..11cc6dedc24f9 100644
--- a/caffe2/operators/order_switch_ops.cc
+++ b/caffe2/operators/order_switch_ops.cc
@@ -10,7 +10,7 @@ bool NHWC2NCHWOp<float, CPUContext>::RunOnDevice() {
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
   Y->Resize(N, C, H, W);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   for (int n = 0; n < N; ++n) {
     for (int h = 0; h < H; ++h) {
       for (int w = 0; w < W; ++w) {
@@ -31,7 +31,7 @@ bool NCHW2NHWCOp<float, CPUContext>::RunOnDevice() {
   const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   Y->Resize(N, H, W, C);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   for (int n = 0; n < N; ++n) {
     for (int c = 0; c < C; ++c) {
       for (int h = 0; h < H; ++h) {
@@ -66,18 +66,20 @@ OPERATOR_SCHEMA(NHWC2NCHW)
 The operator switches the order of data in a tensor from NHWC- sample index N,
 height H, width H and channels C, to the NCHW order.
 )DOC")
-    .Input(0, "data", "The input data (Tensor) in the NHWC order.")
-    .Output(0, "output", "The output tensor (Tensor) in the NCHW order.");
+    .Input(0, "data", "The input data (Tensor<float>) in the NHWC order.")
+    .Output(
+        0,
+        "output",
+        "The output tensor (Tensor<float>) in the NCHW order.");
 
-OPERATOR_SCHEMA(NCHW2NHWC)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .SetDoc(R"DOC(
+OPERATOR_SCHEMA(NCHW2NHWC).NumInputs(1).NumOutputs(1)
+  .SetDoc(R"DOC(
 The operator switches the order of data in a tensor from NCHW- sample index N,
 channels C, height H and width W, to the NHWC order.
 )DOC")
-    .Input(0, "data", "The input data (Tensor) in the NCHW order.")
-    .Output(0, "output", "The output tensor (Tensor) in the NHWC order.");
+  .Input(0, "data", "The input data (Tensor<float>) in the NCHW order.")
+  .Output(0, "output", "The output tensor (Tensor<float>) in the NHWC order.");
+
 
 class GetNHWC2NCHWGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu
index f7fa090248c43..2d77b5da85a24 100644
--- a/caffe2/operators/order_switch_ops.cu
+++ b/caffe2/operators/order_switch_ops.cu
@@ -30,12 +30,9 @@ bool NHWC2NCHWOp<float, CUDAContext>::RunOnDevice() {
   DCHECK_EQ(X.ndim(), 4);
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
   Y->Resize(N, C, H, W);
-  NHWC2NCHWKernel<<<
-      CAFFE_GET_BLOCKS(X.size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N, H * W, C, X.data<float>(), Y->template mutable_data<float>());
+  NHWC2NCHWKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
+                    0, context_.cuda_stream()>>>(
+      N, H * W, C, X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
@@ -46,12 +43,9 @@ bool NCHW2NHWCOp<float, CUDAContext>::RunOnDevice() {
   DCHECK_EQ(X.ndim(), 4);
   const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   Y->Resize(N, H, W, C);
-  NCHW2NHWCKernel<<<
-      CAFFE_GET_BLOCKS(X.size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      N, C, H * W, X.data<float>(), Y->template mutable_data<float>());
+  NCHW2NHWCKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
+                    0, context_.cuda_stream()>>>(
+      N, C, H * W, X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/pack_rnn_sequence_op.h b/caffe2/operators/pack_rnn_sequence_op.h
index 74d40f6bfd47d..c2fcb7d6beb00 100644
--- a/caffe2/operators/pack_rnn_sequence_op.h
+++ b/caffe2/operators/pack_rnn_sequence_op.h
@@ -73,7 +73,7 @@ class PackRNNSequenceOpBase : public Operator<Context> {
       for (int r = 0; r < lengths_vec[c]; r++) {
         auto input_offset = Forward ? (offset + r) : (r * cols + c);
         auto output_offset = Forward ? (r * cols + c) : (offset + r);
-        context_.CopyItemsSameDevice(
+        context_.template CopyItems<Context, Context>(
             values.meta(),
             block_size,
             values_vec + input_offset * block_size,
diff --git a/caffe2/operators/pack_segments.cc b/caffe2/operators/pack_segments.cc
index ab831445e56e3..2c2f3fdc4fafc 100644
--- a/caffe2/operators/pack_segments.cc
+++ b/caffe2/operators/pack_segments.cc
@@ -16,7 +16,7 @@ bool PackSegmentsOp<CPUContext>::DoRunWithType2() {
   const auto& data = Input(DATA);
   const auto& lengths = Input(LENGTHS);
   auto* output = Output(0);
-  Tensor* presence_mask = nullptr;
+  Tensor<CPUContext>* presence_mask = nullptr;
   if (return_presence_mask_) {
     presence_mask = Output(1);
   }
@@ -88,7 +88,7 @@ bool PackSegmentsOp<CPUContext>::DoRunWithType2() {
   const auto* d = static_cast<const char*>(data.raw_data());
   TIndex start = 0;
   for (TIndex i = 0; i < lengths.dim(0); ++i) {
-    context_.CopyItemsSameDevice(
+    context_.template CopyItems<CPUContext, CPUContext>(
         data.meta(),
         l[i] * block_size,
         d + block_bytesize * start,
@@ -145,7 +145,7 @@ bool UnpackSegmentsOp<CPUContext>::DoRunWithType2() {
   const auto* d = static_cast<const char*>(data.raw_data());
   TIndex start = 0;
   for (TIndex i = 0; i < lengths.dim(0); ++i) {
-    context_.CopyItemsSameDevice(
+    context_.template CopyItems<CPUContext, CPUContext>(
         data.meta(),
         l[i] * block_size,
         d + block_bytesize * data.dim(1) * i,
diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu
index ae573adde3db4..8e4bdc49339c1 100644
--- a/caffe2/operators/pack_segments.cu
+++ b/caffe2/operators/pack_segments.cu
@@ -53,9 +53,9 @@ template <typename T>
 int64_t int_array_sum(
     const T* dev_array,
     int64_t num_items,
-    Tensor& dev_buffer,
-    Tensor& dev_sum,
-    Tensor& host_sum,
+    Tensor<CUDAContext>& dev_buffer,
+    Tensor<CUDAContext>& dev_sum,
+    Tensor<CPUContext>& host_sum,
     CUDAContext& context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -82,7 +82,7 @@ int64_t int_array_sum(
       context.cuda_stream());
 
   // Copy to host
-  host_sum.CopyFrom(dev_sum);
+  host_sum.CopyFrom<CUDAContext>(dev_sum);
   context.FinishDeviceComputation();
   return *host_sum.data<int64_t>();
 }
@@ -91,9 +91,9 @@ template <typename T>
 T array_max(
     const T* dev_array,
     int64_t num_items,
-    Tensor& dev_max_buffer,
-    Tensor& dev_max,
-    Tensor& host_max,
+    Tensor<CUDAContext>& dev_max_buffer,
+    Tensor<CUDAContext>& dev_max,
+    Tensor<CPUContext>& host_max,
     CUDAContext& context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -120,7 +120,7 @@ T array_max(
       context.cuda_stream());
 
   // Copy to host
-  host_max.CopyFrom(dev_max);
+  host_max.CopyFrom<CUDAContext>(dev_max);
   context.FinishDeviceComputation();
   return *host_max.data<T>();
 }
@@ -129,8 +129,8 @@ template <typename T>
 void array_prefix_sum_exclusive(
     const T* dev_array,
     const int32_t num_items,
-    Tensor& prefix_buffer,
-    Tensor& prefix_sum,
+    Tensor<CUDAContext>& prefix_buffer,
+    Tensor<CUDAContext>& prefix_sum,
     CUDAContext& context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
diff --git a/caffe2/operators/pack_segments.h b/caffe2/operators/pack_segments.h
index c6eb759f36dba..c35299d66d1a2 100644
--- a/caffe2/operators/pack_segments.h
+++ b/caffe2/operators/pack_segments.h
@@ -51,10 +51,10 @@ class PackSegmentsOp final : public Operator<Context> {
   bool return_presence_mask_;
 
   // Scratch space required by the CUDA version
-  Tensor dev_buffer_{Context::GetDeviceType()};
-  Tensor dev_lengths_prefix_sum_{Context::GetDeviceType()};
-  Tensor dev_max_length_{Context::GetDeviceType()};
-  Tensor host_max_length_{CPU};
+  Tensor<Context> dev_buffer_;
+  Tensor<Context> dev_lengths_prefix_sum_;
+  Tensor<Context> dev_max_length_;
+  Tensor<CPUContext> host_max_length_;
 };
 
 template <class Context>
@@ -81,12 +81,12 @@ class UnpackSegmentsOp final : public Operator<Context> {
 
  private:
   TIndex max_length_;
-  Tensor dev_buffer_{Context::GetDeviceType()};
-  Tensor dev_lengths_prefix_sum_{Context::GetDeviceType()};
-  Tensor dev_max_length_{Context::GetDeviceType()};
-  Tensor dev_num_cell_{Context::GetDeviceType()};
-  Tensor host_max_length_{CPU};
-  Tensor host_num_cell_{CPU};
+  Tensor<Context> dev_buffer_;
+  Tensor<Context> dev_lengths_prefix_sum_;
+  Tensor<Context> dev_max_length_;
+  Tensor<Context> dev_num_cell_;
+  Tensor<CPUContext> host_max_length_;
+  Tensor<CPUContext> host_num_cell_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/pad_op.cc b/caffe2/operators/pad_op.cc
index d7ac46ff8136b..74de23e2a9763 100644
--- a/caffe2/operators/pad_op.cc
+++ b/caffe2/operators/pad_op.cc
@@ -29,7 +29,7 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
   ConvPoolOpBase::SetOutputSize(X, Y, channels);
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   // The main loop
   int padded_height = Y->dim32(2);
   int padded_width = Y->dim32(3);
@@ -166,7 +166,7 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   int channels = X.dim32(3);
   ConvPoolOpBase::SetOutputSize(X, Y, channels);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
 
   // The main loop
   int padded_height = Y->dim32(1);
@@ -259,7 +259,7 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
   int width = dX->dim32(3);
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   math::Set<float, CPUContext>(dX->size(), 0, dXdata, &context_);
   // The main loop
   switch (mode_) {
@@ -339,7 +339,7 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
   int width = dX->dim32(2);
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   math::Set<float, CPUContext>(dX->size(), 0, dXdata, &context_);
 
   switch (mode_) {
diff --git a/caffe2/operators/pad_op_gpu.cu b/caffe2/operators/pad_op_gpu.cu
index fa812d2a11f8b..bfb4542ca81d1 100644
--- a/caffe2/operators/pad_op_gpu.cu
+++ b/caffe2/operators/pad_op_gpu.cu
@@ -261,7 +261,7 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const int padded_height = Y->dim32(2);
   const int padded_width = Y->dim32(3);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
 
   switch (mode_) {
     case PadMode::CONSTANT:
@@ -337,7 +337,7 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const int padded_height = Y->dim32(1);
   const int padded_width = Y->dim32(2);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
 
   switch (mode_) {
     case PadMode::CONSTANT:
@@ -418,7 +418,7 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   const int height = dX->dim32(2);
   const int width = dX->dim32(3);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   math::Set<float, CUDAContext>(output_size, 0, dXdata, &context_);
 
   switch (mode_) {
@@ -499,7 +499,7 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   const int width = dX->dim32(2);
   const int channels = dX->dim32(3);
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   math::Set<float, CUDAContext>(output_size, 0, dXdata, &context_);
 
   switch (mode_) {
diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h
index 003653cbc8976..563a02638edd4 100644
--- a/caffe2/operators/partition_ops.h
+++ b/caffe2/operators/partition_ops.h
@@ -81,7 +81,7 @@ class GatherByKeyOp : public Operator<CPUContext> {
         if (currentShard != -1) {
           auto inStartOffset = inStartOffsets_[currentShard];
           auto numItems = i - outStartOffset;
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<CPUContext, CPUContext>(
               meta,
               numItems * blockSize,
               inputDatas_[currentShard] +
@@ -183,7 +183,7 @@ class PartitionOpBase : public Operator<CPUContext> {
         auto bs = block_sizes_[i];
         auto meta = metas_[i];
         // special case for small bs?
-        context_.CopyItemsSameDevice(
+        context_.template CopyItems<CPUContext, CPUContext>(
             meta,
             bs,
             static_cast<const char*>(raw_datas_[i]) + p * bs * meta.itemsize(),
diff --git a/caffe2/operators/percentile_op.h b/caffe2/operators/percentile_op.h
index 895281c6a88b8..2cc96e78c0c8a 100644
--- a/caffe2/operators/percentile_op.h
+++ b/caffe2/operators/percentile_op.h
@@ -25,8 +25,8 @@ class PercentileOp final : public Operator<Context> {
  protected:
   INPUT_TAGS(X, VAL_PCT_PAIRS, LENS);
   OUTPUT_TAGS(PCT);
-  Tensor values_tensor{Context::GetDeviceType()};
-  Tensor percentiles_tensor{Context::GetDeviceType()};
+  Tensor<Context> values_tensor;
+  Tensor<Context> percentiles_tensor;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/perplexity_op.cc b/caffe2/operators/perplexity_op.cc
index 028a6077cc860..a7c4d52285e3d 100644
--- a/caffe2/operators/perplexity_op.cc
+++ b/caffe2/operators/perplexity_op.cc
@@ -17,7 +17,7 @@ bool PerplexityOp<float, CPUContext>::RunOnDevice() {
   for (int i = 0; i < N; ++i) {
     perplexity *= pow(Xdata[i], -1.0/N);
   }
-  *(Y->template mutable_data<float>()) = perplexity;
+  *(Y->mutable_data<float>()) = perplexity;
   return true;
 }
 
diff --git a/caffe2/operators/perplexity_op.cu b/caffe2/operators/perplexity_op.cu
index 230bdb1601cb4..afb4d3dc27261 100644
--- a/caffe2/operators/perplexity_op.cu
+++ b/caffe2/operators/perplexity_op.cu
@@ -26,7 +26,7 @@ bool PerplexityOp<float, CUDAContext>::RunOnDevice() {
   int N = X.dim32(0);
 
   Y->Resize(vector<TIndex>());
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   const float* Xdata = X.data<float>();
 
   float perplexity = thrust::transform_reduce(
diff --git a/caffe2/operators/piecewise_linear_transform_op.cc b/caffe2/operators/piecewise_linear_transform_op.cc
index a572d60651f16..528b33619b222 100644
--- a/caffe2/operators/piecewise_linear_transform_op.cc
+++ b/caffe2/operators/piecewise_linear_transform_op.cc
@@ -11,7 +11,7 @@ OPERATOR_SCHEMA(PiecewiseLinearTransform)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 PiecewiseLinearTransform takes inputs -- predictions, a 2-D or 1-D tensor
-(Tensor) of size (batch_size x prediction_dimensions). The piecewise
+(Tensor<float>) of size (batch_size x prediction_dimensions). The piecewise
 linear functions are stored in bounds, slopes and intercepts. The output tensor
 has the same shape of input `predictions` and contains the predictions
 transformed by the piecewise linear functions. Each column of predictions has
@@ -57,7 +57,7 @@ bound.
     .Input(
         0,
         "predictions",
-        "2-D tensor (Tensor) of size "
+        "2-D tensor (Tensor<float>) of size "
         "(num_batches x num_classes) containing scores")
     .Input(
         1,
@@ -77,7 +77,7 @@ bound.
     .Output(
         0,
         "transforms",
-        "2-D tensor (Tensor) of size (num_batches x num_classes) "
+        "2-D tensor (Tensor<float>) of size (num_batches x num_classes) "
         "containing transformed predictions");
 
 SHOULD_NOT_DO_GRADIENT(PiecewiseLinearTransform);
diff --git a/caffe2/operators/piecewise_linear_transform_op.cu b/caffe2/operators/piecewise_linear_transform_op.cu
index 8dc2d4e022850..ecc9f0f249397 100644
--- a/caffe2/operators/piecewise_linear_transform_op.cu
+++ b/caffe2/operators/piecewise_linear_transform_op.cu
@@ -137,27 +137,27 @@ void PiecewiseLinearTransformOp<float, CUDAContext>::setUpTensors(
       }
 
       int length = num_group * num_func_per_group;
-      Tensor bounds_host{CPU};
+      TensorCPU bounds_host;
       bounds_host.Resize(length + num_group);
       memcpy(
           bounds_host.mutable_data<float>(),
           bounds,
           (length + num_group) * sizeof(float));
 
-      Tensor intercepts_host{CPU};
+      TensorCPU intercepts_host;
       intercepts_host.Resize(length);
       memcpy(
           intercepts_host.mutable_data<float>(),
           intercepts,
           (length) * sizeof(float));
-      Tensor slopes_host{CPU};
+      TensorCPU slopes_host;
       slopes_host.Resize(length);
       memcpy(
           slopes_host.mutable_data<float>(), slopes, (length) * sizeof(float));
 
-      bounds_device_.CopyFrom(bounds_host);
-      intercepts_device_.CopyFrom(intercepts_host);
-      slopes_device_.CopyFrom(slopes_host);
+      bounds_device_.CopyFrom<CPUContext>(bounds_host);
+      intercepts_device_.CopyFrom<CPUContext>(intercepts_host);
+      slopes_device_.CopyFrom<CPUContext>(slopes_host);
 
       gpu_copied_ = true;
     }
@@ -185,9 +185,9 @@ void PiecewiseLinearTransformOp<float, CUDAContext>::setUpTensors(
       CAFFE_ENFORCE_EQ(num_group, M);
     }
 
-    bounds_device_.CopyFrom(bounds_input);
-    slopes_device_.CopyFrom(slopes_input);
-    intercepts_device_.CopyFrom(intercepts_input);
+    bounds_device_.CopyFrom<CUDAContext>(bounds_input);
+    slopes_device_.CopyFrom<CUDAContext>(slopes_input);
+    intercepts_device_.CopyFrom<CUDAContext>(intercepts_input);
   }
 }
 
@@ -218,7 +218,7 @@ bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformGeneral() {
       slopes_device_.data<float>(),
       intercepts_device_.data<float>(),
       X.data<float>(),
-      Y->template mutable_data<float>());
+      Y->mutable_data<float>());
 
   return true;
 }
@@ -254,7 +254,7 @@ bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformBinary() {
         slopes_device_.data<float>(),
         intercepts_device_.data<float>(),
         X.data<float>(),
-        Y->template mutable_data<float>());
+        Y->mutable_data<float>());
   } else {
     // don't want N*M threads, only N*M/2
     PieceWiseLinearTransformBinaryKernel2<<<
@@ -270,7 +270,7 @@ bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformBinary() {
         slopes_device_.data<float>(),
         intercepts_device_.data<float>(),
         X.data<float>(),
-        Y->template mutable_data<float>());
+        Y->mutable_data<float>());
   }
 
   return true;
diff --git a/caffe2/operators/piecewise_linear_transform_op.h b/caffe2/operators/piecewise_linear_transform_op.h
index 7428b6cc24e4b..701acb87f9ad8 100644
--- a/caffe2/operators/piecewise_linear_transform_op.h
+++ b/caffe2/operators/piecewise_linear_transform_op.h
@@ -233,9 +233,9 @@ class PiecewiseLinearTransformOp final : public Operator<Context> {
   vector<T> slopes_from_arg_;
   vector<T> intercepts_from_arg_;
 
-  Tensor bounds_device_{Context::GetDeviceType()};
-  Tensor intercepts_device_{Context::GetDeviceType()};
-  Tensor slopes_device_{Context::GetDeviceType()};
+  Tensor<Context> bounds_device_;
+  Tensor<Context> intercepts_device_;
+  Tensor<Context> slopes_device_;
   bool gpu_copied_ = false;
 
   // If true, the piecewise linear functions are passed through args,
diff --git a/caffe2/operators/pool_op.cu b/caffe2/operators/pool_op.cu
index 4af3be93a6b8f..48b30afaa4d00 100644
--- a/caffe2/operators/pool_op.cu
+++ b/caffe2/operators/pool_op.cu
@@ -564,70 +564,70 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      Average1DPoolForwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              Y->dim32(2),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              Y->template mutable_data<float>());
+      Average1DPoolForwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          Y->dim32(2),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          Y->mutable_data<float>());
       break;
     case 2:
-      Average2DPoolForwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              Y->dim32(2),
-              Y->dim32(3),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              Y->template mutable_data<float>());
+      Average2DPoolForwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          Y->dim32(2),
+          Y->dim32(3),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          Y->mutable_data<float>());
       break;
     case 3:
-      Average3DPoolForwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              Y->dim32(2),
-              Y->dim32(3),
-              Y->dim32(4),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              Y->template mutable_data<float>());
+      Average3DPoolForwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          Y->dim32(2),
+          Y->dim32(3),
+          Y->dim32(4),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          Y->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -643,70 +643,70 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNHWC() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      Average1DPoolForwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              Y->dim32(1),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              Y->template mutable_data<float>());
+      Average1DPoolForwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          Y->dim32(1),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          Y->mutable_data<float>());
       break;
     case 2:
-      Average2DPoolForwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              Y->dim32(1),
-              Y->dim32(2),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              Y->template mutable_data<float>());
+      Average2DPoolForwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          Y->dim32(1),
+          Y->dim32(2),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          Y->mutable_data<float>());
       break;
     case 3:
-      Average3DPoolForwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              Y->dim32(1),
-              Y->dim32(2),
-              Y->dim32(3),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              Y->template mutable_data<float>());
+      Average3DPoolForwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          Y->dim32(1),
+          Y->dim32(2),
+          Y->dim32(3),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          Y->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -726,70 +726,70 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      Ave1DPoolBackwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              dY.dim32(2),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              dX->template mutable_data<float>());
+      Ave1DPoolBackwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          dY.dim32(2),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          dX->mutable_data<float>());
       break;
     case 2:
-      Ave2DPoolBackwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              dY.dim32(2),
-              dY.dim32(3),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              dX->template mutable_data<float>());
+      Ave2DPoolBackwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          dY.dim32(2),
+          dY.dim32(3),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          dX->mutable_data<float>());
       break;
     case 3:
-      Ave3DPoolBackwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              dY.dim32(2),
-              dY.dim32(3),
-              dY.dim32(4),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              dX->template mutable_data<float>());
+      Ave3DPoolBackwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          dY.dim32(2),
+          dY.dim32(3),
+          dY.dim32(4),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          dX->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -810,70 +810,70 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      Ave1DPoolBackwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              dY.dim32(1),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              dX->template mutable_data<float>());
+      Ave1DPoolBackwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          dY.dim32(1),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          dX->mutable_data<float>());
       break;
     case 2:
-      Ave2DPoolBackwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              dY.dim32(1),
-              dY.dim32(2),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              dX->template mutable_data<float>());
+      Ave2DPoolBackwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          dY.dim32(1),
+          dY.dim32(2),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          dX->mutable_data<float>());
       break;
     case 3:
-      Ave3DPoolBackwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              dY.dim32(1),
-              dY.dim32(2),
-              dY.dim32(3),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              dX->template mutable_data<float>());
+      Ave3DPoolBackwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          dY.dim32(1),
+          dY.dim32(2),
+          dY.dim32(3),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          dX->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1405,67 +1405,67 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DForwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(1),
-              X.dim32(2),
-              Y->dim32(2),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              Y->template mutable_data<float>());
+      MaxPool1DForwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(1),
+          X.dim32(2),
+          Y->dim32(2),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          Y->mutable_data<float>());
       break;
     case 2:
-      MaxPool2DForwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              Y->dim32(2),
-              Y->dim32(3),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              Y->template mutable_data<float>());
+      MaxPool2DForwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          Y->dim32(2),
+          Y->dim32(3),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          Y->mutable_data<float>());
       break;
     case 3:
-      MaxPool3DForwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              Y->dim32(2),
-              Y->dim32(3),
-              Y->dim32(4),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              Y->template mutable_data<float>());
+      MaxPool3DForwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          Y->dim32(2),
+          Y->dim32(3),
+          Y->dim32(4),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          Y->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1481,67 +1481,67 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
   int output_size = Y->size();
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DForwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(1),
-              X.dim32(2),
-              Y->dim32(1),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              Y->template mutable_data<float>());
+      MaxPool1DForwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(1),
+          X.dim32(2),
+          Y->dim32(1),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          Y->mutable_data<float>());
       break;
     case 2:
-      MaxPool2DForwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              Y->dim32(1),
-              Y->dim32(2),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              Y->template mutable_data<float>());
+      MaxPool2DForwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          Y->dim32(1),
+          Y->dim32(2),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          Y->mutable_data<float>());
       break;
     case 3:
-      MaxPool3DForwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(output_size),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              output_size,
-              X.data<float>(),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              Y->dim32(1),
-              Y->dim32(2),
-              Y->dim32(3),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              Y->template mutable_data<float>());
+      MaxPool3DForwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(output_size),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          Y->dim32(1),
+          Y->dim32(2),
+          Y->dim32(3),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          Y->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1561,76 +1561,76 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DBackwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              X.data<float>(),
-              Y.data<float>(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              dY.dim32(2),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              dX->template mutable_data<float>());
+      MaxPool1DBackwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          X.data<float>(),
+          Y.data<float>(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          dY.dim32(2),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          dX->mutable_data<float>());
       break;
     case 2:
-      MaxPool2DBackwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              X.data<float>(),
-              Y.data<float>(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              dY.dim32(2),
-              dY.dim32(3),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              dX->template mutable_data<float>());
+      MaxPool2DBackwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          X.data<float>(),
+          Y.data<float>(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          dY.dim32(2),
+          dY.dim32(3),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          dX->mutable_data<float>());
       break;
     case 3:
-      MaxPool3DBackwardNCHW<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              X.data<float>(),
-              Y.data<float>(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              dY.dim32(2),
-              dY.dim32(3),
-              dY.dim32(4),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              dX->template mutable_data<float>());
+      MaxPool3DBackwardNCHW<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          X.data<float>(),
+          Y.data<float>(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          dY.dim32(2),
+          dY.dim32(3),
+          dY.dim32(4),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          dX->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
@@ -1650,75 +1650,75 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
   ConvPoolOpBase<CUDAContext>::ComputePads(dims);
   switch (kernel_.size()) {
     case 1:
-      MaxPool1DBackwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              X.data<float>(),
-              Y.data<float>(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              dY.dim32(1),
-              kernel_h(),
-              stride_h(),
-              pad_t(),
-              dX->template mutable_data<float>());
+      MaxPool1DBackwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          X.data<float>(),
+          Y.data<float>(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          dY.dim32(1),
+          kernel_h(),
+          stride_h(),
+          pad_t(),
+          dX->mutable_data<float>());
     case 2:
-      MaxPool2DBackwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              X.data<float>(),
-              Y.data<float>(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              dY.dim32(1),
-              dY.dim32(2),
-              kernel_h(),
-              kernel_w(),
-              stride_h(),
-              stride_w(),
-              pad_t(),
-              pad_l(),
-              dX->template mutable_data<float>());
+      MaxPool2DBackwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          X.data<float>(),
+          Y.data<float>(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          dY.dim32(1),
+          dY.dim32(2),
+          kernel_h(),
+          kernel_w(),
+          stride_h(),
+          stride_w(),
+          pad_t(),
+          pad_l(),
+          dX->mutable_data<float>());
       break;
     case 3:
-      MaxPool3DBackwardNHWC<float>
-          <<<CAFFE_GET_BLOCKS(X.size()),
-             CAFFE_CUDA_NUM_THREADS,
-             0,
-             context_.cuda_stream()>>>(
-              X.size(),
-              X.data<float>(),
-              Y.data<float>(),
-              dY.data<float>(),
-              X.dim32(0),
-              X.dim32(1),
-              X.dim32(2),
-              X.dim32(3),
-              X.dim32(4),
-              dY.dim32(1),
-              dY.dim32(2),
-              dY.dim32(3),
-              kernel_h(),
-              kernel_w(),
-              kernel_[2],
-              stride_h(),
-              stride_w(),
-              stride_[2],
-              pad_t(),
-              pad_l(),
-              pads_[2],
-              dX->template mutable_data<float>());
+      MaxPool3DBackwardNHWC<float><<<
+          CAFFE_GET_BLOCKS(X.size()),
+          CAFFE_CUDA_NUM_THREADS,
+          0,
+          context_.cuda_stream()>>>(
+          X.size(),
+          X.data<float>(),
+          Y.data<float>(),
+          dY.data<float>(),
+          X.dim32(0),
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          X.dim32(4),
+          dY.dim32(1),
+          dY.dim32(2),
+          dY.dim32(3),
+          kernel_h(),
+          kernel_w(),
+          kernel_[2],
+          stride_h(),
+          stride_w(),
+          stride_[2],
+          pad_t(),
+          pad_l(),
+          pads_[2],
+          dX->mutable_data<float>());
       break;
     default:
       CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
diff --git a/caffe2/operators/pool_op_cudnn.cu b/caffe2/operators/pool_op_cudnn.cu
index 809828a99defa..00f719b819975 100644
--- a/caffe2/operators/pool_op_cudnn.cu
+++ b/caffe2/operators/pool_op_cudnn.cu
@@ -196,10 +196,7 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
                  CAFFE_CUDA_NUM_THREADS,
                  0,
                  context_.cuda_stream()>>>(
-                  N * C,
-                  H * W * D,
-                  X.data<float>(),
-                  Y->template mutable_data<float>());
+                  N * C, H * W * D, X.data<float>(), Y->mutable_data<float>());
           return true;
         }
         if (mode_ == CUDNN_POOLING_MAX) {
@@ -208,10 +205,7 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
                  CAFFE_CUDA_NUM_THREADS,
                  0,
                  context_.cuda_stream()>>>(
-                  N * C,
-                  H * W * D,
-                  X.data<float>(),
-                  Y->template mutable_data<float>());
+                  N * C, H * W * D, X.data<float>(), Y->mutable_data<float>());
           return true;
         }
       }
@@ -385,7 +379,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
                   N * C,
                   H * W * D,
                   dY.data<float>(),
-                  dX->template mutable_data<float>());
+                  dX->mutable_data<float>());
           return true;
         }
 #if CUDNN_VERSION_MIN(6, 0, 0)
@@ -402,7 +396,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
                   N * C,
                   H * W * D,
                   dY.data<float>(),
-                  dX->template mutable_data<float>(),
+                  dX->mutable_data<float>(),
                   Y.data<float>(),
                   X.data<float>());
           return true;
diff --git a/caffe2/operators/prelu_op.cc b/caffe2/operators/prelu_op.cc
index 2edebecf82f2f..8bacf1e29153c 100644
--- a/caffe2/operators/prelu_op.cc
+++ b/caffe2/operators/prelu_op.cc
@@ -188,8 +188,8 @@ bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
   const float* dYdata = dY.data<float>();
   const float* Xdata = X.data<float>();
   const float* Wdata = W.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
-  float* dWdata = dW->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
+  float* dWdata = dW->mutable_data<float>();
 
   // non-shared case.
   switch (order_) {
diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu
index 9cf5d5893a96f..b14393d81b50d 100644
--- a/caffe2/operators/prelu_op.cu
+++ b/caffe2/operators/prelu_op.cu
@@ -154,7 +154,7 @@ bool PReluOp<float, CUDAContext>::RunOnDevice() {
   Y->ResizeLike(X);
   const auto* Xdata = X.data<float>();
   const auto* Wdata = W.data<float>();
-  auto* Ydata = Y->template mutable_data<float>();
+  auto* Ydata = Y->mutable_data<float>();
 
   const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
   const auto C_shared = (W.size() == 1);
@@ -221,8 +221,8 @@ bool PReluGradientOp<float, CUDAContext>::RunOnDevice() {
   const float* dYdata = dY.data<float>();
   const float* Xdata = X.data<float>();
   const float* Wdata = W.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
-  float* dWdata = dW->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
+  float* dWdata = dW->mutable_data<float>();
   int N = Y.dim(0);
 
   if (C_shared) {
diff --git a/caffe2/operators/prepend_dim_op.h b/caffe2/operators/prepend_dim_op.h
index ab40085bf3c89..16cbb11eec74c 100644
--- a/caffe2/operators/prepend_dim_op.h
+++ b/caffe2/operators/prepend_dim_op.h
@@ -40,7 +40,7 @@ class PrependDimOp : public Operator<Context> {
 
     if (output != &input) {
       // If we are not doing in-place computation, a copy is needed.
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           input.meta(),
           input.size(),
           input.raw_data(),
@@ -75,7 +75,7 @@ class MergeDimOp : public Operator<Context> {
 
     if (output != &input) {
       // If we are not doing in-place computation, a copy is needed.
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           input.meta(),
           input.size(),
           input.raw_data(),
diff --git a/caffe2/operators/quant_decode_op.h b/caffe2/operators/quant_decode_op.h
index 8068b2e00510e..768d879fdf0b7 100644
--- a/caffe2/operators/quant_decode_op.h
+++ b/caffe2/operators/quant_decode_op.h
@@ -12,10 +12,10 @@ namespace {
 
 template <class CodebookT, class CodeT>
 void Decode(
-    const Tensor& codebook,
-    const Tensor& codes,
-    /* optional */ const Tensor* const decoded_grad,
-    Tensor* const output,
+    const TensorCPU& codebook,
+    const TensorCPU& codes,
+    /* optional */ const TensorCPU* const decoded_grad,
+    TensorCPU* const output,
     bool resizeOnly) {
   CAFFE_ENFORCE(codebook.IsType<CodebookT>());
 
@@ -28,7 +28,7 @@ void Decode(
   if (decoded_grad == nullptr) {
     // Forward pass: decode and store codebook values in output.
     output->ResizeLike(codes);
-    auto* out_ptr = output->template mutable_data<CodebookT>();
+    auto* out_ptr = output->mutable_data<CodebookT>();
     if (resizeOnly) {
       return;
     }
@@ -45,7 +45,7 @@ void Decode(
     auto* const gradient_end = gradient_ptr + decoded_grad->size();
 
     CAFFE_ENFORCE_EQ(cb_size, output->size());
-    auto* out_ptr = output->template mutable_data<CodebookT>();
+    auto* out_ptr = output->mutable_data<CodebookT>();
     while (gradient_ptr < gradient_end) {
       DCHECK_LE(*code_ptr, cb_size);
       out_ptr[*code_ptr++] += *gradient_ptr++;
@@ -56,10 +56,10 @@ void Decode(
 #define REGISTER_DECODER(codebookType, codesType)                      \
   {                                                                    \
     {TypeMeta::Id<codebookType>(), TypeMeta::Id<codesType>()},         \
-        [](const Tensor& codebook_,                                    \
-           const Tensor& codes_,                                       \
-           const Tensor* gradient_,                                    \
-           Tensor* outDecoded_,                                        \
+        [](const TensorCPU& codebook_,                                 \
+           const TensorCPU& codes_,                                    \
+           const TensorCPU* gradient_,                                 \
+           TensorCPU* outDecoded_,                                     \
            bool resizeOnly_) {                                         \
           Decode<codebookType, codesType>(                             \
               codebook_, codes_, gradient_, outDecoded_, resizeOnly_); \
@@ -67,18 +67,18 @@ void Decode(
   }
 
 inline void DecodeGeneral(
-    const Tensor& codebook,
-    const Tensor& codes,
-    const Tensor* gradient,
-    Tensor* outDecoded,
+    const TensorCPU& codebook,
+    const TensorCPU& codes,
+    const TensorCPU* gradient,
+    TensorCPU* outDecoded,
     bool resizeOnly) {
   const static std::map<
       std::pair<CaffeTypeId, CaffeTypeId>,
       std::function<void(
-          const Tensor& codebook,
-          const Tensor& codes,
-          const Tensor* gradient,
-          Tensor* outDecoded,
+          const TensorCPU& codebook,
+          const TensorCPU& codes,
+          const TensorCPU* gradient,
+          TensorCPU* outDecoded,
           bool resizeOnly)>>
       gDecoderMapper = {REGISTER_DECODER(float, uint8_t),
                         REGISTER_DECODER(float, uint16_t),
@@ -153,7 +153,7 @@ class QuantDecodeGradientOp final : public Operator<CPUContext> {
 
     auto* gradient = Output(0);
     gradient->ResizeLike(codebook);
-    auto* gradient_ptr = gradient->template mutable_data<float>();
+    auto* gradient_ptr = gradient->mutable_data<float>();
     std::fill(gradient_ptr, gradient_ptr + gradient->size(), 0);
 
     for (int i = 0; i < num_code_tensors; i++) {
diff --git a/caffe2/operators/reducer_functors.h b/caffe2/operators/reducer_functors.h
index 6d357e1b9f996..f3dd35b956078 100644
--- a/caffe2/operators/reducer_functors.h
+++ b/caffe2/operators/reducer_functors.h
@@ -51,7 +51,7 @@ class SumRangeReducerGradient {
       Context* context) {
     // do we have some op that does it smartly with minimum number of memcpy?
     for (TIndex i = 0; i < blocks; ++i) {
-      context->template CopySameDevice<T>(
+      context->template Copy<T, Context, Context>(
           block_size, segment_grad, data_grad + block_size * i);
     }
   }
@@ -342,7 +342,8 @@ class BaseReducer {
                              : size_from_dim_(dims.size() - skip_dims, dims);
     }
 
-    void observeInput(int input, const Tensor& value, int skip_dims) {
+    void
+    observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
       DCHECK_EQ(0, input);
       auto& dims = value.dims();
       computeMeta(dims, skip_dims);
@@ -393,7 +394,10 @@ class BaseReducerGradient {
     vector<TIndex> block_shape;
     bool first_dim;
 
-    Meta(const Tensor& out_grad, int skip_dims, bool first_dim = true)
+    Meta(
+        const Tensor<CPUContext>& out_grad,
+        int skip_dims,
+        bool first_dim = true)
         : first_dim(first_dim) {
       auto& dims = out_grad.dims();
       first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
@@ -405,8 +409,8 @@ class BaseReducerGradient {
 
     void observeOriginalInput(
         int /*original_input*/,
-        const Tensor& /*value*/,
-        Tensor* /*input_grad*/, // optional grad to populate
+        const Tensor<CPUContext>& /*value*/,
+        Tensor<CPUContext>* /*input_grad*/, // optional grad to populate
         int /*skip_dims*/) {}
 
     void appendGradShape(vector<TIndex>* output_shape) {
@@ -475,7 +479,8 @@ class SumReducerGradient : public BaseReducerGradient {
     if (FixedSize == 1) { // static if
       *data_grad = *s_grad_;
     } else if (meta.first_dim) {
-      context->template CopySameDevice<T>(meta.block_size, s_grad_, data_grad);
+      context->template Copy<T, Context, Context>(
+          meta.block_size, s_grad_, data_grad);
     } else {
       math::Set<T, Context>(length, s_grad_[offset], data_grad, context);
     }
@@ -517,7 +522,8 @@ class WeightedSumReducer<T, CPUContext> : public BaseReducer {
 
     explicit Meta(bool first = true) : first_dim(first) {}
 
-    void observeInput(int input, const Tensor& value, int skip_dims) {
+    void
+    observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
       if (input == 1) {
         CAFFE_ENFORCE_EQ(
             skip_dims, value.ndim(), "SCALARS mustn't have extra dimensions");
@@ -574,14 +580,14 @@ class WeightedSumReducerGradient : public BaseReducerGradient {
 
     void observeOriginalInput(
         int original_input,
-        const Tensor& value,
-        Tensor* input_grad, // optional grad to populate
+        const Tensor<CPUContext>& value,
+        Tensor<CPUContext>* input_grad, // optional grad to populate
         int /*skip_dims*/) {
       CAFFE_ENFORCE_EQ(1, original_input);
       scalars = value.data<T>();
       if (input_grad) {
         input_grad->ResizeLike(value);
-        scalars_grad = input_grad->template mutable_data<T>();
+        scalars_grad = input_grad->mutable_data<T>();
       }
     }
   };
diff --git a/caffe2/operators/reduction_front_back_ops.h b/caffe2/operators/reduction_front_back_ops.h
index 85de482e12273..03633ce7c4a37 100644
--- a/caffe2/operators/reduction_front_back_ops.h
+++ b/caffe2/operators/reduction_front_back_ops.h
@@ -157,7 +157,7 @@ class SumReduceDimsGradientOp final : public Operator<Context> {
       T* dXdata);
   int num_reduce_dims_;
   // scratch space used for former version of this reducer
-  Tensor shape_{Context::GetDeviceType()};
+  Tensor<CPUContext> shape_;
 };
 
 template <typename T, class Context, bool FIRSTDIMS>
diff --git a/caffe2/operators/reduction_ops.cc b/caffe2/operators/reduction_ops.cc
index 0d01d50ca000e..6f043eb4c5678 100644
--- a/caffe2/operators/reduction_ops.cc
+++ b/caffe2/operators/reduction_ops.cc
@@ -296,7 +296,9 @@ bool SumElementsGradientOp<T, Context>::RunOnDevice()
 #endif
 {
   auto& X = Input(0);
-  Tensor sum_grad(Input(1), CPU);
+  // Copy Input(1) from Context to CPUContext
+  CPUContext context;
+  TensorCPU sum_grad(Input(1), &context);
   auto* dX = Output(0);
   dX->ResizeLike(X);
   DCHECK_EQ(sum_grad.size(), 1);
diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu
index f5937cd926cf7..3f9728c860975 100644
--- a/caffe2/operators/reduction_ops.cu
+++ b/caffe2/operators/reduction_ops.cu
@@ -86,15 +86,12 @@ bool SumElementsGradientOp<float, CUDAContext>::RunOnDevice() {
   DCHECK_EQ(dY.size(), 1);
   auto* dX = Output(0);
   dX->ResizeLike(X);
-  SumElementsGradientKernel<float>
-      <<<CAFFE_GET_BLOCKS(X.size()),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          average_,
-          X.size(),
-          dY.data<float>(),
-          dX->template mutable_data<float>());
+  SumElementsGradientKernel<float><<<
+      CAFFE_GET_BLOCKS(X.size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      average_, X.size(), dY.data<float>(), dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/reduction_ops.h b/caffe2/operators/reduction_ops.h
index 11cf06a2f15a1..ba502489b148a 100644
--- a/caffe2/operators/reduction_ops.h
+++ b/caffe2/operators/reduction_ops.h
@@ -43,7 +43,7 @@ class SumElementsOp : public Operator<Context> {
 
  private:
   bool average_;
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
 };
 
 template <typename T, class Context>
@@ -66,7 +66,7 @@ class SumElementsIntOp : public Operator<Context> {
   }
 
  private:
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
 };
 
 template <typename T, class Context>
@@ -124,7 +124,7 @@ class SumSqrElementsOp : public Operator<Context> {
   }
 
  private:
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
 };
 
 template <typename T, class Context, bool ROWWISE>
diff --git a/caffe2/operators/relu_n_op.cc b/caffe2/operators/relu_n_op.cc
index abaa8a14aefc0..f04769a98df60 100644
--- a/caffe2/operators/relu_n_op.cc
+++ b/caffe2/operators/relu_n_op.cc
@@ -69,8 +69,8 @@ OPERATOR_SCHEMA(ReluN)
     .CostInferenceFunction(CostInferenceForReluN)
     .IdenticalTypeAndShape()
     .SetDoc(R"DOC(
-Relu takes one input data (Tensor) and produces one output data
-(Tensor) where the rectified linear function, y = min(max(0, x), n),
+Relu takes one input data (Tensor<T>) and produces one output data
+(Tensor<T>) where the rectified linear function, y = min(max(0, x), n),
 is applied to the tensor elementwise.
 )DOC")
     .Input(0, "X", "1D input tensor")
diff --git a/caffe2/operators/remove_data_blocks_op.h b/caffe2/operators/remove_data_blocks_op.h
index 243f27e1c1797..9530242df9de2 100644
--- a/caffe2/operators/remove_data_blocks_op.h
+++ b/caffe2/operators/remove_data_blocks_op.h
@@ -65,7 +65,7 @@ class RemoveDataBlocksOp final : public Operator<Context> {
       int64_t interval_end =
           (i == ind_vec_size - 1) ? outer_size : ind_vec[i + 1];
       auto num_items = interval_end - interval_start;
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           data.meta(),
           num_items * block_size,
           data_ptr + block_size_bytes * interval_start,
diff --git a/caffe2/operators/reservoir_sampling.cc b/caffe2/operators/reservoir_sampling.cc
index 5d6b94248b54c..79198d7c6b510 100644
--- a/caffe2/operators/reservoir_sampling.cc
+++ b/caffe2/operators/reservoir_sampling.cc
@@ -153,7 +153,7 @@ class ReservoirSamplingOp final : public Operator<Context> {
         CAFFE_ENFORCE_GE(*num_visited, numToCollect_);
       } else {
         // replace
-        context_.CopyItemsSameDevice(
+        context_.template CopyItems<Context, Context>(
             input.meta(),
             block_size,
             input_data + i * block_bytesize,
diff --git a/caffe2/operators/reshape_op.h b/caffe2/operators/reshape_op.h
index f332192b55e0e..f59da8ab779ab 100644
--- a/caffe2/operators/reshape_op.h
+++ b/caffe2/operators/reshape_op.h
@@ -35,7 +35,9 @@ class ReshapeOp : public Operator<Context> {
 
  protected:
   template <typename T>
-  void DoRunWithTypeImpl(const Tensor& input, Tensor* output) {
+  void DoRunWithTypeImpl(
+      const Tensor<Context>& input,
+      Tensor<Context>* output) {
     vector<int64_t> actual_new_shape = new_shape_;
     if (InputSize() == 2) {
       CAFFE_ENFORCE(
@@ -50,7 +52,8 @@ class ReshapeOp : public Operator<Context> {
 
       // Bit awkward, but needed so works on both CPU and CUDA contexts
       std::vector<T> tmpv(shape.size());
-      context_.CopyBytesToCPU(shape.size() * sizeof(T), shape_data, &tmpv[0]);
+      context_.template CopyBytes<Context, CPUContext>(
+          shape.size() * sizeof(T), shape_data, &tmpv[0]);
       actual_new_shape.assign(tmpv.begin(), tmpv.begin() + shape.size());
     }
 
@@ -121,7 +124,7 @@ class ReshapeOp : public Operator<Context> {
     output->Resize(actual_new_shape);
     if (output != &input) {
       // If we are not doing in-place computation, a copy is needed.
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           input.meta(),
           input.size(),
           input.raw_data(),
diff --git a/caffe2/operators/reshape_op_gpu_test.cc b/caffe2/operators/reshape_op_gpu_test.cc
index 86faf736ca391..300cf87f59d18 100644
--- a/caffe2/operators/reshape_op_gpu_test.cc
+++ b/caffe2/operators/reshape_op_gpu_test.cc
@@ -20,10 +20,10 @@ static void AddConstInput(
   option.set_device_type(CUDA);
   CUDAContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
   tensor->Resize(shape);
   math::Set<float, CUDAContext>(
-      tensor->size(), value, tensor->template mutable_data<float>(), &context);
+      tensor->size(), value, tensor->mutable_data<float>(), &context);
   return;
 }
 
@@ -44,7 +44,7 @@ TEST(ReshapeOpGPUTest, testReshapeWithScalar) {
   unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
   EXPECT_TRUE(op->Run());
   Blob* XNew = ws.GetBlob("XNew");
-  const Tensor& XNewTensor = XNew->Get<Tensor>();
+  const Tensor<CUDAContext>& XNewTensor = XNew->Get<Tensor<CUDAContext>>();
   EXPECT_EQ(1, XNewTensor.ndim());
   EXPECT_EQ(1, XNewTensor.size());
 }
diff --git a/caffe2/operators/resize_op.cc b/caffe2/operators/resize_op.cc
index d79b90b0d3d40..8a272a3d40f96 100644
--- a/caffe2/operators/resize_op.cc
+++ b/caffe2/operators/resize_op.cc
@@ -67,7 +67,7 @@ bool ResizeNearestOp<float, CPUContext>::RunOnDevice() {
   Y->Resize(batch_size, num_channels, output_height, output_width);
 
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
 
   // Specialized implementation for fast 2x upsampling
   if (width_scale_ == 2.0 && height_scale_ == 2.0) {
@@ -108,11 +108,13 @@ bool ResizeNearestGradientOp<float, CPUContext>::RunOnDevice() {
   const int output_height = X.dim32(2);
   const int output_width = X.dim32(3);
   dX->Resize(batch_size, num_channels, output_height, output_width);
-  math::Set<float, CPUContext>(
-      dX->size(), 0.0f, dX->template mutable_data<float>(), &context_);
+  math::Set<float, CPUContext>(dX->size(),
+                               0.0f,
+                               dX->mutable_data<float>(),
+                               &context_);
 
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
 
   for (int n = 0; n < batch_size; ++n) {
     for (int c = 0; c < num_channels; ++c) {
diff --git a/caffe2/operators/resize_op.cu b/caffe2/operators/resize_op.cu
index 6c433c3b00e0c..0e1d55e5a4f30 100644
--- a/caffe2/operators/resize_op.cu
+++ b/caffe2/operators/resize_op.cu
@@ -98,7 +98,7 @@ bool ResizeNearestOp<float, CUDAContext>::RunOnDevice() {
       height_scale_,
       width_scale_,
       X.data<float>(),
-      Y->template mutable_data<float>());
+      Y->mutable_data<float>());
 
   return true;
 }
@@ -117,7 +117,7 @@ bool ResizeNearestGradientOp<float, CUDAContext>::RunOnDevice() {
   int output_width = X.dim32(3);
   dX->Resize(batch_size, num_channels, output_height, output_width);
   math::Set<float, CUDAContext>(
-      dX->size(), 0.0f, dX->template mutable_data<float>(), &context_);
+      dX->size(), 0.0f, dX->mutable_data<float>(), &context_);
 
   const auto size = dY.size();
   NearestNeighborGradientKernel<<<
@@ -134,7 +134,7 @@ bool ResizeNearestGradientOp<float, CUDAContext>::RunOnDevice() {
       height_scale_,
       width_scale_,
       dY.data<float>(),
-      dX->template mutable_data<float>());
+      dX->mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/reverse_packed_segs_op.h b/caffe2/operators/reverse_packed_segs_op.h
index f0bdbcf482116..f2f1122ceabe3 100644
--- a/caffe2/operators/reverse_packed_segs_op.h
+++ b/caffe2/operators/reverse_packed_segs_op.h
@@ -58,7 +58,7 @@ class ReversePackedSegsOp final : public Operator<Context> {
     const LengthType* lengths_ptr = lengths.template data<LengthType>();
 
     vector<LengthType> lengths_host(batch_size);
-    context_.template CopyToCPU<LengthType>(
+    context_.template Copy<LengthType, Context, CPUContext>(
         batch_size, lengths_ptr, &lengths_host[0]);
     context_.FinishDeviceComputation();
 
@@ -71,14 +71,14 @@ class ReversePackedSegsOp final : public Operator<Context> {
         const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
         T* rev_data_block_ptr =
             rev_data_ptr + ((seg_length - 1 - j) * batch_size + i) * block_size;
-        context_.template CopySameDevice<T>(
+        context_.template Copy<T, Context, Context>(
             block_size, data_block_ptr, rev_data_block_ptr);
       }
       for (; j < max_length; j++) {
         const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size;
         T* rev_data_block_ptr =
             rev_data_ptr + (j * batch_size + i) * block_size;
-        context_.template CopySameDevice<T>(
+        context_.template Copy<T, Context, Context>(
             block_size, data_block_ptr, rev_data_block_ptr);
       }
     }
diff --git a/caffe2/operators/rmac_regions_op.cc b/caffe2/operators/rmac_regions_op.cc
index da0df05d63fd7..ba6ab10973504 100644
--- a/caffe2/operators/rmac_regions_op.cc
+++ b/caffe2/operators/rmac_regions_op.cc
@@ -56,7 +56,7 @@ bool RMACRegionsOp<CPUContext>::RunOnDevice() {
 
     int cur_rows = output->dim32(0);
     output->Extend((l + Wd) * (l + Hd), 50, &context_);
-    auto* outputData = output->template mutable_data<float>() + cur_rows * 5;
+    auto* outputData = output->mutable_data<float>() + cur_rows * 5;
 
     for (int i = 0; i < l + Wd; ++i) {
       for (int j = 0; j < l + Hd; ++j) {
@@ -85,7 +85,7 @@ bool RMACRegionsOp<CPUContext>::RunOnDevice() {
   // Replicate regions for all items in batch
   int num_rois = output->dim32(0);
   output->Extend((batch_size - 1) * num_rois, 50, &context_);
-  auto* outputData = output->template mutable_data<float>();
+  auto* outputData = output->mutable_data<float>();
   for (int b = 1; b < batch_size; ++b) {
     // Copy all rois
     std::copy_n(outputData, num_rois * 5, outputData + b * num_rois * 5);
diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu
index ee0fb38459c18..49faf7a8403cb 100644
--- a/caffe2/operators/rmac_regions_op.cu
+++ b/caffe2/operators/rmac_regions_op.cu
@@ -192,7 +192,8 @@ bool RMACRegionsOp<CUDAContext>::RunOnDevice() {
   // NumRMACRegionsKernel (number of RoIs), so need to copy that to CPU
   // to Resize() output appropriately.
   int num_rois = 0;
-  context_.CopyBytesToCPU(sizeof(int), num_rois_.data<int>(), &num_rois);
+  context_.CopyBytes<CUDAContext, CPUContext>(
+      sizeof(int), num_rois_.data<int>(), &num_rois);
   int N = batch_size * num_rois;
   output->Resize(N, 5); // [batch_id x1 y1 x2 y2]
 
@@ -202,7 +203,7 @@ bool RMACRegionsOp<CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      W, H, N, num_rois_.data<int>(), output->template mutable_data<float>());
+      W, H, N, num_rois_.data<int>(), output->mutable_data<float>());
 
   return true;
 }
diff --git a/caffe2/operators/rmac_regions_op.h b/caffe2/operators/rmac_regions_op.h
index 93af252e9af34..ec5e86f841419 100644
--- a/caffe2/operators/rmac_regions_op.h
+++ b/caffe2/operators/rmac_regions_op.h
@@ -21,7 +21,7 @@ class RMACRegionsOp final : public Operator<Context> {
  protected:
   int scales_;
   float overlap_;
-  Tensor num_rois_{Context::GetDeviceType()};
+  Tensor<Context> num_rois_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
index 501b7d8dc2b10..1f2e62fdc8f28 100644
--- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
+++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
@@ -37,18 +37,19 @@ class RecurrentNetworkBlobFetcherOp final : public Operator<Context> {
 
       for (auto& blob_name : blob_names) {
         const Blob* currentBlob = currentStepWorkspace->GetBlob(blob_name);
-        const auto& currentTensor = currentBlob->Get<Tensor>();
+        const auto& currentTensor = currentBlob->Get<Tensor<Context>>();
 
         std::string newBlobName =
             prefix_ + std::string("_") + blob_name + caffe2::to_string(i);
         blob_names_vector.push_back(newBlobName);
 
         ws_->CreateBlob(newBlobName)
-            ->GetMutableTensor(CPU)
+            ->template GetMutable<TensorCPU>()
             ->ResizeLike(currentTensor);
-        auto type = Context::GetDeviceType();
-        auto* newTensor = ws_->GetBlob(newBlobName)->GetMutableTensor(type);
-        newTensor->CopyFrom(currentTensor);
+
+        auto* newTensor =
+            ws_->GetBlob(newBlobName)->template GetMutable<Tensor<Context>>();
+        newTensor->template CopyFrom<Context>(currentTensor);
       }
     }
 
diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h
index 3afaedf577c60..c241931978407 100644
--- a/caffe2/operators/rnn/recurrent_network_executor.h
+++ b/caffe2/operators/rnn/recurrent_network_executor.h
@@ -111,10 +111,10 @@ class RecurrentNetworkExecutorBase {
       // the forward-only mode.
       std::string this_timestep_blob =
           timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t);
-      ws->CreateBlob(this_timestep_blob)->GetMutableTensor(CPU)->Resize(1);
+      ws->CreateBlob(this_timestep_blob)->GetMutable<TensorCPU>()->Resize(1);
       auto b = ws->GetBlob(this_timestep_blob);
       CAFFE_ENFORCE(b);
-      b->GetMutableTensor(CPU)->template mutable_data<int32_t>()[0] = t;
+      b->GetMutable<TensorCPU>()->mutable_data<int32_t>()[0] = t;
 
       // Copy the operators from template
       for (auto& template_rnn_op : timestep_ops_template_) {
diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h
index a92c7690c7d91..c50d18e9223d1 100644
--- a/caffe2/operators/rnn/recurrent_network_op.h
+++ b/caffe2/operators/rnn/recurrent_network_op.h
@@ -52,10 +52,10 @@ struct ScratchWorkspaces {
 };
 
 inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) {
-  ws->CreateBlob(blob_name)->GetMutableTensor(CPU)->Resize(1);
+  ws->CreateBlob(blob_name)->GetMutable<TensorCPU>()->Resize(1);
   auto timestepBlob = ws->GetBlob(blob_name);
   CAFFE_ENFORCE(timestepBlob);
-  timestepBlob->GetMutableTensor(CPU)->template mutable_data<int32_t>()[0] = t;
+  timestepBlob->GetMutable<TensorCPU>()->mutable_data<int32_t>()[0] = t;
 }
 
 std::map<string, string> GetRecurrentMapping(
@@ -70,8 +70,8 @@ void applyOffsetAlias(
           << " at offset: " << oc.offset;
   auto srcBlob = ws->GetBlob(oc.src);
   CAFFE_ENFORCE(srcBlob);
-  auto* src = srcBlob->GetMutableTensor(Context::GetDeviceType());
-  auto* dst = ws->GetBlob(oc.dst)->GetMutableTensor(Context::GetDeviceType());
+  auto* src = srcBlob->template GetMutable<Tensor<Context>>();
+  auto* dst = ws->GetBlob(oc.dst)->template GetMutable<Tensor<Context>>();
   auto timestep = src->size() / src->dim(0);
   auto dims = src->dims();
   const int32_t startDstTimestep =
@@ -95,7 +95,7 @@ void repeatCopy(
     T* dst,
     Context* context) {
   for (int i = 0; i < repeat_n; ++i) {
-    context->template CopySameDevice<T>(n, src, dst + i * n);
+    context->template Copy<T, Context, Context>(n, src, dst + i * n);
   }
 }
 
@@ -112,11 +112,11 @@ void initializeRecurrentInput(
     Context* context) {
   auto stateBlob = ws->GetBlob(rc.state);
   CAFFE_ENFORCE(stateBlob);
-  auto* state = stateBlob->GetMutableTensor(Context::GetDeviceType());
+  auto* state = stateBlob->template GetMutable<Tensor<Context>>();
 
   auto inputBlob = ws->GetBlob(rc.input);
   CAFFE_ENFORCE(inputBlob);
-  const auto& input = inputBlob->template Get<Tensor>();
+  const auto& input = inputBlob->template Get<Tensor<Context>>();
   CAFFE_ENFORCE_GE(input.ndim(), 1, rc.input);
   CAFFE_ENFORCE_LE(input.ndim(), 3, rc.input);
 
@@ -134,7 +134,7 @@ void initializeRecurrentInput(
 
   if (input.ndim() >= 2) {
     CAFFE_ENFORCE_EQ(input.dim(input.ndim() - 2), batchSize, rc.input);
-    context->template CopySameDevice<T>(
+    context->template Copy<T, Context, Context>(
         batchSize * stateSize * initialStateLength,
         input.template data<T>(),
         state->template mutable_data<T>());
@@ -654,11 +654,11 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
     for (auto& param : params_) {
       auto pBlob = sharedWs_->GetBlob(param.param);
       CAFFE_ENFORCE(pBlob);
-      const auto& p = pBlob->template Get<Tensor>();
+      const auto& p = pBlob->template Get<Tensor<Context>>();
 
       auto gBlob = sharedWs_->GetBlob(param.grad);
       CAFFE_ENFORCE(gBlob);
-      auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* g = gBlob->template GetMutable<Tensor<Context>>();
       g->ResizeLike(p);
       math::Set<T, Context>(
           g->size(),
@@ -670,11 +670,11 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
     for (auto& rg : recurrentGradients_) {
       auto pBlob = sharedWs_->GetBlob(rg.param);
       CAFFE_ENFORCE(pBlob);
-      const auto& p = pBlob->template Get<Tensor>();
+      const auto& p = pBlob->template Get<Tensor<Context>>();
 
       auto gBlob = sharedWs_->CreateBlob(rg.grad);
       CAFFE_ENFORCE(gBlob);
-      auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* g = gBlob->template GetMutable<Tensor<Context>>();
       g->ResizeLike(p);
       CAFFE_ENFORCE_EQ(g->ndim(), 3);
       const auto timestep = g->size() / g->dim(0);
@@ -701,7 +701,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
               << ". Size: " << Input(gradientInputIndex).size();
       auto pGradientBlob = sharedWs_->GetBlob(gradientName);
       CAFFE_ENFORCE(pGradientBlob);
-      auto* g = pGradientBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* g = pGradientBlob->template GetMutable<Tensor<Context>>();
       g->ResizeLike(Input(gradientInputIndex));
       g->template mutable_data<T>();
     }
@@ -715,11 +715,11 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
                 << rg.lastExternalGrad << " for final time step (sep. blob)";
         auto gBlob = sharedWs_->GetBlob(rg.grad);
         CAFFE_ENFORCE(gBlob);
-        auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
+        auto* g = gBlob->template GetMutable<Tensor<Context>>();
 
         auto oglastBlob = sharedWs_->GetBlob(rg.lastExternalGrad);
         CAFFE_ENFORCE(oglastBlob);
-        const auto& oglast = oglastBlob->template Get<Tensor>();
+        const auto& oglast = oglastBlob->template Get<Tensor<Context>>();
         CAFFE_ENFORCE_EQ(g->dim(1), oglast.dim(1));
         CAFFE_ENFORCE_EQ(g->dim(2), oglast.dim(2));
 
@@ -777,7 +777,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
       T* output_data = Output(outputIdx)->template mutable_data<T>();
       auto pBlob = sharedWs_->GetBlob(recurrentGradients_[i].grad);
       CAFFE_ENFORCE(pBlob);
-      auto* p = pBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* p = pBlob->template GetMutable<Tensor<Context>>();
 
       if (Input(inputId).ndim() >= 2) {
         // Gradient states blob should live. And if it gets changed by the
@@ -841,7 +841,7 @@ class AccumulateInputGradientOp : public Operator<Context> {
 
   template<typename T>
   bool DoRunWithType() {
-    const auto& t0 = OperatorBase::Input<Tensor>(0, CPU);
+    const auto& t0 = OperatorBase::Input<Tensor<CPUContext>>(0);
     const auto t = t0.template data<int32_t>()[0];
     auto& og = Input(1);
     auto* g = Output(0);
@@ -890,7 +890,7 @@ class RNNApplyLinkOp : public Operator<Context> {
   bool DoRunWithType() {
     // Both internal and external appear as both input and output to enforce
     // correct dependency computation.
-    const auto& t0 = OperatorBase::Input<Tensor>(0, CPU);
+    const auto& t0 = OperatorBase::Input<Tensor<CPUContext>>(0);
     const auto t = t0.template data<int32_t>()[0];
     auto& external = Input(1);
 
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 4b3496558436f..fe556147ad67a 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -60,11 +60,11 @@ RecurrentBaseOp<T>::~RecurrentBaseOp() {
 
 template <typename T>
 void RecurrentBaseOp<T>::initialize(
-    const Tensor& input,
-    Tensor* dropoutStates,
-    Tensor* output,
-    Tensor* hiddenOutput,
-    Tensor* cellOutput) {
+    const Tensor<CUDAContext>& input,
+    Tensor<CUDAContext>* dropoutStates,
+    Tensor<CUDAContext>* output,
+    Tensor<CUDAContext>* hiddenOutput,
+    Tensor<CUDAContext>* cellOutput) {
   static_assert(sizeof(T) == 4, ""); // workaround clang bug
   CAFFE_ENFORCE_GE(input.ndim(), 3);
   const int seqLength = input.dim(0);
@@ -458,13 +458,13 @@ bool RecurrentParamAccessOp<T, mode>::RunOnDevice() {
     if (mode == SET_PARAM) {
       CAFFE_ENFORCE_EQ(
           biasDims[0] * biasDims[1] * biasDims[2], Input(2).size());
-      context_.template CopySameDevice<T>(
+      context_.template Copy<T, CUDAContext, CUDAContext>(
           biasDims[0] * biasDims[1] * biasDims[2],
           Input(2).template data<T>(),
           static_cast<T*>(bias));
     } else {
       Output(0)->Resize(biasDims);
-      context_.template CopySameDevice<T>(
+      context_.template Copy<T, CUDAContext, CUDAContext>(
           biasDims[0] * biasDims[1] * biasDims[2],
           static_cast<T*>(bias),
           Output(0)->template mutable_data<T>());
@@ -495,13 +495,13 @@ bool RecurrentParamAccessOp<T, mode>::RunOnDevice() {
     CAFFE_ENFORCE_EQ(numDims, 3);
     if (mode == SET_PARAM) {
       CAFFE_ENFORCE_EQ(matDims[0] * matDims[1] * matDims[2], Input(2).size());
-      context_.template CopySameDevice<T>(
+      context_.template Copy<T, CUDAContext, CUDAContext>(
           matDims[0] * matDims[1] * matDims[2],
           Input(2).template data<T>(),
           static_cast<T*>(pmatrix));
     } else {
       Output(0)->Resize(matDims);
-      context_.template CopySameDevice<T>(
+      context_.template Copy<T, CUDAContext, CUDAContext>(
           matDims[0] * matDims[1] * matDims[2],
           static_cast<T*>(pmatrix),
           Output(0)->template mutable_data<T>());
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.h b/caffe2/operators/rnn/recurrent_op_cudnn.h
index 5c70b52620299..25bcc204a1e63 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.h
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.h
@@ -37,12 +37,12 @@ class RecurrentBaseOp : public Operator<CUDAContext> {
 
  protected:
   void initialize(
-      const Tensor& input,
-      Tensor* dropoutStates = nullptr,
+      const Tensor<CUDAContext>& input,
+      Tensor<CUDAContext>* dropoutStates = nullptr,
       // If passed, reshapes to the appropriate size
-      Tensor* output = nullptr,
-      Tensor* hiddenOutput = nullptr,
-      Tensor* cellOutput = nullptr);
+      Tensor<CUDAContext>* output = nullptr,
+      Tensor<CUDAContext>* hiddenOutput = nullptr,
+      Tensor<CUDAContext>* cellOutput = nullptr);
 
   CuDNNWrapper cudnn_wrapper_;
   cudnnDropoutDescriptor_t dropoutDesc_;
diff --git a/caffe2/operators/roi_align_gradient_op.cc b/caffe2/operators/roi_align_gradient_op.cc
index 269b57f94068e..1cc4103a535b5 100644
--- a/caffe2/operators/roi_align_gradient_op.cc
+++ b/caffe2/operators/roi_align_gradient_op.cc
@@ -202,7 +202,7 @@ bool RoIAlignGradientOp<float, CPUContext>::RunOnDevice() {
   // Must zero-out dX before accumulating gradients
   // (TODO): Kaiming - is this safe?
   math::Set<float, CPUContext>(
-      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
+      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
 
   if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
     ROIAlignBackwardFeature<float>(
@@ -216,7 +216,7 @@ bool RoIAlignGradientOp<float, CPUContext>::RunOnDevice() {
         pooled_height_,
         pooled_width_,
         sampling_ratio_,
-        dX->template mutable_data<float>(),
+        dX->mutable_data<float>(),
         R.data<float>(),
         R.dim32(1));
   }
diff --git a/caffe2/operators/roi_align_gradient_op.cu b/caffe2/operators/roi_align_gradient_op.cu
index 0ba327663ec59..534d55ddd9a46 100644
--- a/caffe2/operators/roi_align_gradient_op.cu
+++ b/caffe2/operators/roi_align_gradient_op.cu
@@ -193,15 +193,15 @@ bool RoIAlignGradientOp<float, CUDAContext>::RunOnDevice() {
   auto& R = Input(1); // RoIs
   auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
-  auto* dX = Output(0); // Gradient of net w.r.t. input to
-                        // "forward" op (aka "gradInput")
+  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
+                        // (aka "gradInput")
 
   dX->ResizeLike(X);
 
   // Must zero-out dX before accumulating gradients
   // (TODO): Kaiming - is this safe?
   math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
+      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
 
   if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
     RoIAlignBackwardFeature<float>
@@ -219,7 +219,7 @@ bool RoIAlignGradientOp<float, CUDAContext>::RunOnDevice() {
             pooled_height_,
             pooled_width_,
             sampling_ratio_,
-            dX->template mutable_data<float>(),
+            dX->mutable_data<float>(),
             R.data<float>());
   }
   return true;
diff --git a/caffe2/operators/roi_align_op.cc b/caffe2/operators/roi_align_op.cc
index f0c6a10f7cdb9..0d62dcfd71bee 100644
--- a/caffe2/operators/roi_align_op.cc
+++ b/caffe2/operators/roi_align_op.cc
@@ -283,7 +283,7 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
       Y->Resize(0, pooled_height_, pooled_width_, X.dim32(3));
     }
     // The following mutable_data calls are needed to allocate the tensors
-    Y->template mutable_data<float>();
+    Y->mutable_data<float>();
     return true;
   }
 
@@ -308,7 +308,7 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
         sampling_ratio_,
         R.data<float>(),
         R.dim32(1),
-        Y->template mutable_data<float>(),
+        Y->mutable_data<float>(),
         order_);
   } else if (order_ == StorageOrder::NHWC) {
     Y->Resize(R.dim32(0), pooled_height_, pooled_width_, X.dim32(3));
@@ -325,7 +325,7 @@ bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
         sampling_ratio_,
         R.data<float>(),
         R.dim32(1),
-        Y->template mutable_data<float>(),
+        Y->mutable_data<float>(),
         order_);
   }
 
diff --git a/caffe2/operators/roi_align_op.cu b/caffe2/operators/roi_align_op.cu
index bfd108ff24c85..e512f3d974139 100644
--- a/caffe2/operators/roi_align_op.cu
+++ b/caffe2/operators/roi_align_op.cu
@@ -156,7 +156,7 @@ bool RoIAlignOp<float, CUDAContext>::RunOnDevice() {
     // Handle empty rois
     Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
     // The following mutable_data calls are needed to allocate the tensors
-    Y->template mutable_data<float>();
+    Y->mutable_data<float>();
     return true;
   }
 
diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc
index 92eafefcb65eb..ed4ef33a1d688 100644
--- a/caffe2/operators/roi_align_op_gpu_test.cc
+++ b/caffe2/operators/roi_align_op_gpu_test.cc
@@ -18,7 +18,7 @@ void AddConstInput(
     Context* context,
     Workspace* ws) {
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(Context::GetDeviceType());
+  auto* tensor = blob->GetMutable<Tensor<Context>>();
   tensor->Resize(shape);
   math::Set<float, Context>(
       tensor->size(), value, tensor->template mutable_data<float>(), context);
@@ -39,10 +39,10 @@ void AddInput<CPUContext>(
     const string& name,
     Workspace* ws) {
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
   EigenVectorMap<float> tensor_vec(
-      tensor->template mutable_data<float>(), tensor->size());
+      tensor->mutable_data<float>(), tensor->size());
   tensor_vec.array() = utils::AsEArrXt(values);
 }
 
@@ -52,12 +52,12 @@ void AddInput<CUDAContext>(
     const vector<float>& values,
     const string& name,
     Workspace* ws) {
-  Tensor tmp(shape, CPU);
+  TensorCPU tmp(shape);
   EigenVectorMap<float> tmp_vec(tmp.mutable_data<float>(), tmp.size());
   tmp_vec.array() = utils::AsEArrXt(values);
 
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = blob->template GetMutable<Tensor<CUDAContext>>();
   tensor->CopyFrom(tmp);
 }
 
@@ -186,7 +186,7 @@ void CreateAndRun(
   Blob* Y_blob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, Y_blob);
 
-  auto& Y = Y_blob->Get<Tensor>();
+  auto& Y = Y_blob->Get<Tensor<Context>>();
   outResult->CopyFrom(Y, &context);
 }
 
@@ -196,9 +196,9 @@ TEST(RoiAlignTest, CheckCPUGPUEqual) {
   if (!caffe2::HasCudaGPU())
     return;
 
-  Tensor y_cpu(CPU);
-  Tensor y_gpu(CPU);
-  Tensor y_cpu_nhwc(CPU);
+  TensorCPU y_cpu;
+  TensorCPU y_gpu;
+  TensorCPU y_cpu_nhwc;
 
   // tests using FAIR example
   {
diff --git a/caffe2/operators/roi_pool_op.cc b/caffe2/operators/roi_pool_op.cc
index 00e3ccde3e4ef..d369aecd638e6 100644
--- a/caffe2/operators/roi_pool_op.cc
+++ b/caffe2/operators/roi_pool_op.cc
@@ -31,8 +31,8 @@ bool RoIPoolOp<float, CPUContext>::RunOnDevice() {
 
   const float* Xdata = X.data<float>();
   const float* rois = R.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
-  int* argmax_data = is_test_ ? nullptr : A->template mutable_data<int>();
+  float* Ydata = Y->mutable_data<float>();
+  int* argmax_data = is_test_ ? nullptr : A->mutable_data<int>();
 
   // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
   for (int n = 0; n < num_rois; ++n) {
diff --git a/caffe2/operators/roi_pool_op.cu b/caffe2/operators/roi_pool_op.cu
index db18b3f551a74..45839117b2eda 100644
--- a/caffe2/operators/roi_pool_op.cu
+++ b/caffe2/operators/roi_pool_op.cu
@@ -133,10 +133,10 @@ bool RoIPoolOp<float, CUDAContext>::RunOnDevice() {
   if (R.size() == 0) {
     Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
     // mutable_data calls are needed to allocate the tensors
-    Y->template mutable_data<float>();
+    Y->mutable_data<float>();
     if (!is_test_) {
       A->Resize(Y->dims());
-      A->template mutable_data<int>();
+      A->mutable_data<int>();
     }
     return true;
   }
@@ -146,23 +146,23 @@ bool RoIPoolOp<float, CUDAContext>::RunOnDevice() {
     A->Resize(Y->dims());
   }
   int output_size = Y->size();
-  int* argmax_data = is_test_ ? nullptr : A->template mutable_data<int>();
-  ROIPoolForward<float>
-      <<<CAFFE_GET_BLOCKS(output_size),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          spatial_scale_,
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          pooled_height_,
-          pooled_width_,
-          R.data<float>(),
-          Y->template mutable_data<float>(),
-          argmax_data);
+  int* argmax_data = is_test_ ? nullptr : A->mutable_data<int>();
+  ROIPoolForward<float><<<
+      CAFFE_GET_BLOCKS(output_size),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      output_size,
+      X.data<float>(),
+      spatial_scale_,
+      X.dim32(1),
+      X.dim32(2),
+      X.dim32(3),
+      pooled_height_,
+      pooled_width_,
+      R.data<float>(),
+      Y->mutable_data<float>(),
+      argmax_data);
   return true;
 }
 
@@ -179,25 +179,25 @@ bool RoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
   dX->ResizeLike(X);
   // Must zero-out dX before accumulating gradients
   math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
+      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
   if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
-    ROIPoolBackward<float>
-        <<<CAFFE_GET_BLOCKS(dY.size()),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            dY.size(),
-            dY.data<float>(),
-            A.data<int>(),
-            R.dim32(0),
-            spatial_scale_,
-            X.dim32(1),
-            X.dim32(2),
-            X.dim32(3),
-            pooled_height_,
-            pooled_width_,
-            dX->template mutable_data<float>(),
-            R.data<float>());
+    ROIPoolBackward<float><<<
+        CAFFE_GET_BLOCKS(dY.size()),
+        CAFFE_CUDA_NUM_THREADS,
+        0,
+        context_.cuda_stream()>>>(
+        dY.size(),
+        dY.data<float>(),
+        A.data<int>(),
+        R.dim32(0),
+        spatial_scale_,
+        X.dim32(1),
+        X.dim32(2),
+        X.dim32(3),
+        pooled_height_,
+        pooled_width_,
+        dX->mutable_data<float>(),
+        R.data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/scale_op.cc b/caffe2/operators/scale_op.cc
index db0e3e1c42dac..f246db4495231 100644
--- a/caffe2/operators/scale_op.cc
+++ b/caffe2/operators/scale_op.cc
@@ -4,15 +4,15 @@ namespace caffe2 {
 
 REGISTER_CPU_OPERATOR(Scale, ScaleOp<CPUContext>);
 OPERATOR_SCHEMA(Scale)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}})
-    .IdenticalTypeAndShape()
-    .SetDoc(R"DOC(
-Scale takes one input data (Tensor) and produces one output data
-(Tensor) whose value is the input data tensor scaled element-wise.
+  .NumInputs(1)
+  .NumOutputs(1)
+  .AllowInplace({{0, 0}})
+  .IdenticalTypeAndShape()
+  .SetDoc(R"DOC(
+Scale takes one input data (Tensor<float>) and produces one output data
+(Tensor<float>) whose value is the input data tensor scaled element-wise.
 )DOC")
-    .Arg("scale", "(float, default 1.0) the scale to apply.");
+  .Arg("scale", "(float, default 1.0) the scale to apply.");
 
 class GetScaleGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h
index 50b344611ee82..1d51692ac71f1 100644
--- a/caffe2/operators/segment_reduction_op.h
+++ b/caffe2/operators/segment_reduction_op.h
@@ -13,7 +13,7 @@ class BaseInputAccessor {
  public:
   BaseInputAccessor() {}
 
-  bool observeInput(const Tensor& dataInput) {
+  bool observeInput(const Tensor<CPUContext>& dataInput) {
     data_ = dataInput.raw_data();
     return dataInput.template IsType<TData>();
   }
@@ -373,7 +373,7 @@ class AbstractReduceFrontOrBackGradientOp : public Operator<Context> {
   template <int FixedSize>
   bool DoRunWithValue() {
     auto& reduction_grad = Input(REDUCTION_GRAD);
-    auto& source_shape = OperatorBase::Input<Tensor>(SOURCE_SHAPE, CPU);
+    auto& source_shape = OperatorBase::Input<TensorCPU>(SOURCE_SHAPE);
 
     auto* data_grads = Output(0);
 
diff --git a/caffe2/operators/segment_reduction_op_gpu.cu b/caffe2/operators/segment_reduction_op_gpu.cu
index 6eec2deba9ce2..9a2d3a8f78ee0 100644
--- a/caffe2/operators/segment_reduction_op_gpu.cu
+++ b/caffe2/operators/segment_reduction_op_gpu.cu
@@ -13,8 +13,8 @@ namespace {
 void inclusive_scan_wrapper(
     const int* length_data,
     int len_length,
-    Tensor* temp_buffer,
-    Tensor* prefix_sum_out,
+    Tensor<CUDAContext>* temp_buffer,
+    Tensor<CUDAContext>* prefix_sum_out,
     CUDAContext* context_) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -22,20 +22,19 @@ void inclusive_scan_wrapper(
       NULL,
       temp_storage_bytes,
       length_data,
-      prefix_sum_out->template mutable_data<int>(),
+      prefix_sum_out->mutable_data<int>(),
       len_length,
       context_->cuda_stream());
   // Allocate temporary storage
   auto buffer_size = (temp_storage_bytes + sizeof(int)) / sizeof(int);
   temp_buffer->Resize(buffer_size);
-  void* d_temp_storage =
-      static_cast<void*>(temp_buffer->template mutable_data<int>());
+  void* d_temp_storage = static_cast<void*>(temp_buffer->mutable_data<int>());
   // Run inclusive prefix sum
   cub::DeviceScan::InclusiveSum(
       d_temp_storage,
       temp_storage_bytes,
       length_data,
-      prefix_sum_out->template mutable_data<int>(),
+      prefix_sum_out->mutable_data<int>(),
       len_length,
       context_->cuda_stream());
 }
@@ -524,8 +523,8 @@ class CUDASparseLengthsSumOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename T, class Context = CUDAContext, bool SparseFused = true>
@@ -646,8 +645,8 @@ class CUDASparseLengthsMeanOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename T, class Context = CUDAContext, bool SparseFused = true>
@@ -780,8 +779,8 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename T, class Context = CUDAContext, bool SparseFused = true>
@@ -880,8 +879,8 @@ class CUDASparseLengthsWeightedSumOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename SIndex>
@@ -989,7 +988,7 @@ class CUDAUnsortedSegmentSumOp : public Operator<CUDAContext> {
     }
 
     SIndex K = 0;
-    context_.CopyBytesToCPU(
+    context_.CopyBytes<CUDAContext, CPUContext>(
         sizeof(SIndex), K_tensor_.template data<SIndex>(), &K);
     context_.FinishDeviceComputation();
 
@@ -1047,9 +1046,9 @@ class CUDAUnsortedSegmentSumOp : public Operator<CUDAContext> {
   }
 
  private:
-  Tensor buffer_tensor_{CUDA};
-  Tensor K_tensor_{CUDA};
-  Tensor scaling_factors_{CUDA}; // for mean
+  Tensor<CUDAContext> buffer_tensor_;
+  Tensor<CUDAContext> K_tensor_;
+  Tensor<CUDAContext> scaling_factors_; // for mean
 };
 
 template <typename SIndex>
@@ -1098,7 +1097,7 @@ class SortedSegmentRangeMeanOp : public Operator<Context> {
     auto* output = Output(0);
     auto dims = input.dims();
     SIndex K = 0;
-    context_.CopyBytesToCPU(
+    context_.template CopyBytes<Context, CPUContext>(
         sizeof(SIndex),
         indices.template data<SIndex>() + indices.size() - 1,
         &K);
@@ -1157,9 +1156,9 @@ class SortedSegmentRangeMeanOp : public Operator<Context> {
   }
 
  private:
-  Tensor segment_len_{CUDA}; // for mean
-  Tensor segment_len_prefix_sum_{CUDA};
-  Tensor prefix_buffer_{CUDA};
+  Tensor<CUDAContext> segment_len_; // for mean
+  Tensor<CUDAContext> segment_len_prefix_sum_;
+  Tensor<CUDAContext> prefix_buffer_;
 };
 
 template <typename T, typename SIndex, bool LOGEXP = false>
@@ -1202,7 +1201,7 @@ class SortedSegmentRangeMeanGradientOp : public Operator<Context> {
     const int N = X.size_from_dim(1);
 
     SIndex K = 0;
-    context_.CopyBytesToCPU(
+    context_.template CopyBytes<Context, CPUContext>(
         sizeof(SIndex), I.template data<SIndex>() + I.size() - 1, &K);
 
     K += 1;
@@ -1242,7 +1241,7 @@ class SortedSegmentRangeMeanGradientOp : public Operator<Context> {
   }
 
  private:
-  Tensor segment_len_{CUDA}; // for mean
+  Tensor<CUDAContext> segment_len_; // for mean
 };
 
 REGISTER_CUDA_OPERATOR_STR(
@@ -1359,8 +1358,8 @@ class CUDASparseLengthsSumGradientWithIndicesOp : public Operator<CUDAContext> {
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename T, class Context = CUDAContext>
@@ -1438,8 +1437,8 @@ class CUDASparseLengthsMeanGradientWithIndicesOp
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename T, class Context = CUDAContext>
@@ -1527,8 +1526,8 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename T, bool ExactBlock = false>
@@ -1665,8 +1664,8 @@ class CUDALengthsMaxWithMainInputAndForwardOutputGradientOp
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 template <typename T, class Context = CUDAContext>
@@ -1793,8 +1792,8 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
 
  private:
   // menber field to manage memory
-  Tensor inclusive_scan_buffer_{CUDA};
-  Tensor inclusive_scan_length_buffer_{CUDA};
+  Tensor<Context> inclusive_scan_buffer_;
+  Tensor<Context> inclusive_scan_length_buffer_;
 };
 
 // Needed because name is auto-generated in segment_reduction_op.cc:224
diff --git a/caffe2/operators/selu_op.cc b/caffe2/operators/selu_op.cc
index 45467ef2c8183..50d823d8bedf1 100644
--- a/caffe2/operators/selu_op.cc
+++ b/caffe2/operators/selu_op.cc
@@ -12,7 +12,7 @@ bool SeluOp<float, CPUContext>::RunOnDevice() {
   Y->ResizeLike(X);
 
   ConstEigenVectorArrayMap<float> Xvec(X.data<float>(), X.size());
-  EigenVectorArrayMap<float> Yvec(Y->template mutable_data<float>(), Y->size());
+  EigenVectorArrayMap<float> Yvec(Y->mutable_data<float>(), Y->size());
   Yvec = lambda_ * (Xvec > 0).select(Xvec, (alpha_ * Xvec.exp() - alpha_));
   return true;
 }
@@ -27,8 +27,7 @@ bool SeluGradientOp<float, CPUContext>::RunOnDevice() {
 
   ConstEigenVectorArrayMap<float> Yvec(Y.data<float>(), Y.size());
   ConstEigenVectorArrayMap<float> dYvec(dY.data<float>(), dY.size());
-  EigenVectorArrayMap<float> dXvec(
-      dX->template mutable_data<float>(), dX->size());
+  EigenVectorArrayMap<float> dXvec(dX->mutable_data<float>(), dX->size());
 
   const float la = lambda_ * alpha_;
   dXvec = (Yvec > 0).select(lambda_ * dYvec, dYvec * (Yvec + la));
diff --git a/caffe2/operators/selu_op.cu b/caffe2/operators/selu_op.cu
index f2339acb20a20..95eb2c54ee96a 100644
--- a/caffe2/operators/selu_op.cu
+++ b/caffe2/operators/selu_op.cu
@@ -38,11 +38,7 @@ bool SeluOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(),
-      X.data<float>(),
-      Y->template mutable_data<float>(),
-      alpha_,
-      lambda_);
+      X.size(), X.data<float>(), Y->mutable_data<float>(), alpha_, lambda_);
   return true;
 }
 
@@ -62,7 +58,7 @@ bool SeluGradientOp<float, CUDAContext>::RunOnDevice() {
       Y.size(),
       Y.data<float>(),
       dY.data<float>(),
-      dX->template mutable_data<float>(),
+      dX->mutable_data<float>(),
       alpha_,
       lambda_);
   return true;
diff --git a/caffe2/operators/sequence_ops.cc b/caffe2/operators/sequence_ops.cc
index 2b7b820956867..4dd8e65aa3842 100644
--- a/caffe2/operators/sequence_ops.cc
+++ b/caffe2/operators/sequence_ops.cc
@@ -95,7 +95,7 @@ bool RemovePaddingOp<CPUContext>::DoRunWithType() {
   std::transform(
       lengths_ptr,
       lengths_ptr + lengths_size,
-      lengths_out->template mutable_data<int32_t>(),
+      lengths_out->mutable_data<int32_t>(),
       [pad_width](int32_t x) { return x - pad_width; });
   return true;
 }
@@ -156,7 +156,7 @@ bool AddPaddingOp<CPUContext>::MakePadding(
   std::transform(
       lengths_ptr,
       lengths_ptr + lengths_size,
-      lengths_out->template mutable_data<int32_t>(),
+      lengths_out->mutable_data<int32_t>(),
       [pad_width](int32_t x) { return x + pad_width; });
   return true;
 }
@@ -203,7 +203,7 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
         static_cast<char*>(out_features->raw_mutable_data(features.meta()));
     auto src_base = static_cast<const char*>(features.raw_data());
     // copy data and add padding index as zero
-    Tensor zero{CPU};
+    Tensor<CPUContext> zero;
     zero.Resize(block_size);
     auto zeroPtr =
         static_cast<const char*>(zero.raw_mutable_data(features.meta()));
@@ -211,7 +211,7 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
     int start_src = 0;
     for (int i = 0; i < lengths.size(); ++i) {
       if (lengthsPtr[i] == 0) {
-        context_.CopyItemsSameDevice(
+        context_.template CopyItems<CPUContext, CPUContext>(
             features.meta(),
             block_size,
             zeroPtr,
@@ -219,7 +219,7 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
         start_dest += block_size;
       } else {
         auto src = src_base + start_src * features.meta().itemsize();
-        context_.CopyItemsSameDevice(
+        context_.template CopyItems<CPUContext, CPUContext>(
             features.meta(),
             lengthsPtr[i] * block_size,
             src,
diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu
index 95ad9ece32d41..549c288c0368b 100644
--- a/caffe2/operators/sequence_ops.cu
+++ b/caffe2/operators/sequence_ops.cu
@@ -126,8 +126,8 @@ template <bool Inclusive = true>
 void lengths_prefix_sum(
     const int32_t* lengths,
     int32_t num_items,
-    Tensor* prefix_buffer,
-    Tensor* prefix_sum,
+    Tensor<CUDAContext>* prefix_buffer,
+    Tensor<CUDAContext>* prefix_sum,
     CUDAContext* context) {
   // Retrieve buffer size
   size_t temp_storage_bytes = 0;
@@ -137,7 +137,7 @@ void lengths_prefix_sum(
         NULL,
         temp_storage_bytes,
         lengths,
-        prefix_sum->template mutable_data<int32_t>(),
+        prefix_sum->mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   } else {
@@ -145,7 +145,7 @@ void lengths_prefix_sum(
         NULL,
         temp_storage_bytes,
         lengths,
-        prefix_sum->template mutable_data<int32_t>(),
+        prefix_sum->mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   }
@@ -154,14 +154,14 @@ void lengths_prefix_sum(
   auto buffer_size = (temp_storage_bytes + sizeof(int32_t)) / sizeof(int32_t);
   prefix_buffer->Resize(buffer_size);
   void* d_temp_storage =
-      static_cast<void*>(prefix_buffer->template mutable_data<int32_t>());
+      static_cast<void*>(prefix_buffer->mutable_data<int32_t>());
 
   if (Inclusive) {
     cub::DeviceScan::InclusiveSum(
         d_temp_storage,
         temp_storage_bytes,
         lengths,
-        prefix_sum->template mutable_data<int32_t>(),
+        prefix_sum->mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   } else {
@@ -169,7 +169,7 @@ void lengths_prefix_sum(
         d_temp_storage,
         temp_storage_bytes,
         lengths,
-        prefix_sum->template mutable_data<int32_t>(),
+        prefix_sum->mutable_data<int32_t>(),
         num_items,
         context->cuda_stream());
   }
@@ -204,7 +204,7 @@ bool AddPaddingOp<CUDAContext>::MakePadding(
   if (OutputSize() > 1) {
     auto* lengths_out = Output(1);
     lengths_out->Resize(lengths_size);
-    lengths_out_ptr = lengths_out->template mutable_data<int32_t>();
+    lengths_out_ptr = lengths_out->mutable_data<int32_t>();
   }
 
   if (lengths_size == 0) {
@@ -274,7 +274,7 @@ bool RemovePaddingOp<CUDAContext>::DoRunWithType() {
   if (OutputSize() > 1) {
     auto* lengths_out = Output(1);
     lengths_out->Resize(lengths_size);
-    lengths_out_ptr = lengths_out->template mutable_data<int32_t>();
+    lengths_out_ptr = lengths_out->mutable_data<int32_t>();
   }
 
   if (lengths_size == 0) {
diff --git a/caffe2/operators/sequence_ops.h b/caffe2/operators/sequence_ops.h
index d91f3f701c199..c29ff7bfda64a 100644
--- a/caffe2/operators/sequence_ops.h
+++ b/caffe2/operators/sequence_ops.h
@@ -93,8 +93,8 @@ class GatherPaddingOp final : public Operator<Context> {
   int startPaddingWidth_;
   int endPaddingWidth_;
   // Scratch space required by the CUDA version
-  Tensor lengths_prefix_sum_buffer_{Context::GetDeviceType()};
-  Tensor lengths_prefix_sum_{Context::GetDeviceType()};
+  Tensor<Context> lengths_prefix_sum_buffer_;
+  Tensor<Context> lengths_prefix_sum_;
 };
 
 template <class Context>
@@ -133,8 +133,8 @@ class RemovePaddingOp final : public Operator<Context> {
   int endPaddingWidth_;
 
   // Scratch space required by the CUDA version
-  Tensor lengths_prefix_sum_buffer_{Context::GetDeviceType()};
-  Tensor lengths_prefix_sum_{Context::GetDeviceType()};
+  Tensor<Context> lengths_prefix_sum_buffer_;
+  Tensor<Context> lengths_prefix_sum_;
 };
 
 template <class Context>
@@ -236,8 +236,8 @@ class AddPaddingOp final : public Operator<Context> {
   int endPaddingWidth_;
 
   // Scratch space required by the CUDA version
-  Tensor lengths_prefix_sum_buffer_{Context::GetDeviceType()};
-  Tensor lengths_prefix_sum_{Context::GetDeviceType()};
+  Tensor<Context> lengths_prefix_sum_buffer_;
+  Tensor<Context> lengths_prefix_sum_;
 };
 
 template <class Context>
diff --git a/caffe2/operators/shape_op.h b/caffe2/operators/shape_op.h
index 05ea7a2f7c5fe..128a00a3d1561 100644
--- a/caffe2/operators/shape_op.h
+++ b/caffe2/operators/shape_op.h
@@ -19,13 +19,13 @@ class ShapeOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& data = Input(DATA);
-    auto* output = Output(0);
+    auto* output = OperatorBase::Output<Tensor<Context>>(0);
     int numDims = data.ndim();
     int numAxes = axes_.size();
     if (numAxes == 0) {
       output->Resize(numDims);
       TIndex* output_data = output->template mutable_data<TIndex>();
-      context_.CopyBytesSameDevice(
+      context_.template CopyBytes<Context, Context>(
           numDims * sizeof(TIndex), data.dims().data(), output_data);
       return true;
     }
@@ -37,7 +37,7 @@ class ShapeOp : public Operator<Context> {
       auto axis = axes_[i];
       CAFFE_ENFORCE_LT(axis, numDims, "Axis out of range");
       CAFFE_ENFORCE_GE(axis, 0, "Each axis should be non-negative");
-      context_.CopyBytesSameDevice(
+      context_.template CopyBytes<Context, Context>(
           sizeof(TIndex), src + axis * sizeof(TIndex), out);
       out += sizeof(TIndex);
     }
diff --git a/caffe2/operators/sinusoid_position_encoding_op.h b/caffe2/operators/sinusoid_position_encoding_op.h
index 101fd56d12603..5591b9749a704 100644
--- a/caffe2/operators/sinusoid_position_encoding_op.h
+++ b/caffe2/operators/sinusoid_position_encoding_op.h
@@ -28,7 +28,7 @@ class SinusoidPositionEncodingOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(0, CPU));
+        this, OperatorBase::Input<TensorCPU>(0));
   }
 
   template <typename Index>
diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu
index e2523ad7cbf3f..a9ac0db0d08fb 100644
--- a/caffe2/operators/slice_op.cu
+++ b/caffe2/operators/slice_op.cu
@@ -49,13 +49,13 @@ __global__ void SliceCopyKernel(
 
 template <class SIndex, class Context>
 bool SliceImplGpu(
-    Tensor* output,
-    const Tensor& data,
+    Tensor<Context>* output,
+    const Tensor<Context>& data,
     const TensorCPU& starts,
     const TensorCPU& ends,
     Context* context,
-    Tensor* gdata = nullptr,
-    const Tensor* go = nullptr) {
+    Tensor<Context>* gdata = nullptr,
+    const Tensor<Context>* go = nullptr) {
   bool backward = output == nullptr;
 
   auto* starts_data = starts.template data<SIndex>();
@@ -237,8 +237,8 @@ bool SliceOp<int, CUDAContext>::RunOnDevice() {
   auto& data = Input(0);
 
   if (InputSize() > 1) {
-    starts_host_.CopyFrom(Input(1));
-    ends_host_.CopyFrom(Input(2));
+    starts_host_.CopyFrom<CUDAContext>(Input(1));
+    ends_host_.CopyFrom<CUDAContext>(Input(2));
   } else {
     if (!statically_inited_) {
       CAFFE_ENFORCE(HasArgument("starts"));
@@ -272,8 +272,8 @@ bool SliceGradientOp<int, CUDAContext>::RunOnDevice() {
   auto& data = Input(0);
 
   if (InputSize() == 4) {
-    starts_host_.CopyFrom(Input(1));
-    ends_host_.CopyFrom(Input(2));
+    starts_host_.CopyFrom<CUDAContext>(Input(1));
+    ends_host_.CopyFrom<CUDAContext>(Input(2));
 
     auto& go = Input(3);
 
diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h
index 6c8872db7fa6d..12734a8e33df7 100644
--- a/caffe2/operators/slice_op.h
+++ b/caffe2/operators/slice_op.h
@@ -11,13 +11,13 @@ namespace {
 
 template <class SIndex, class Context>
 bool SliceImpl(
-    Tensor* output,
-    const Tensor& data,
-    const Tensor& starts,
-    const Tensor& ends,
+    Tensor<Context>* output,
+    const Tensor<Context>& data,
+    const Tensor<Context>& starts,
+    const Tensor<Context>& ends,
     Context* context,
-    Tensor* gdata = nullptr,
-    const Tensor* go = nullptr) {
+    Tensor<Context>* gdata = nullptr,
+    const Tensor<Context>* go = nullptr) {
   bool backward = output == nullptr;
 
   auto* starts_data = starts.template data<SIndex>();
@@ -140,7 +140,7 @@ bool SliceImpl(
       DCHECK_LE(
           static_cast<void*>(local_dst_offset_bytes + dst_block_size_bytes),
           static_cast<void*>(dst_bytes + dst_nbytes));
-      context->CopyItemsSameDevice(
+      context->template CopyItems<Context, Context>(
           data.meta(),
           dst_block_size,
           (void*)local_src_offset_bytes,
@@ -186,7 +186,7 @@ bool SliceImpl(
       DCHECK_LE(
           local_dst_offset_bytes + src_block_size_bytes,
           dst_bytes + dst_nbytes);
-      context->CopyItemsSameDevice(
+      context->template CopyItems<Context, Context>(
           go->meta(),
           src_block_size,
           (void*)local_src_offset_bytes,
@@ -213,10 +213,10 @@ class SliceOp : public Operator<Context> {
   }
 
  protected:
-  bool RunOnDeviceImpl(const Tensor& data, Tensor* output) {
+  bool RunOnDeviceImpl(const Tensor<Context>& data, Tensor<Context>* output) {
     if (InputSize() > 1) {
-      starts_host_.CopyFrom(Input(1));
-      ends_host_.CopyFrom(Input(2));
+      starts_host_.template CopyFrom<Context>(Input(1));
+      ends_host_.template CopyFrom<Context>(Input(2));
     } else {
       if (!statically_inited_) {
         CAFFE_ENFORCE(HasArgument("starts"));
@@ -248,8 +248,8 @@ class SliceOp : public Operator<Context> {
   std::vector<SIndex> starts_;
   std::vector<SIndex> ends_;
   bool statically_inited_;
-  Tensor starts_host_{CPU};
-  Tensor ends_host_{CPU};
+  TensorCPU starts_host_;
+  TensorCPU ends_host_;
 };
 
 template <class SIndex, class Context>
@@ -267,8 +267,8 @@ class SliceGradientOp : public Operator<Context> {
     auto& data = Input(0);
 
     if (InputSize() == 4) {
-      starts_host_.CopyFrom(Input(1));
-      ends_host_.CopyFrom(Input(2));
+      starts_host_.template CopyFrom<Context>(Input(1));
+      ends_host_.template CopyFrom<Context>(Input(2));
 
       auto& go = Input(3);
 
@@ -307,7 +307,7 @@ class SliceGradientOp : public Operator<Context> {
   std::vector<SIndex> starts_;
   std::vector<SIndex> ends_;
   bool statically_inited_;
-  Tensor starts_host_{CPU};
-  Tensor ends_host_{CPU};
+  TensorCPU starts_host_;
+  TensorCPU ends_host_;
 };
 } // namespace caffe2
diff --git a/caffe2/operators/softmax_op.cc b/caffe2/operators/softmax_op.cc
index 3f338492ce3e1..881b939dd0bb8 100644
--- a/caffe2/operators/softmax_op.cc
+++ b/caffe2/operators/softmax_op.cc
@@ -12,7 +12,7 @@ bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
   const int N = X.size_to_dim(canonical_axis);
   const int D = X.size_from_dim(canonical_axis);
   Y->ResizeLike(X);
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   // First, get scales
   if (scale_.size() != N) {
     scale_.Resize(N);
@@ -64,7 +64,7 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
   if (N == 0) {
     return true;
   }
-  context_.CopySameDevice<float>(Y.size(), dYdata, dXdata);
+  context_.Copy<float, CPUContext, CPUContext>(Y.size(), dYdata, dXdata);
   float* scaledata = scale_.mutable_data<float>();
   for (int i = 0; i < N; ++i) {
     math::Dot<float, CPUContext>(D, Ydata + i * D, dYdata + i * D,
diff --git a/caffe2/operators/softmax_op.h b/caffe2/operators/softmax_op.h
index 8fd70fded99b1..9073a0e6a98f7 100644
--- a/caffe2/operators/softmax_op.h
+++ b/caffe2/operators/softmax_op.h
@@ -19,9 +19,9 @@ class SoftmaxOp final : public Operator<Context> {
 
  protected:
   int axis_;
-  Tensor scale_{Context::GetDeviceType()};
-  Tensor rowmax_{Context::GetDeviceType()};
-  Tensor sum_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> scale_;
+  Tensor<Context> rowmax_;
+  Tensor<Context> sum_multiplier_;
 };
 
 template <typename T, class Context>
@@ -35,8 +35,8 @@ class SoftmaxGradientOp final : public Operator<Context> {
 
  protected:
   int axis_;
-  Tensor scale_{Context::GetDeviceType()};
-  Tensor sum_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> scale_;
+  Tensor<Context> sum_multiplier_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
index 05b91c3b4d164..08dbf6e7d07a4 100644
--- a/caffe2/operators/softmax_ops.cu
+++ b/caffe2/operators/softmax_ops.cu
@@ -243,7 +243,7 @@ void Softmax(
 
   math::RowwiseMax<float, CUDAContext>(N, D, logits, rowmax, context);
   // Put the intermediate result X - max(X) into Y
-  context->CopySameDevice<float>(size, logits, probs);
+  context->Copy<float, CUDAContext, CUDAContext>(size, logits, probs);
   // Subtract the scale
   math::Gemm<float, CUDAContext>(
       CblasNoTrans,
@@ -327,7 +327,7 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
       sum_multiplier_.data<float>(),
       losses_.mutable_data<float>(),
       rowmax_.mutable_data<float>(),
-      P->template mutable_data<float>(),
+      P->mutable_data<float>(),
       !label_prob_mode_, // logarithmic output
       &context_);
   // Compute label xent loss per example
@@ -346,7 +346,7 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
     // Since we had logarithmic output, we need to exponentiate
     // them again.
     math::Exp<float, CUDAContext>(
-        N * D, P->data<float>(), P->template mutable_data<float>(), &context_);
+        N * D, P->data<float>(), P->mutable_data<float>(), &context_);
   } else {
     ProbCrossEntropyKernel<<<
         std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -375,7 +375,7 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
   }
 
   // Sum of all losses
-  float* avg_loss_data = avg_loss->template mutable_data<float>();
+  float* avg_loss_data = avg_loss->mutable_data<float>();
   math::Sum<float, CUDAContext>(
       losses_.size(), losses_.data<float>(), avg_loss_data, &context_, &scratch_);
   // Average of input batch size
@@ -413,7 +413,7 @@ bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
   }
 
   const float* Xdata = X.data<float>();
-  float* Pdata = P->template mutable_data<float>();
+  float* Pdata = P->mutable_data<float>();
 
   // Softmax for each x,y location
   SpatialSoftmaxKernel<<<
@@ -424,7 +424,7 @@ bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
 
   // Cross entropy
   avg_loss->Resize(vector<TIndex>());
-  float* avg_loss_data = avg_loss->template mutable_data<float>();
+  float* avg_loss_data = avg_loss->mutable_data<float>();
   math::Set<float, CUDAContext>(1, 0.0f, avg_loss_data, &context_);
 
   const int* label_data = T.data<int>();
@@ -516,19 +516,15 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
     if (weights == nullptr) {
       // Copy softmax probabilities into dX
       if (!only_loss_) {
-        context_.CopySameDevice<float>(
-            P.size(), P.data<float>(), dX->template mutable_data<float>());
+        context_.Copy<float, CUDAContext, CUDAContext>(
+            P.size(), P.data<float>(), dX->mutable_data<float>());
       }
       LabelCrossEntropyGradientKernel<<<
           CAFFE_GET_BLOCKS(N),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context_.cuda_stream()>>>(
-          N,
-          D,
-          P.data<float>(),
-          T.data<int>(),
-          dX->template mutable_data<float>());
+          N, D, P.data<float>(), T.data<int>(), dX->mutable_data<float>());
     } else {
       // Weighted version gets the Pdata values internally
       LabelCrossEntropyGradientKernelWeighted<<<
@@ -540,7 +536,7 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
           D,
           P.data<float>(),
           T.data<int>(),
-          dX->template mutable_data<float>(),
+          dX->mutable_data<float>(),
           weights);
     }
   } else {
@@ -553,7 +549,7 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
         D,
         P.data<float>(),
         T.data<float>(),
-        dX->template mutable_data<float>(),
+        dX->mutable_data<float>(),
         weights);
   }
   float total_weight = N;
@@ -575,14 +571,14 @@ bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
         dX->size(),
         scale_ / total_weight,
         dX->data<float>(),
-        dX->template mutable_data<float>(),
+        dX->mutable_data<float>(),
         &context_);
   }
   math::Scale<float, CUDAContext>(
       dX->size(),
       d_avg_loss.data<float>(),
       dX->data<float>(),
-      dX->template mutable_data<float>(),
+      dX->mutable_data<float>(),
       &context_);
 
   return true;
@@ -624,14 +620,14 @@ bool SpatialSoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
   }
 
   const float* Pdata = P.data<float>();
-  float* dX_data = dX->template mutable_data<float>();
+  float* dX_data = dX->mutable_data<float>();
   const int* label_data = T.data<int>();
   const float* d_avg_loss_data = d_avg_loss.data<float>();
 
   // Copy softmax probabilities into dX. All but the neuron
   // corresponding to the correct label has gradient equaling e(x_j)
   // which is the probability under softmax.
-  context_.CopySameDevice<float>(P.size(), Pdata, dX_data);
+  context_.Copy<float, CUDAContext, CUDAContext>(P.size(), Pdata, dX_data);
 
   math::Set<float, CUDAContext>(
       1, 0.0f, total_weight_ptr_.mutable_data<float>(), &context_);
@@ -665,14 +661,14 @@ bool SpatialSoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
         dX->size(),
         scale_ / h_total_weight,
         dX->data<float>(),
-        dX->template mutable_data<float>(),
+        dX->mutable_data<float>(),
         &context_);
   }
   math::Scale<float, CUDAContext>(
       dX->size(),
       d_avg_loss.data<float>(),
       dX->data<float>(),
-      dX->template mutable_data<float>(),
+      dX->mutable_data<float>(),
       &context_);
 
   return true;
diff --git a/caffe2/operators/softmax_shared.cc b/caffe2/operators/softmax_shared.cc
index c1b376187937a..14e823c1f9c19 100644
--- a/caffe2/operators/softmax_shared.cc
+++ b/caffe2/operators/softmax_shared.cc
@@ -16,7 +16,7 @@ void SoftmaxCPU(
     float* rowmax) {
   math::RowwiseMax<float, CPUContext>(N, D, Xdata, rowmax, &context);
   // Put the intermediate result X - max(X) into Y
-  context.template CopyFromCPU<float>(N * D, Xdata, Ydata);
+  context.template Copy<float, CPUContext, CPUContext>(N * D, Xdata, Ydata);
   // Subtract the max (for numerical reasons)
   math::Gemm<float, CPUContext>(
       CblasNoTrans,
diff --git a/caffe2/operators/softmax_with_loss_op.cc b/caffe2/operators/softmax_with_loss_op.cc
index e2ea869528b96..32cb2cec3c9ab 100644
--- a/caffe2/operators/softmax_with_loss_op.cc
+++ b/caffe2/operators/softmax_with_loss_op.cc
@@ -169,7 +169,7 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
         D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
 
-  float* Pdata = P->template mutable_data<float>();
+  float* Pdata = P->mutable_data<float>();
   const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
 
   if (label_prob_mode_) {
@@ -253,7 +253,7 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
   }
 
   avg_loss->Resize(vector<TIndex>());
-  float* avg_loss_data = avg_loss->template mutable_data<float>();
+  float* avg_loss_data = avg_loss->mutable_data<float>();
   if (weight_sum != 0.0) {
     avg_loss_data[0] = loss_sum * scale_ / weight_sum;
   } else {
@@ -292,12 +292,12 @@ bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
   }
 
   const float* Pdata = P.data<float>();
-  float* dX_data = dX->template mutable_data<float>();
+  float* dX_data = dX->mutable_data<float>();
 
   // Copy softmax probabilities into dX. All but the neuron
   // corresponding to the correct label has gradient equaling e(x_j)
   // which is the probability under softmax.
-  context_.CopyFromCPU<float>(P.size(), Pdata, dX_data);
+  context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
 
   // Compute gradient for the matching labels.
   float total_weight = 0.0f;
diff --git a/caffe2/operators/softmax_with_loss_op.h b/caffe2/operators/softmax_with_loss_op.h
index 911fa9e58691b..27e6db2d76fc5 100644
--- a/caffe2/operators/softmax_with_loss_op.h
+++ b/caffe2/operators/softmax_with_loss_op.h
@@ -32,13 +32,12 @@ class SoftmaxWithLossOp final : public Operator<Context> {
   StorageOrder order_;
   int axis_;
 
-  Tensor losses_{Context::GetDeviceType()}; // Per example loss
-  Tensor rowmax_{Context::GetDeviceType()}; // per example row max
-  Tensor weights_{Context::GetDeviceType()}; // unignored weights
-  Tensor sum_multiplier_{
-      Context::GetDeviceType()}; // Vector of ones for summing via dot prod
-  Tensor total_weight_ptr_{Context::GetDeviceType()};
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> losses_; // Per example loss
+  Tensor<Context> rowmax_; // per example row max
+  Tensor<Context> weights_; // unignored weights
+  Tensor<Context> sum_multiplier_; // Vector of ones for summing via dot prod
+  Tensor<Context> total_weight_ptr_;
+  Tensor<Context> scratch_;
 };
 
 template <typename T, class Context>
@@ -63,13 +62,13 @@ class SoftmaxWithLossGradientOp final : public Operator<Context> {
  protected:
   float scale_;
   int label_prob_mode_;
-  Tensor sum_multiplier_{Context::GetDeviceType()};
-  Tensor weights_{Context::GetDeviceType()}; // unignored weights
-  Tensor total_weight_ptr_{Context::GetDeviceType()};
+  Tensor<Context> sum_multiplier_;
+  Tensor<Context> weights_; // unignored weights
+  Tensor<Context> total_weight_ptr_;
   StorageOrder order_;
   bool only_loss_;
   int axis_;
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/softplus_op.cc b/caffe2/operators/softplus_op.cc
index bba4f461553b1..7d2efd578560a 100644
--- a/caffe2/operators/softplus_op.cc
+++ b/caffe2/operators/softplus_op.cc
@@ -11,7 +11,7 @@ bool SoftplusOp<float, CPUContext>::RunOnDevice() {
   auto* Y = Output(0);
   Y->ResizeLike(X);
 
-  EigenVectorMap<float>(Y->template mutable_data<float>(), X.size()) =
+  EigenVectorMap<float>(Y->mutable_data<float>(), X.size()) =
       (ConstEigenVectorMap<float>(X.data<float>(), X.size()).array().exp() +
        1.0f)
           .log();
@@ -28,7 +28,7 @@ bool SoftplusGradientOp<float, CPUContext>::RunOnDevice() {
 
   const float* Ydata = Y.data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   EigenVectorArrayMap<float> dXvec(dXdata, dX->size());
   ConstEigenVectorArrayMap<float> Yvec(Ydata, Y.size());
   ConstEigenVectorArrayMap<float> dYvec(dYdata, dY.size());
diff --git a/caffe2/operators/softplus_op.cu b/caffe2/operators/softplus_op.cu
index 3aefb03a5850f..7e542f5a9b7c8 100644
--- a/caffe2/operators/softplus_op.cu
+++ b/caffe2/operators/softplus_op.cu
@@ -31,7 +31,7 @@ bool SoftplusOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->template mutable_data<float>());
+      X.size(), X.data<float>(), Y->mutable_data<float>());
   return true;
 }
 
@@ -48,10 +48,7 @@ bool SoftplusGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      Y.size(),
-      Y.data<float>(),
-      dY.data<float>(),
-      dX->template mutable_data<float>());
+      Y.size(), Y.data<float>(), dY.data<float>(), dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/space_batch_op.h b/caffe2/operators/space_batch_op.h
index 4690b503c52a8..4f42dc0e94838 100644
--- a/caffe2/operators/space_batch_op.h
+++ b/caffe2/operators/space_batch_op.h
@@ -10,11 +10,11 @@ namespace caffe2 {
 
 template <typename Context>
 void spaceToBatch(
-    const Tensor& input,
+    const Tensor<Context>& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor* output,
+    Tensor<Context>* output,
     Context* /*context*/) {
   CAFFE_ENFORCE(input.ndim() == 4);
   CAFFE_ENFORCE(output->ndim() == 4);
@@ -60,11 +60,11 @@ void spaceToBatch(
 
 template <typename Context>
 void batchToSpace(
-    const Tensor& input,
+    const Tensor<Context>& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor* output,
+    Tensor<Context>* output,
     Context* /*context*/) {
   CAFFE_ENFORCE(input.ndim() == 4);
   CAFFE_ENFORCE(output->ndim() == 4);
diff --git a/caffe2/operators/space_batch_op_gpu.cu b/caffe2/operators/space_batch_op_gpu.cu
index e9018c8a0e74e..862440907fcf8 100644
--- a/caffe2/operators/space_batch_op_gpu.cu
+++ b/caffe2/operators/space_batch_op_gpu.cu
@@ -48,13 +48,13 @@ __global__ void SpaceToBatch(
   }
 }
 
-template <>
+template<>
 void spaceToBatch<CUDAContext>(
-    const Tensor& input,
+    const Tensor<CUDAContext>& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor* output,
+    Tensor<CUDAContext>* output,
     CUDAContext* context) {
   const int output_batch = output->dim32(0);
   const int output_depth = output->dim32(1);
@@ -84,7 +84,7 @@ void spaceToBatch<CUDAContext>(
       pad_t,
       block_size,
       input.data<float>(),
-      output->template mutable_data<float>());
+      output->mutable_data<float>());
 }
 
 
@@ -133,11 +133,11 @@ __global__ void BatchToSpace(
 
 template <>
 void batchToSpace(
-    const Tensor& input,
+    const Tensor<CUDAContext>& input,
     int pad_t,
     int pad_l,
     int block_size,
-    Tensor* output,
+    Tensor<CUDAContext>* output,
     CUDAContext* context) {
   CAFFE_ENFORCE(input.ndim() == 4);
   CAFFE_ENFORCE(output->ndim() == 4);
@@ -170,7 +170,7 @@ void batchToSpace(
       pad_t,
       block_size,
       input.data<float>(),
-      output->template mutable_data<float>());
+      output->mutable_data<float>());
 }
 
 REGISTER_CUDA_OPERATOR(SpaceToBatch, SpaceToBatchOp<CUDAContext>);
diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h
index 9e2da09fe65a6..5ea10d17c3f29 100644
--- a/caffe2/operators/sparse_to_dense_mask_op.h
+++ b/caffe2/operators/sparse_to_dense_mask_op.h
@@ -104,7 +104,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase<Context> {
     int32_t sparse_indices_length = sparse_indices.dim32(0);
     const int32_t* lengths_vec = nullptr;
     auto* output = Output(OUTPUTVALUE);
-    Tensor* presence_mask = nullptr;
+    Tensor<Context>* presence_mask = nullptr;
     if (returnPresenceMask_) {
       presence_mask = Output(PRESENCEMASK);
     }
@@ -135,7 +135,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase<Context> {
     char* output_data =
         static_cast<char*>(output->raw_mutable_data(sparse_values.meta()));
     for (int i = 0; i < cols * rows; i++) {
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           default_value.meta(),
           block_size,
           default_val,
@@ -162,7 +162,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase<Context> {
         }
         int idx = this->getFeatureIdx(sparse_index);
         if (idx != -1) {
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               sparse_values.meta(),
               block_size,
               sparse_values_vec + (offset + c) * block_nbytes,
@@ -266,7 +266,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase<Context> {
         int idx = this->getFeatureIdx(sparse_indices_vec[offset + c]);
         if (idx != -1 && !gradient_used[idx]) {
           gradient_used[idx] = true;
-          context_.CopyItemsSameDevice(
+          context_.template CopyItems<Context, Context>(
               gradient_output.meta(),
               block_size,
               gradient_output_vec + (r * cols + idx) * block_nbytes,
diff --git a/caffe2/operators/sparse_to_dense_op.h b/caffe2/operators/sparse_to_dense_op.h
index 6a9f2fddb9943..7fbfa38518c9f 100644
--- a/caffe2/operators/sparse_to_dense_op.h
+++ b/caffe2/operators/sparse_to_dense_op.h
@@ -110,9 +110,9 @@ class SparseToDenseOp final : public Operator<Context> {
 
  private:
   int output_first_dim_;
-  Tensor scratch_{Context::GetDeviceType()};
-  Tensor max_element_host_{CPU};
-  Tensor max_element_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
+  Tensor<CPUContext> max_element_host_;
+  Tensor<Context> max_element_;
 
   INPUT_TAGS(INDICES, VALUES, DATA_TO_INFER_DIM);
 };
diff --git a/caffe2/operators/spatial_batch_norm_gradient_op.cc b/caffe2/operators/spatial_batch_norm_gradient_op.cc
index 5a9d55341f27e..dd5434db725a7 100644
--- a/caffe2/operators/spatial_batch_norm_gradient_op.cc
+++ b/caffe2/operators/spatial_batch_norm_gradient_op.cc
@@ -48,10 +48,8 @@ bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
   // dX = (1. / N) * scale * inv_var * (N * dY - np.sum(dY, axis=0) - (X - mean)
   //   * inv_var * inv_var * np.sum(dY * (X - mean), axis=0))
 
-  EigenVectorArrayMap<float> dBias_arr(
-      dBias->template mutable_data<float>(), C);
-  EigenVectorArrayMap<float> dScale_arr(
-      dScale->template mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dBias_arr(dBias->mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dScale_arr(dScale->mutable_data<float>(), C);
 
   if (num_batches_ == 1) {
     dBias_arr.setZero();
@@ -65,7 +63,7 @@ bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
       ConstEigenArrayMap<float> X_arr(X.data<float>(), sample_size, N * C);
       ConstEigenArrayMap<float> dY_arr(dY.data<float>(), sample_size, N * C);
       EigenArrayMap<float> dX_arr(
-          dX->template mutable_data<float>(), sample_size, N * C);
+          dX->mutable_data<float>(), sample_size, N * C);
       dX_arr.setZero();
       if (N == 0) {
         return true;
@@ -96,7 +94,7 @@ bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
       ConstEigenArrayMap<float> X_arr(X.data<float>(), C, N * sample_size);
       ConstEigenArrayMap<float> dY_arr(dY.data<float>(), C, N * sample_size);
       EigenArrayMap<float> dX_arr(
-          dX->template mutable_data<float>(), C, N * sample_size);
+          dX->mutable_data<float>(), C, N * sample_size);
       dX_arr.setZero();
       if (N == 0) {
         return true;
diff --git a/caffe2/operators/spatial_batch_norm_op.cc b/caffe2/operators/spatial_batch_norm_op.cc
index 09f2b04fd9f25..671493a1df010 100644
--- a/caffe2/operators/spatial_batch_norm_op.cc
+++ b/caffe2/operators/spatial_batch_norm_op.cc
@@ -45,7 +45,7 @@ bool SpatialBNOp<CPUContext>::RunOnDevice() {
     Output(SAVED_MEAN)->Resize(C);
     Output(SAVED_INV_VAR)->Resize(C);
     EigenVectorArrayMap<float> mean(
-        Output(SAVED_MEAN)->template mutable_data<float>(), C);
+        Output(SAVED_MEAN)->mutable_data<float>(), C);
     EigenVectorArrayMap<float> var(
         Output(SAVED_INV_VAR)->mutable_data<float>(), C);
     if (N > 0) {
@@ -131,7 +131,7 @@ bool SpatialBNOp<CPUContext>::RunOnDevice() {
     inv_std = (var_arr + epsilon_).sqrt().inverse();
   } else {
     EigenVectorArrayMap<float> saved_inv_std(
-        Output(SAVED_INV_VAR)->template mutable_data<float>(), C);
+        Output(SAVED_INV_VAR)->mutable_data<float>(), C);
     saved_inv_std = (saved_inv_std + epsilon_).inverse().sqrt();
     inv_std = saved_inv_std;
   }
diff --git a/caffe2/operators/spatial_softmax_with_loss_op.cc b/caffe2/operators/spatial_softmax_with_loss_op.cc
index 02779fa598bf6..1288bc7d90554 100644
--- a/caffe2/operators/spatial_softmax_with_loss_op.cc
+++ b/caffe2/operators/spatial_softmax_with_loss_op.cc
@@ -14,33 +14,33 @@ REGISTER_CPU_OPERATOR(
 OPERATOR_SCHEMA(SpatialSoftmaxWithLoss)
     .NumInputs(2, 3)
     .NumOutputs(2)
-    .TensorInferenceFunction([](const OperatorDef& def,
-                                const vector<TensorShape>& in) {
-      ArgumentHelper helper(def);
-      vector<TensorShape> out(2);
+    .TensorInferenceFunction(
+        [](const OperatorDef& def, const vector<TensorShape>& in) {
+          ArgumentHelper helper(def);
+          vector<TensorShape> out(2);
 
-      auto logits = in[0]; // Tensor with Shape [batch_size, num_classes]
-      auto labels = in[1]; // Tensor with shape [batch_size, ]
-      auto batch_size = logits.dims().Get(0);
-      auto num_classes = logits.dims().Get(1);
+          auto logits = in[0]; // Tensor with Shape [batch_size, num_classes]
+          auto labels = in[1]; // Tensor with shape [batch_size, ]
+          auto batch_size = logits.dims().Get(0);
+          auto num_classes = logits.dims().Get(1);
 
-      CAFFE_ENFORCE_EQ(logits.dims_size(), 4);
-      CAFFE_ENFORCE_EQ(labels.dims_size(), 3);
-      out[0].set_data_type(logits.data_type());
-      out[0].add_dims(batch_size);
-      out[0].add_dims(num_classes);
-      out[0].add_dims(in[0].dims(2));
-      out[0].add_dims(in[0].dims(3));
-      // Output 2 is scalar shape, so no dims added
-      return out;
-    })
+          CAFFE_ENFORCE_EQ(logits.dims_size(), 4);
+          CAFFE_ENFORCE_EQ(labels.dims_size(), 3);
+          out[0].set_data_type(logits.data_type());
+          out[0].add_dims(batch_size);
+          out[0].add_dims(num_classes);
+          out[0].add_dims(in[0].dims(2));
+          out[0].add_dims(in[0].dims(3));
+          // Output 2 is scalar shape, so no dims added
+          return out;
+        })
     .SetDoc(R"DOC(
 Combined Spatial Softmax and Cross-Entropy loss operator.
 Similar to SoftmaxWithLoss, this operator computes the spatial softmax
 normalized values for each layer in the batch of the given input, after which
 cross-entropy loss is computed. This operator is numerically more stable than
 separate Softmax and CrossEntropy ops. The inputs are a 2-D tensor
-(Tensor) of size (batch_size x input_feature_dimensions) and tensor of
+(Tensor<float>) of size (batch_size x input_feature_dimensions) and tensor of
 labels (ground truth).
 Output is tensor with the probability for each label in a pixel for each example
 (N x D x W x H) and averaged loss (scalar).
@@ -78,7 +78,7 @@ bool SpatialSoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
         D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
 
-  float* Pdata = P->template mutable_data<float>();
+  float* Pdata = P->mutable_data<float>();
   const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
   CAFFE_ENFORCE_EQ(X.ndim(), 4);
   CAFFE_ENFORCE_EQ(T.ndim(), 3);
@@ -120,7 +120,7 @@ bool SpatialSoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
 
   // Compute the avg cross-entropy loss
   avg_loss->Resize(vector<TIndex>());
-  float* avg_loss_data = avg_loss->template mutable_data<float>();
+  float* avg_loss_data = avg_loss->mutable_data<float>();
   const int* label_data = T.data<int>();
 
   float sum_label_xent = 0.0f;
@@ -175,13 +175,13 @@ bool SpatialSoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
   int W = X.dim32(3);
 
   const float* Pdata = P.data<float>();
-  float* dX_data = dX->template mutable_data<float>();
+  float* dX_data = dX->mutable_data<float>();
   const int* label_data = T.data<int>();
 
   // Copy softmax probabilities into dX. All but the neuron
   // corresponding to the correct label has gradient equaling e(x_j)
   // which is the probability under softmax.
-  context_.CopyFromCPU<float>(P.size(), Pdata, dX_data);
+  context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
 
   float total_weight = 0.0f;
   for (int y = 0; y < H; ++y) {
@@ -228,7 +228,7 @@ bool SpatialSoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
       dX->size(),
       d_avg_loss.data<float>(),
       dX->data<float>(),
-      dX->template mutable_data<float>(),
+      dX->mutable_data<float>(),
       &context_);
   return true;
 }
diff --git a/caffe2/operators/spatial_softmax_with_loss_op.h b/caffe2/operators/spatial_softmax_with_loss_op.h
index 0c1d69087e681..d466063d45c1f 100644
--- a/caffe2/operators/spatial_softmax_with_loss_op.h
+++ b/caffe2/operators/spatial_softmax_with_loss_op.h
@@ -28,13 +28,12 @@ class SpatialSoftmaxWithLossOp final : public Operator<Context> {
   float scale_;
   StorageOrder order_;
 
-  Tensor losses_{Context::GetDeviceType()}; // Per example loss
-  Tensor rowmax_{Context::GetDeviceType()}; // per example row max
-  Tensor weights_{Context::GetDeviceType()}; // unignored weights
-  Tensor sum_multiplier_{
-      Context::GetDeviceType()}; // Vector of ones for summing via dot prod
-  Tensor total_weight_ptr_{Context::GetDeviceType()};
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> losses_; // Per example loss
+  Tensor<Context> rowmax_; // per example row max
+  Tensor<Context> weights_; // unignored weights
+  Tensor<Context> sum_multiplier_; // Vector of ones for summing via dot prod
+  Tensor<Context> total_weight_ptr_;
+  Tensor<Context> scratch_;
 };
 
 template <typename T, class Context>
@@ -56,12 +55,12 @@ class SpatialSoftmaxWithLossGradientOp final : public Operator<Context> {
 
  protected:
   float scale_;
-  Tensor sum_multiplier_{Context::GetDeviceType()};
-  Tensor weights_{Context::GetDeviceType()}; // unignored weights
-  Tensor total_weight_ptr_{Context::GetDeviceType()};
+  Tensor<Context> sum_multiplier_;
+  Tensor<Context> weights_; // unignored weights
+  Tensor<Context> total_weight_ptr_;
   StorageOrder order_;
   bool only_loss_;
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/stats_ops.cc b/caffe2/operators/stats_ops.cc
index 508dd1ae82060..64a0c1a888800 100644
--- a/caffe2/operators/stats_ops.cc
+++ b/caffe2/operators/stats_ops.cc
@@ -35,9 +35,9 @@ class StatRegistryExportOp : public Operator<CPUContext> {
     keys->Resize(data.size());
     values->Resize(data.size());
     timestamps->Resize(data.size());
-    auto* pkeys = keys->template mutable_data<std::string>();
-    auto* pvals = values->template mutable_data<int64_t>();
-    auto* ptimestamps = timestamps->template mutable_data<int64_t>();
+    auto* pkeys = keys->mutable_data<std::string>();
+    auto* pvals = values->mutable_data<int64_t>();
+    auto* ptimestamps = timestamps->mutable_data<int64_t>();
     int i = 0;
     for (const auto& stat : data) {
       pkeys[i] = std::move(stat.key);
@@ -153,7 +153,7 @@ struct TimerGetAndEndOp : public Operator<CPUContext> {
   bool RunOnDevice() override {
     int64_t nanos = OperatorBase::Input<TimerInstance*>(0)->get_ns();
     OperatorBase::Input<TimerInstance*>(0)->end();
-    auto* res = Output(0);
+    auto* res = OperatorBase::Output<TensorCPU>(0);
     res->Resize(1);
     res->template mutable_data<int64_t>()[0] = nanos;
     return true;
@@ -166,7 +166,7 @@ struct TimerGetOp : public Operator<CPUContext> {
 
   bool RunOnDevice() override {
     int64_t nanos = OperatorBase::Input<TimerInstance*>(0)->get_ns();
-    auto* res = Output(0);
+    auto* res = OperatorBase::Output<TensorCPU>(0);
     res->Resize();
     res->template mutable_data<int64_t>()[0] = nanos;
     return true;
diff --git a/caffe2/operators/string_ops.cc b/caffe2/operators/string_ops.cc
index 672ca24d073e3..819bb6a6c5b09 100644
--- a/caffe2/operators/string_ops.cc
+++ b/caffe2/operators/string_ops.cc
@@ -15,7 +15,7 @@ bool StringJoinOp<CPUContext>::DoRunWithType() {
   int rowSize = (input.ndim() == 2) ? input.dim(1) : 1;
   if (this->axis_ == 0) {
     output->Resize(input.dim(0));
-    auto* outputData = output->template mutable_data<std::string>();
+    auto* outputData = output->mutable_data<std::string>();
 
     int offset = 0;
     for (int i = 0; i < input.dim(0); ++i) {
@@ -29,7 +29,7 @@ bool StringJoinOp<CPUContext>::DoRunWithType() {
     }
   } else if (this->axis_ == 1) {
     output->Resize(input.dim(1));
-    auto* outputData = output->template mutable_data<std::string>();
+    auto* outputData = output->mutable_data<std::string>();
 
     for (int j = 0; j < input.dim(1); ++j) {
       std::stringstream stream;
diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc
index ece70ffd2425e..3d6fb4720ddb2 100644
--- a/caffe2/operators/string_ops_test.cc
+++ b/caffe2/operators/string_ops_test.cc
@@ -9,7 +9,7 @@ class StringJoinOpTest : public testing::Test {
  public:
   bool runOp(const TensorCPU& input) {
     auto* blob = ws_.CreateBlob("X");
-    auto* tensor = blob->GetMutableTensor(CPU);
+    auto* tensor = blob->GetMutable<TensorCPU>();
     tensor->ResizeLike(input);
     tensor->ShareData(input);
 
@@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test {
   const std::string* checkAndGetOutput(int outputSize) {
     const auto* output = ws_.GetBlob("Y");
     EXPECT_NE(output, nullptr);
-    EXPECT_TRUE(output->IsType<Tensor>(CPU));
+    EXPECT_TRUE(output->IsType<TensorCPU>());
     const auto& outputTensor = output->Get<TensorCPU>();
     EXPECT_EQ(outputTensor.ndim(), 1);
     EXPECT_EQ(outputTensor.dim(0), outputSize);
@@ -42,9 +42,9 @@ TEST_F(StringJoinOpTest, testString1DJoin) {
   std::vector<std::string> input = {"a", "xx", "c"};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(input.size());
-  auto* data = tensor->template mutable_data<std::string>();
+  auto* data = tensor->mutable_data<std::string>();
   for (int i = 0; i < input.size(); ++i) {
     *data++ = input[i];
   }
@@ -62,9 +62,9 @@ TEST_F(StringJoinOpTest, testString2DJoin) {
                                                  {"dd", "ee", "ff"}};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(input.size(), input[0].size());
-  auto* data = tensor->template mutable_data<std::string>();
+  auto* data = tensor->mutable_data<std::string>();
   for (int i = 0; i < input.size(); ++i) {
     for (int j = 0; j < input[0].size(); ++j) {
       *data++ = input[i][j];
@@ -82,9 +82,9 @@ TEST_F(StringJoinOpTest, testFloat1DJoin) {
   std::vector<float> input = {3.90f, 5.234f, 8.12f};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(input.size());
-  auto* data = tensor->template mutable_data<float>();
+  auto* data = tensor->mutable_data<float>();
   for (int i = 0; i < input.size(); ++i) {
     *data++ = input[i];
   }
@@ -102,9 +102,9 @@ TEST_F(StringJoinOpTest, testFloat2DJoin) {
                                            {4.67f, 5.90f, 6.32f}};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(input.size(), input[0].size());
-  auto* data = tensor->template mutable_data<float>();
+  auto* data = tensor->mutable_data<float>();
   for (int i = 0; i < input.size(); ++i) {
     for (int j = 0; j < input[0].size(); ++j) {
       *data++ = input[i][j];
@@ -122,9 +122,9 @@ TEST_F(StringJoinOpTest, testLong2DJoin) {
   std::vector<std::vector<int64_t>> input = {{100, 200}, {1000, 2000}};
 
   auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(input.size(), input[0].size());
-  auto* data = tensor->template mutable_data<int64_t>();
+  auto* data = tensor->mutable_data<int64_t>();
   for (int i = 0; i < input.size(); ++i) {
     for (int j = 0; j < input[0].size(); ++j) {
       *data++ = input[i][j];
diff --git a/caffe2/operators/stump_func_op.cu b/caffe2/operators/stump_func_op.cu
index 9e38da2bcebd0..2ea3108e73ad3 100644
--- a/caffe2/operators/stump_func_op.cu
+++ b/caffe2/operators/stump_func_op.cu
@@ -42,7 +42,7 @@ bool StumpFuncOp<float, float, CUDAContext>::RunOnDevice() {
   const float* in_data = in.data<float>();
   auto* out = Output(0);
   out->ResizeLike(in);
-  float* out_data = out->template mutable_data<float>();
+  float* out_data = out->mutable_data<float>();
   StumpFuncKernel<<<CAFFE_GET_BLOCKS(in.size()), CAFFE_CUDA_NUM_THREADS,
     0, context_.cuda_stream()>>>(
       in.size(), threshold_, low_value_, high_value_, in_data, out_data);
diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc
index 8f1e0895a2859..ca4a762587bfc 100644
--- a/caffe2/operators/stylizer_ops.cc
+++ b/caffe2/operators/stylizer_ops.cc
@@ -82,10 +82,10 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
     auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
         "noise_size", 491 /* prime to avoid artifacts */);
 
-    if (!noiseBlob->IsType<Tensor>(CPU)) {
+    if (!noiseBlob->IsType<TensorCPU>()) {
       // Initialize random noise on first use.
       // Cache it to maintain temporal consistency.
-      auto* t = noiseBlob->GetMutableTensor(CPU);
+      auto* t = noiseBlob->template GetMutable<TensorCPU>();
 
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
       // Noise space is larger for vectorized code due to the
@@ -115,13 +115,13 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
         X.data<uint8_t>(),
         mean.data<float>(),
         noise.data<float>(),
-        Y->template mutable_data<float>());
+        Y->mutable_data<float>());
 
     return true;
   }
 
 #if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
-  void initNoiseCPU(Tensor* noise, int size) {
+  void initNoiseCPU(Tensor<CPUContext>* noise, int size) {
     noise->Resize(size);
 
     math::RandGaussian<float, CPUContext>(
@@ -134,7 +134,7 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
 #endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)
 
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
-  void initNoiseCPUNeon(Tensor* noise, int size) {
+  void initNoiseCPUNeon(Tensor<CPUContext>* noise, int size) {
     // For ARM NEON, we read in multiples of kNeonNoiseReadSize since
     // the inner loop is vectorized. Round up to the next highest
     // multiple of kNeonNoiseReadSize
@@ -429,7 +429,7 @@ class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
         W,
         X.data<float>(),
         mean.data<float>(),
-        Y->template mutable_data<uint8_t>());
+        Y->mutable_data<uint8_t>());
 
     return true;
   }
diff --git a/caffe2/operators/summarize_op.cc b/caffe2/operators/summarize_op.cc
index 5ba3b4447bcca..6ae7ed0bfc43d 100644
--- a/caffe2/operators/summarize_op.cc
+++ b/caffe2/operators/summarize_op.cc
@@ -33,7 +33,7 @@ bool SummarizeOp<float, CPUContext>::RunOnDevice() {
   if (OutputSize()) {
     auto* Y = Output(0);
     Y->Resize(NUM_STATS);
-    float* Ydata = Y->template mutable_data<float>();
+    float* Ydata = Y->mutable_data<float>();
     Ydata[MIN_IDX] = min;
     Ydata[MAX_IDX] = max;
     Ydata[MEAN_IDX] = static_cast<float>(mean);
@@ -50,7 +50,7 @@ OPERATOR_SCHEMA(Summarize)
     .NumInputs(1)
     .NumOutputs(0, 1)
     .SetDoc(R"DOC(
-Summarize computes four statistics of the input tensor (Tensor)- min,
+Summarize computes four statistics of the input tensor (Tensor<float>)- min,
 max, mean and standard deviation. The output will be written to a 1-D tensor of
 size 4 if an output tensor is provided. Else, if the argument 'to_file' is
 greater than 0, the values are written to a log file in the root folder.
@@ -59,11 +59,11 @@ greater than 0, the values are written to a log file in the root folder.
         "to_file",
         "(int, default 0) flag to indicate if the summarized "
         "statistics have to be written to a log file.")
-    .Input(0, "data", "The input data as Tensor.")
+    .Input(0, "data", "The input data as Tensor<float>.")
     .Output(
         0,
         "output",
-        "1-D tensor (Tensor) of size 4 containing min, "
+        "1-D tensor (Tensor<float>) of size 4 containing min, "
         "max, mean and standard deviation");
 
 SHOULD_NOT_DO_GRADIENT(Summarize);
diff --git a/caffe2/operators/summarize_op.cu b/caffe2/operators/summarize_op.cu
index 13c1a1b8793e1..89dd4c01003df 100644
--- a/caffe2/operators/summarize_op.cu
+++ b/caffe2/operators/summarize_op.cu
@@ -96,12 +96,12 @@ bool SummarizeOp<float, CUDAContext>::RunOnDevice() {
                  << standard_deviation << std::endl;
   }
   if (OutputSize()) {
-    auto* Y = Output(0);
+    auto* Y = OperatorBase::Output<TensorCUDA>(0);
     Y->Resize(4);
     float output_buffer[NUM_STATS] = {result.min, result.max, result.mean,
                                standard_deviation};
-    context_.CopyFromCPU<float>(
-        NUM_STATS, output_buffer, Y->template mutable_data<float>());
+    context_.Copy<float, CPUContext, CUDAContext>(
+        NUM_STATS, output_buffer, Y->mutable_data<float>());
   }
   return true;
 }
diff --git a/caffe2/operators/swish_op.cc b/caffe2/operators/swish_op.cc
index f68b86c3f0d85..a636d23d85f7c 100644
--- a/caffe2/operators/swish_op.cc
+++ b/caffe2/operators/swish_op.cc
@@ -58,8 +58,8 @@ OPERATOR_SCHEMA(Swish)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .SetDoc(R"DOC(
-Swish takes one input data (Tensor) and produces one output data
-(Tensor) where the swish function, y = x / (1 + exp(-x)), is applied to the
+Swish takes one input data (Tensor<T>) and produces one output data
+(Tensor<T>) where the swish function, y = x / (1 + exp(-x)), is applied to the
 tensor elementwise.
 )DOC")
     .Input(0, "X", "1D input tensor")
diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h
index f8ff48588f30d..e9e55b8fa7876 100644
--- a/caffe2/operators/tensor_protos_db_input.h
+++ b/caffe2/operators/tensor_protos_db_input.h
@@ -43,7 +43,7 @@ TensorProtosDBInput<Context>::TensorProtosDBInput(
 template <class Context>
 bool TensorProtosDBInput<Context>::Prefetch() {
   const db::DBReader& reader = OperatorBase::Input<db::DBReader>(0);
-  TensorDeserializer deserializer;
+  TensorDeserializer<CPUContext> deserializer;
   if (batch_size_ == 0) {
     // We do not need to construct a batch. As a result, we will simply
     // deserialize everything into the target prefetched blob.
@@ -56,13 +56,11 @@ bool TensorProtosDBInput<Context>::Prefetch() {
         protos.mutable_protos(i)->clear_device_detail();
       }
       deserializer.Deserialize(
-          protos.protos(i), prefetched_blobs_[i].GetMutableTensor(CPU));
+          protos.protos(i),
+          prefetched_blobs_[i].template GetMutable<TensorCPU>());
     }
   } else {
-    vector<Tensor> temp_tensors;
-    for (int i = 0; i < OutputSize(); ++i) {
-      temp_tensors.emplace_back(CPU);
-    }
+    vector<TensorCPU> temp_tensors(OutputSize());
     for (int item_id = 0; item_id < batch_size_; ++item_id) {
       reader.Read(&key_, &value_);
       TensorProtos protos;
@@ -74,18 +72,18 @@ bool TensorProtosDBInput<Context>::Prefetch() {
           vector<int> dims(
               protos.protos(i).dims().begin(), protos.protos(i).dims().end());
           dims.insert(dims.begin(), batch_size_);
-          prefetched_blobs_[i].GetMutableTensor(CPU)->Resize(dims);
+          prefetched_blobs_[i].template GetMutable<TensorCPU>()->Resize(dims);
         }
       }
       for (int i = 0; i < protos.protos_size(); ++i) {
-        TensorCPU* dst = prefetched_blobs_[i].GetMutableTensor(CPU);
+        TensorCPU* dst = prefetched_blobs_[i].template GetMutable<TensorCPU>();
         TensorCPU& src = temp_tensors[i];
         if (protos.protos(i).has_device_detail()) {
           protos.mutable_protos(i)->clear_device_detail();
         }
         deserializer.Deserialize(protos.protos(i), &src);
         DCHECK_EQ(src.size() * batch_size_, dst->size());
-        this->context_.CopyItemsSameDevice(
+        this->context_.template CopyItems<CPUContext, CPUContext>(
             src.meta(),
             src.size(),
             src.raw_data(),
@@ -100,9 +98,8 @@ bool TensorProtosDBInput<Context>::Prefetch() {
 template <class Context>
 bool TensorProtosDBInput<Context>::CopyPrefetched() {
   for (int i = 0; i < OutputSize(); ++i) {
-    OperatorBase::template Output<Tensor>(i, Context::GetDeviceType())
-        ->CopyFrom(
-            prefetched_blobs_[i].template Get<TensorCPU>(), &this->context_);
+    OperatorBase::Output<Tensor<Context>>(i)->CopyFrom(
+        prefetched_blobs_[i].template Get<TensorCPU>(), &this->context_);
   }
   return true;
 }
diff --git a/caffe2/operators/thresholded_relu_op.cc b/caffe2/operators/thresholded_relu_op.cc
index bba4d496e65db..8b5e6b514478c 100644
--- a/caffe2/operators/thresholded_relu_op.cc
+++ b/caffe2/operators/thresholded_relu_op.cc
@@ -12,11 +12,11 @@ bool ThresholdedReluOp<float, CPUContext>::RunOnDevice() {
   Y->ResizeLike(X);
 
   ConstEigenVectorArrayMap<float> Xvec(X.data<float>(), X.size());
-  EigenVectorArrayMap<float> Yvec(Y->template mutable_data<float>(), Y->size());
+  EigenVectorArrayMap<float> Yvec(Y->mutable_data<float>(), Y->size());
   Yvec = (Xvec > alpha_).select(Xvec, 0.f);
   /* Naive implementation
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->template mutable_data<float>();
+  float* Ydata = Y->mutable_data<float>();
   for (int i = 0; i < X.size(); ++i) {
     Xdata[i] -= alpha_;
     Ydata[i] = std::max(Xdata[i], 0.0f);
@@ -35,7 +35,7 @@ bool ThresholdedReluGradientOp<float, CPUContext>::RunOnDevice() {
 
   const float* Ydata = Y.data<float>();
   const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  float* dXdata = dX->mutable_data<float>();
   EigenVectorArrayMap<float> dXvec(dXdata, dX->size());
   ConstEigenVectorArrayMap<float> Yvec(Ydata, Y.size());
   ConstEigenVectorArrayMap<float> dYvec(dYdata, dY.size());
diff --git a/caffe2/operators/thresholded_relu_op.cu b/caffe2/operators/thresholded_relu_op.cu
index 5a5027c7faed0..a12ee62d42b28 100644
--- a/caffe2/operators/thresholded_relu_op.cu
+++ b/caffe2/operators/thresholded_relu_op.cu
@@ -30,7 +30,7 @@ bool ThresholdedReluOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->template mutable_data<float>(), alpha_);
+      X.size(), X.data<float>(), Y->mutable_data<float>(), alpha_);
   return true;
 }
 
@@ -47,10 +47,7 @@ bool ThresholdedReluGradientOp<float, CUDAContext>::RunOnDevice() {
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
-      Y.size(),
-      Y.data<float>(),
-      dY.data<float>(),
-      dX->template mutable_data<float>());
+      Y.size(), Y.data<float>(), dY.data<float>(), dX->mutable_data<float>());
   return true;
 }
 
diff --git a/caffe2/operators/tile_op.h b/caffe2/operators/tile_op.h
index bde0b41ebe8b1..046aaa55f7a94 100644
--- a/caffe2/operators/tile_op.h
+++ b/caffe2/operators/tile_op.h
@@ -34,7 +34,7 @@ class TileOp : public Operator<Context> {
           "Input `tiles` should be a vector of size 1.");
 
       const auto& input1 = Input(1);
-      context_.CopyItemsToCPU(
+      context_.template CopyItems<Context, CPUContext>(
           input1.meta(),
           1,
           static_cast<const char*>(input1.raw_data()),
@@ -46,7 +46,7 @@ class TileOp : public Operator<Context> {
             "Input `axis` should be a vector of size 1.");
 
         const auto& input2 = Input(2);
-        context_.CopyItemsToCPU(
+        context_.template CopyItems<Context, CPUContext>(
             input2.meta(),
             1,
             static_cast<const char*>(input2.raw_data()),
@@ -114,7 +114,8 @@ class TileOp : public Operator<Context> {
       char* output_data) {
     for (auto i = 0; i < outer_dim; ++i) {
       for (auto t = 0; t < tiles_; ++t) {
-        context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data);
+        context_.template CopyItems<Context, Context>(
+            meta, inner_dim, input_data, output_data);
         output_data += inner_dim * item_size;
       }
       input_data += inner_dim * item_size;
@@ -148,7 +149,7 @@ class TileGradientOp : public Operator<Context> {
           "Input `tiles` should be a vector of size 1.");
 
       const auto& input1 = Input(1);
-      context_.CopyItemsToCPU(
+      context_.template CopyItems<Context, CPUContext>(
           input1.meta(),
           1,
           static_cast<const char*>(input1.raw_data()),
@@ -160,7 +161,7 @@ class TileGradientOp : public Operator<Context> {
             "Input `axis` should be a vector of size 1.");
 
         const auto& input2 = Input(2);
-        context_.CopyItemsToCPU(
+        context_.template CopyItems<Context, CPUContext>(
             input2.meta(),
             1,
             static_cast<const char*>(input2.raw_data()),
@@ -230,7 +231,8 @@ class TileGradientOp : public Operator<Context> {
       const char* input_data,
       char* output_data) {
     for (auto i = 0; i < outer_dim; ++i) {
-      context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data);
+      context_.template CopyItems<Context, Context>(
+          meta, inner_dim, input_data, output_data);
       input_data += inner_dim * item_size;
       for (auto t = 1; t < tiles_; ++t) {
         math::Axpy<T, Context>(
diff --git a/caffe2/operators/top_k.cu b/caffe2/operators/top_k.cu
index 6562b7fa5030f..ddcb7c65d52a1 100644
--- a/caffe2/operators/top_k.cu
+++ b/caffe2/operators/top_k.cu
@@ -166,18 +166,18 @@ class TopKCudaOp : public Operator<Context> {
   int axis_;
 
   // Buffers for CUDAContext.
-  Tensor input_transposed_buffer_{CUDA};
-  Tensor values_transposed_buffer_{CUDA};
-  Tensor indices_transposed_buffer_{CUDA};
+  Tensor<Context> input_transposed_buffer_;
+  Tensor<Context> values_transposed_buffer_;
+  Tensor<Context> indices_transposed_buffer_;
 
   // Shape tensors on device for CUDAContext.
-  Tensor input_dims_device_{CUDA};
-  Tensor input_transposed_dims_device_{CUDA};
-  Tensor input_axes_device_{CUDA};
+  Tensor<Context> input_dims_device_;
+  Tensor<Context> input_transposed_dims_device_;
+  Tensor<Context> input_axes_device_;
 
-  Tensor output_dims_device_{CUDA};
-  Tensor output_transposed_dims_device_{CUDA};
-  Tensor output_transposed_axes_device_{CUDA};
+  Tensor<Context> output_dims_device_;
+  Tensor<Context> output_transposed_dims_device_;
+  Tensor<Context> output_transposed_axes_device_;
 };
 
 template <typename T, typename Context>
diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h
index 6586b014689de..13196bf3761b7 100644
--- a/caffe2/operators/tt_linear_op.h
+++ b/caffe2/operators/tt_linear_op.h
@@ -52,7 +52,7 @@ class TTLinearOp final : public Operator<Context> {
     int cores_idx = 0;
 
     // Temporary buffer to facilitate multiplication of TT-cores with input
-    auto Y_buf = Y_temp_->GetMutableTensor(Context::GetDeviceType());
+    auto Y_buf = Y_temp_->GetMutable<Tensor<Context>>();
     Y_buf->ResizeLike(X);
     Y_buf->CopyFrom(X);
 
@@ -104,7 +104,7 @@ class TTLinearOp final : public Operator<Context> {
 
       // Resize operation
       Y_buf->Resize(Y->dim32(0), Y->dim32(1));
-      context_.template CopyFromCPU<float>(
+      context_.template Copy<float, CPUContext, CPUContext>(
           Y->size(),
           Y->template data<float>(),
           Y_buf->template mutable_data<float>());
@@ -160,7 +160,7 @@ class TTLinearOp final : public Operator<Context> {
   }
 
  protected:
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
   std::vector<int> inp_sizes_;
   std::vector<int> out_sizes_;
   std::vector<int> tt_ranks_;
@@ -181,7 +181,7 @@ class TTLinearGradientOp : public Operator<Context> {
   }
 
  protected:
-  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor<Context> bias_multiplier_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/unique_ops.cu b/caffe2/operators/unique_ops.cu
index 90252bf401209..992488f0c3dd1 100644
--- a/caffe2/operators/unique_ops.cu
+++ b/caffe2/operators/unique_ops.cu
@@ -73,7 +73,8 @@ bool UniqueOp<CUDAContext>::DoRunWithType() {
   const T* input = inputTensor.template data<T>();
   thrust_unique_buffer_.Resize(N);
   auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
-  context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);
+  context_.template CopyItems<CUDAContext, CUDAContext>(
+      inputTensor.meta(), N, input, buffer);
 
   // Create two vectors of {0, 1, ..., N-1} on CUDA device
   thrust::device_vector<int> order1(N), order2(N);
@@ -114,7 +115,8 @@ bool UniqueOp<CUDAContext>::DoRunWithType() {
 
   uniqueTensor->Resize(K);
   T* unique = uniqueTensor->template mutable_data<T>();
-  context_.CopyItemsSameDevice(thrust_unique_buffer_.meta(), K, buffer, unique);
+  context_.template CopyItems<CUDAContext, CUDAContext>(
+      thrust_unique_buffer_.meta(), K, buffer, unique);
 
   // Compute the remapping. For example, for the number 1, if we look at
   // order2[0] and order2[1], we know that input2[0:2) are all 1. They are all
diff --git a/caffe2/operators/unique_ops.h b/caffe2/operators/unique_ops.h
index 5def615fbfb42..d8af029f16e9c 100644
--- a/caffe2/operators/unique_ops.h
+++ b/caffe2/operators/unique_ops.h
@@ -47,9 +47,9 @@ class UniqueOp : public Operator<Context> {
 
  private:
   vector<int> order_;
-  Tensor thrust_unique_buffer_{Context::GetDeviceType()};
-  Tensor cuda_order_buffer_{Context::GetDeviceType()};
-  Tensor second_order_buffer_{Context::GetDeviceType()};
+  Tensor<Context> thrust_unique_buffer_;
+  Tensor<Context> cuda_order_buffer_;
+  Tensor<Context> second_order_buffer_;
 
  public:
   OUTPUT_TAGS(UNIQUE, REMAPPING);
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index b20ac903999e9..1abf2130953a7 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -1276,7 +1276,7 @@ template <typename T>
 bool RangeOp<CPUContext>::DoRunOnDevice(
     const T& start,
     const T& step,
-    Tensor* output) {
+    Tensor<CPUContext>* output) {
   auto* output_data = output->template mutable_data<T>();
   for (int i = 0; i < output->size(); ++i) {
     output_data[i] = i * step + start;
diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu
index c97f3a72e50d8..a340734e690b4 100644
--- a/caffe2/operators/utility_ops.cu
+++ b/caffe2/operators/utility_ops.cu
@@ -46,7 +46,7 @@ class CopyOnDeviceLikeOp<CUDAContext, CUDAContext, CUDAContext>
 
   bool RunOnDevice() override {
     auto& input = Input(0);
-    auto* output = OperatorBase::Output<Tensor>(0, CUDA);
+    auto* output = OperatorBase::Output<Tensor<CUDAContext>>(0);
     CUDAContext context(GetGPUIDForPointer(Input(1).raw_data()));
     output->ResizeLike(input);
     context.template CopyItems<CUDAContext, CUDAContext>(
@@ -143,7 +143,7 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
               << std::endl;
 
     for (int j = 0; j < InputSize(); j++) {
-      Tensor cpu_X(CPU);
+      TensorCPU cpu_X;
       cpu_X.ResizeLike(Input(j));
       // Hack to cause allocaiton happen here, so it won't happen
       // when we do CopyFrom. We need the mutex then because host->gpu
@@ -192,7 +192,7 @@ ElwiseMaxKernel(const float* X, const float* Y, float* maxout, const int N) {
 
 template <>
 bool MaxOp<float, CUDAContext>::Compute() {
-  float* output_data = Output(0)->template mutable_data<float>();
+  float* output_data = Output(0)->mutable_data<float>();
   const int N = Input(0).size();
 
   // Run pairwise-maxes
@@ -223,7 +223,7 @@ ElwiseMinKernel(const float* X, const float* Y, float* minout, const int N) {
 
 template <>
 bool MinOp<float, CUDAContext>::Compute() {
-  float* output_data = Output(0)->template mutable_data<float>();
+  float* output_data = Output(0)->mutable_data<float>();
   const int N = Input(0).size();
 
   // Run pairwise-mines
@@ -274,7 +274,7 @@ bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
         output.data<float>(),
         input.data<float>(),
         grad_output.data<float>(),
-        grad_input->template mutable_data<float>());
+        grad_input->mutable_data<float>());
   }
   return true;
 }
@@ -299,7 +299,7 @@ __global__ void GatherKernel(
 template <>
 bool GatherOp<CUDAContext>::RunOnDevice() {
   return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
+      this, OperatorBase::Input<TensorCUDA>(INDICES));
 }
 
 template <>
@@ -501,14 +501,13 @@ template <typename T>
 bool RangeOp<CUDAContext>::DoRunOnDevice(
     const T& start,
     const T& step,
-    Tensor* output) {
+    Tensor<CUDAContext>* output) {
   int N = output->size();
   RangeKernel<<<
       CAFFE_GET_BLOCKS(N),
       CAFFE_CUDA_NUM_THREADS,
       0,
-      context_.cuda_stream()>>>(
-      N, output->template mutable_data<T>(), start, step);
+      context_.cuda_stream()>>>(N, output->mutable_data<T>(), start, step);
   return true;
 }
 
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index ce9a1c0c70279..a0eb0f3c531f0 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -26,7 +26,7 @@ class NanCheckOp final : public Operator<Context> {
 
  private:
   TensorPrinter tensorPrinter_;
-  Tensor scratch_{Context::GetDeviceType()};
+  Tensor<Context> scratch_;
 };
 
 struct GetNanCheckGradient : public GradientMakerBase {
@@ -54,7 +54,7 @@ class WallClockTimeOp final : public Operator<Context> {
             std::chrono::high_resolution_clock::now().time_since_epoch())
             .count());
 
-    TensorCPU* output = Output(0);
+    TensorCPU* output = OperatorBase::Output<TensorCPU>(0);
     output->Resize();
     *output->template mutable_data<int64_t>() = nanoseconds;
 
@@ -90,8 +90,8 @@ class PrintOp final : public Operator<Context> {
       return true;
     }
 
-    if (!OperatorBase::InputIsType<Tensor>(0, Context::GetDeviceType()) &&
-        !OperatorBase::InputIsType<Tensor>(0, CPU)) {
+    if (!OperatorBase::InputIsType<Tensor<Context>>(0) &&
+        !OperatorBase::InputIsType<TensorCPU>(0)) {
       LOG(INFO) << "Blob of type: "
                 << OperatorBase::Inputs().at(0)->meta().name();
       return true;
@@ -112,9 +112,9 @@ class PrintOp final : public Operator<Context> {
         unsigned char,
         std::string>;
 
-    if (OperatorBase::InputIsType<Tensor>(0, CPU)) {
+    if (OperatorBase::InputIsType<TensorCPU>(0)) {
       return DispatchHelper<Types>::call(
-          this, OperatorBase::Input<Tensor>(0, CPU));
+          this, OperatorBase::Input<TensorCPU>(0));
     } else {
       return DispatchHelper<Types>::call(this, Input(0));
     }
@@ -127,9 +127,9 @@ class PrintOp final : public Operator<Context> {
     // pointing to the right instantiation. Note that tensor_copy_if_needed
     // will handle memory deallocation itself so no smart pointer is needed.
     const TensorCPU* tensor;
-    Tensor tensor_copy_if_needed(CPU);
-    if (OperatorBase::InputIsType<Tensor>(0, CPU)) {
-      tensor = &OperatorBase::Input<Tensor>(0, CPU);
+    TensorCPU tensor_copy_if_needed;
+    if (OperatorBase::InputIsType<TensorCPU>(0)) {
+      tensor = &OperatorBase::Input<TensorCPU>(0);
     } else {
       tensor_copy_if_needed.CopyFrom(Input(0), &context_);
       // Make sure that the copy is finished.
@@ -215,7 +215,7 @@ class FlattenToVecOp : public Operator<Context> {
         input.dims().size(), 1, "The rank of the tensor must be >= 1.");
     output->Resize(input.size());
 
-    context_.CopyItemsSameDevice(
+    context_.template CopyItems<Context, Context>(
         input.meta(),
         input.size(),
         input.raw_data(),
@@ -237,7 +237,7 @@ class ResizeLikeOp : public Operator<Context> {
     auto* output = Output(0);
     CAFFE_ENFORCE_EQ(input0.size(), input1.size());
     output->ResizeLike(Input(1));
-    context_.CopyItemsSameDevice(
+    context_.template CopyItems<Context, Context>(
         input0.meta(),
         input0.size(),
         input0.raw_data(),
@@ -532,10 +532,10 @@ class ScatterWeightedSumOp : public Operator<Context> {
     }
     return true;
   }
-  Tensor x_data_host_{CPU};
-  Tensor weights_host_{CPU};
-  Tensor x_data_device_{Context::GetDeviceType()};
-  Tensor weights_device_{Context::GetDeviceType()};
+  Tensor<CPUContext> x_data_host_;
+  Tensor<CPUContext> weights_host_;
+  Tensor<Context> x_data_device_;
+  Tensor<Context> weights_device_;
 };
 
 /**
@@ -663,7 +663,7 @@ class ScatterAssignOp : public Operator<Context> {
       // double-checking the indices, but it's fine as it's DCHECK only
       DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
                                   << ", range 0 to " << N;
-      context_.template CopySameDevice<T>(
+      context_.template Copy<T, Context, Context>(
           block_size, slicesData + block_size * i, data + block_size * idx);
     }
   }
@@ -678,8 +678,8 @@ class CopyOp : public Operator<Context> {
   USE_SIMPLE_CTOR_DTOR(CopyOp);
 
   bool RunOnDevice() override {
-    auto& input = OperatorBase::Input<Tensor>(0, SrcContext::GetDeviceType());
-    auto* output = OperatorBase::Output<Tensor>(0, DstContext::GetDeviceType());
+    auto& input = OperatorBase::Input<Tensor<SrcContext>>(0);
+    auto* output = OperatorBase::Output<Tensor<DstContext>>(0);
     output->ResizeLike(input);
     this->context_.template CopyItems<SrcContext, DstContext>(
         input.meta(),
@@ -943,7 +943,7 @@ class HasElementsOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& input = Input(0);
-    auto* output = Output(0);
+    auto* output = OperatorBase::Output<TensorCPU>(0);
     output->Resize(std::vector<TIndex>{});
     *output->template mutable_data<bool>() = input.size() > 0;
     return true;
@@ -958,7 +958,7 @@ class IsEmptyOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     auto& input = Input(0);
-    auto* output = Output(0);
+    auto* output = OperatorBase::Output<TensorCPU>(0);
     output->Resize(std::vector<TIndex>{});
     *output->template mutable_data<bool>() = (input.size() == 0);
     return true;
@@ -1026,7 +1026,7 @@ class GatherOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(INDICES, CPU));
+        this, OperatorBase::Input<TensorCPU>(INDICES));
   }
 
   template <typename Index>
@@ -1059,7 +1059,7 @@ class GatherOp : public Operator<Context> {
           " data_dim=",
           data.dim(0));
       auto src = src_base + idx * block_bytesize;
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           data.meta(), block_size, src, out + block_bytesize * i);
     }
     return true;
@@ -1076,7 +1076,7 @@ class GatherRangesOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(RANGES, CPU));
+        this, OperatorBase::Input<TensorCPU>(RANGES));
   }
 
   template <typename Index>
@@ -1123,7 +1123,7 @@ class GatherRangesOp : public Operator<Context> {
       auto rangeSizeBytes = rangeLength * itemsize;
       CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize);
       CAFFE_ENFORCE(rangeStart + rangeLength <= data.size());
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           data.meta(),
           rangeLength,
           rawData + rangeStart * itemsize,
@@ -1155,7 +1155,7 @@ class LengthsGatherOp : public Operator<Context> {
 
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<Tensor>(INDICES, CPU));
+        this, OperatorBase::Input<TensorCPU>(INDICES));
   }
 
   template <typename Index>
@@ -1202,7 +1202,7 @@ class LengthsGatherOp : public Operator<Context> {
     for (size_t i = 0; i < indices.size(); ++i) {
       auto idx = indices_data[i];
       auto length = lengths_data[idx];
-      context_.CopyItemsSameDevice(
+      context_.template CopyItems<Context, Context>(
           items.meta(),
           length * block_size,
           src_base + offsets_[idx] * block_bytesize,
@@ -1252,7 +1252,7 @@ class UnsafeCoalesceOp final : public Operator<Context> {
     size_t coalesced_offset = 0;
     for (auto i = 0; i < InputSize(); ++i) {
       const auto input_nbytes = Input(i).nbytes();
-      context_.CopyBytesSameDevice(
+      context_.template CopyBytes<Context, Context>(
           input_nbytes,
           (const uint8_t*)Input(i).raw_data(),
           coalesced->template mutable_data<uint8_t>() + coalesced_offset);
@@ -1353,7 +1353,7 @@ class RangeOp : public Operator<Context> {
     if (std::is_same<Context, TensorCPU>::value) {
       return Input(index).template data<T>()[0];
     } else {
-      local_.CopyFrom(Input(index));
+      local_.template CopyFrom<Context>(Input(index));
       return local_.template data<T>()[0];
     }
   }
@@ -1409,11 +1409,11 @@ class RangeOp : public Operator<Context> {
   }
 
   template <typename T>
-  bool DoRunOnDevice(const T& start, const T& step, Tensor* output);
+  bool DoRunOnDevice(const T& start, const T& step, Tensor<Context>* output);
 
  private:
   // local CPU tensor for copying constants.
-  Tensor local_{CPU};
+  TensorCPU local_;
 };
 
 class ThrowExceptionOp : public Operator<CPUContext> {
diff --git a/caffe2/operators/utility_ops_gpu_test.cc b/caffe2/operators/utility_ops_gpu_test.cc
index fb8baba549768..4b9b10eafbc82 100644
--- a/caffe2/operators/utility_ops_gpu_test.cc
+++ b/caffe2/operators/utility_ops_gpu_test.cc
@@ -19,10 +19,10 @@ static void AddConstInput(
   option.set_device_type(CUDA);
   CUDAContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
   tensor->Resize(shape);
   math::Set<float, CUDAContext>(
-      tensor->size(), value, tensor->template mutable_data<float>(), &context);
+      tensor->size(), value, tensor->mutable_data<float>(), &context);
   return;
 }
 
@@ -43,7 +43,7 @@ TEST(UtilityOpGPUTest, testReshapeWithScalar) {
   unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
   EXPECT_TRUE(op->Run());
   Blob* XNew = ws.GetBlob("XNew");
-  const Tensor& XNewTensor = XNew->Get<Tensor>();
+  const Tensor<CUDAContext>& XNewTensor = XNew->Get<Tensor<CUDAContext>>();
   EXPECT_EQ(1, XNewTensor.ndim());
   EXPECT_EQ(1, XNewTensor.size());
 }
diff --git a/caffe2/operators/utility_ops_test.cc b/caffe2/operators/utility_ops_test.cc
index 7b4bcb3144f3e..74705173fa3de 100644
--- a/caffe2/operators/utility_ops_test.cc
+++ b/caffe2/operators/utility_ops_test.cc
@@ -16,10 +16,10 @@ static void AddConstInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
   math::Set<float, CPUContext>(
-      tensor->size(), value, tensor->template mutable_data<float>(), &context);
+      tensor->size(), value, tensor->mutable_data<float>(), &context);
   return;
 }
 
@@ -37,7 +37,7 @@ TEST(UtilityOpTest, testReshapeWithScalar) {
   unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
   EXPECT_TRUE(op->Run());
   Blob* XNew = ws.GetBlob("XNew");
-  const TensorCPU& XNewTensor = XNew->Get<Tensor>();
+  const TensorCPU& XNewTensor = XNew->Get<Tensor<CPUContext>>();
   EXPECT_EQ(1, XNewTensor.ndim());
   EXPECT_EQ(1, XNewTensor.size());
 }
diff --git a/caffe2/operators/weighted_multi_sampling_op.cc b/caffe2/operators/weighted_multi_sampling_op.cc
index 57d50ed34b53d..6f565c6ec1219 100644
--- a/caffe2/operators/weighted_multi_sampling_op.cc
+++ b/caffe2/operators/weighted_multi_sampling_op.cc
@@ -84,14 +84,14 @@ OPERATOR_SCHEMA(WeightedMultiSampling)
     .SetDoc(R"DOC(
 The operator performs sampling based on the input sampling weights.
 All weights are cummulative probability thus sorted. The output is
-a 1-D tensor (Tensor). If two inputs are given, the second input
+a 1-D tensor (Tensor<int>). If two inputs are given, the second input
 is used to provide shape of the output sample tensor. Otherwise, we use
 argument `num_samples` to determine the number of samples to generate.
 )DOC")
     .Input(
         0,
         "sampling_cdf",
-        "An optional 1-D Tensor."
+        "An optional 1-D Tensor<float>."
         "Input cumulative sampling probability (such as [0.2, 0.5, 0.8, 1.5])."
         " All weights must be non-negative numbers. Note that the last value of"
         " CDF is not necessary 1. If the last value is not 1, all values in"
@@ -105,7 +105,7 @@ argument `num_samples` to determine the number of samples to generate.
         "sampled_indexes",
         "The output tensor contains indices sampled from distribution given"
         "by the weight vector in the input tensor"
-        "The output is a 1-D Tensor of size determined by argument"
+        "The output is a 1-D Tensor<int> of size determined by argument"
         "`num_samples` or the second input tensor.")
     .Arg("num_samples", "number of samples to sample from the input data");
 
diff --git a/caffe2/operators/weighted_sample_op.cc b/caffe2/operators/weighted_sample_op.cc
index c8b278b30f8c9..2ffd35f6e6419 100644
--- a/caffe2/operators/weighted_sample_op.cc
+++ b/caffe2/operators/weighted_sample_op.cc
@@ -85,33 +85,33 @@ OPERATOR_SCHEMA(WeightedSample)
     .SetDoc(R"DOC(
 The operator performs sampling based on the input sampling weights for
 each batch. All weights must be non-negative numbers.
-The input is a 2-D tensor (Tensor) of size (batch_size x weights_dim).
+The input is a 2-D tensor (Tensor<float>) of size (batch_size x weights_dim).
 For each batch, an index is randomly sampled from the distribution given by
 the weights of the corresponding batch.
-The output is a 1-D tensor (Tensor) of size (batch_size x 1) and
+The output is a 1-D tensor (Tensor<int>) of size (batch_size x 1) and
 contains the index(es) of the sampled output.
 )DOC")
     .Input(
         0,
         "sampling_weights",
-        "A 2-D Tensor of size (batch_size x weights_dim)."
+        "A 2-D Tensor<float> of size (batch_size x weights_dim)."
         "All weights must be non-negative numbers.")
     .Input(
         1,
         "sampling_values",
-        "An optional 2-D Tensor of size (batch_size x weights_dim)."
+        "An optional 2-D Tensor<float> of size (batch_size x weights_dim)."
         "Its values correspond to the sampling weights.")
     .Output(
         0,
         "sampled_indexes",
         "The output tensor contains index(es) sampled from distribution given"
         "by the weight vector(s) in the input tensor"
-        "The output is a 1-D Tensor of size (batch_size x 1)")
+        "The output is a 1-D Tensor<int> of size (batch_size x 1)")
     .Output(
         1,
         "sampled_values",
         "The output tensor contains value(s) selected by the sampled index(es)"
-        "It is a 1-D Tensor of size (batch_size x 1)");
+        "It is a 1-D Tensor<float> of size (batch_size x 1)");
 
 SHOULD_NOT_DO_GRADIENT(WeightedSample);
 } // namespace caffe2
diff --git a/caffe2/operators/weighted_sample_op.cu b/caffe2/operators/weighted_sample_op.cu
index ba44868aa3b46..fa247c61256a3 100644
--- a/caffe2/operators/weighted_sample_op.cu
+++ b/caffe2/operators/weighted_sample_op.cu
@@ -58,7 +58,7 @@ bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
 
     const float* in_weights_data = in_weights.data<float>();
     const float* in_val_data = nullptr;
-    int* out_idx_data = out_idx->template mutable_data<int>();
+    int* out_idx_data = out_idx->mutable_data<int>();
     float* out_val_data = nullptr;
 
     if (OutputSize() == 2) {
@@ -71,7 +71,7 @@ bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
 
       auto* out_val = Output(1);
       out_val->Resize(batch_size, 1);
-      out_val_data = out_val->template mutable_data<float>();
+      out_val_data = out_val->mutable_data<float>();
     }
 
     float* unif_samples_data = unif_samples_.mutable_data<float>();
@@ -92,11 +92,11 @@ bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
         out_val_data);
   } else {
     out_idx->Resize(0);
-    out_idx->template mutable_data<int>();
+    out_idx->mutable_data<int>();
     if (OutputSize() == 2) {
       auto* out_val = Output(1);
       out_val->Resize(0);
-      out_val->template mutable_data<float>();
+      out_val->mutable_data<float>();
     }
   }
 
diff --git a/caffe2/operators/weighted_sample_op.h b/caffe2/operators/weighted_sample_op.h
index ac5a7cdd57699..e870511f2c28a 100644
--- a/caffe2/operators/weighted_sample_op.h
+++ b/caffe2/operators/weighted_sample_op.h
@@ -22,7 +22,7 @@ class WeightedSampleOp final : public Operator<Context> {
 
  private:
   vector<float> cum_mass_;
-  Tensor unif_samples_{Context::GetDeviceType()};
+  Tensor<Context> unif_samples_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/operators/while_op.h b/caffe2/operators/while_op.h
index 258862b690e4a..dff6f1190be6c 100644
--- a/caffe2/operators/while_op.h
+++ b/caffe2/operators/while_op.h
@@ -35,7 +35,7 @@ class WhileOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     CAFFE_ENFORCE(
-        this->template InputIsType<Tensor>(0, Context::GetDeviceType()),
+        this->template InputIsType<Tensor<Context>>(0),
         "Invalid condition in While operator: tensor expected");
 
     const auto& condition = Input(0);
diff --git a/caffe2/operators/workspace_ops.cc b/caffe2/operators/workspace_ops.cc
index 24655af325591..d9775aa3a7752 100644
--- a/caffe2/operators/workspace_ops.cc
+++ b/caffe2/operators/workspace_ops.cc
@@ -15,8 +15,7 @@ class GetAllBlobNamesOp final : public Operator<CPUContext> {
     auto* out = Output(0);
     const auto& blobs = include_shared_ ? ws_->Blobs() : ws_->LocalBlobs();
     out->Resize(blobs.size());
-    std::copy(
-        blobs.begin(), blobs.end(), out->template mutable_data<std::string>());
+    std::copy(blobs.begin(), blobs.end(), out->mutable_data<std::string>());
     return true;
   }
 
diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc
index 8a1b736399562..dcab984f0244c 100644
--- a/caffe2/opt/fusion.cc
+++ b/caffe2/opt/fusion.cc
@@ -40,10 +40,10 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) {
       continue;
     }
 
-#define EXPOSE_TENSOR_DATA(name, index, inputs)                            \
-  auto name = repr::nn::get<repr::Tensor>(inputs[index]);                  \
-  assert(ws->HasBlob(name->getName()) && "Blob not in workspace");         \
-  auto name##Tensor = ws->GetBlob(name->getName())->GetMutableTensor(CPU); \
+#define EXPOSE_TENSOR_DATA(name, index, inputs)                              \
+  auto name = repr::nn::get<repr::Tensor>(inputs[index]);                    \
+  assert(ws->HasBlob(name->getName()) && "Blob not in workspace");           \
+  auto name##Tensor = ws->GetBlob(name->getName())->GetMutable<TensorCPU>(); \
   auto name##Data = name##Tensor->mutable_data<float>();
 
     EXPOSE_TENSOR_DATA(filter, 1, convInputs);
diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc
index 75baec0e9be66..c1c6c310786c5 100644
--- a/caffe2/opt/onnxifi_transformer.cc
+++ b/caffe2/opt/onnxifi_transformer.cc
@@ -175,11 +175,11 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp(
 
       // Feed into workspace as CPU Tensors
       auto* blob = ws->CreateBlob(t.name());
-      auto* cpu_tensor = blob->GetMutableTensor(CPU);
+      auto* cpu_tensor = blob->GetMutable<TensorCPU>();
       std::vector<TIndex> dims;
       std::copy(t.dims().begin(), t.dims().end(), dims.begin());
       cpu_tensor->Resize(dims);
-      context.CopyBytesSameDevice(
+      context.template CopyBytes<CPUContext, CPUContext>(
           cpu_tensor->size() * sizeof(float),
           static_cast<const void*>(t.raw_data().data()),
           cpu_tensor->raw_mutable_data(TypeMeta::Make<float>()));
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index b7784fffa5059..04df247d821da 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -62,7 +62,7 @@ CAFFE_DEFINE_TYPED_REGISTRY(
     BlobFeederBase,
     std::unique_ptr);
 
-REGISTER_BLOB_FETCHER((TypeMeta::Id<Tensor>()), TensorFetcher);
+REGISTER_BLOB_FETCHER((TypeMeta::Id<TensorCPU>()), TensorFetcher<CPUContext>);
 REGISTER_BLOB_FEEDER(CPU, TensorFeeder<CPUContext>);
 
 Workspace* GetCurrentWorkspace() {
@@ -326,7 +326,7 @@ void addObjectMethods(py::module& m) {
           })
       .def(
           "tensor",
-          [](Blob* blob) { return py::cast(blob->GetMutableTensor(CPU)); },
+          [](Blob* blob) { return py::cast(blob->GetMutable<TensorCPU>()); },
           py::return_value_policy::reference_internal)
       .def(
           "_feed",
@@ -403,7 +403,7 @@ void addObjectMethods(py::module& m) {
               // keep this behavior for backward compatibility
               t->mutable_data<float>();
             }
-            auto res = TensorFetcher().FetchTensor(*t, false);
+            auto res = TensorFetcher<CPUContext>().FetchTensor(*t, false);
             return res.obj;
           },
           "Return numpy array pointing to this tensor's data if possible. "
@@ -422,17 +422,17 @@ void addObjectMethods(py::module& m) {
       .def(
           "fetch",
           [](TensorCPU* t) {
-            auto res = TensorFetcher().FetchTensor(*t, true);
+            auto res = TensorFetcher<CPUContext>().FetchTensor(*t, true);
             return res.obj;
           },
           "Copy data from this tensor into a new numpy array.")
       .def(
           "init",
-          [](Tensor* t, std::vector<TIndex> dims, int caffe_type) {
+          [](TensorCPU* t, std::vector<TIndex> dims, int caffe_type) {
             const auto& meta =
                 DataTypeToTypeMeta((TensorProto::DataType)caffe_type);
             CAFFE_ENFORCE(
-                !TensorFetcher().NeedsCopy(t, meta),
+                !TensorFetcher<CPUContext>().NeedsCopy(meta),
                 "Cannot init tensor of this type. Use `feed` instead.");
             t->Resize(dims);
             t->raw_mutable_data(meta);
@@ -725,15 +725,14 @@ void addObjectMethods(py::module& m) {
             for (const auto pair : inputs) {
               const auto& name = pair.first;
               const auto& input = pair.second;
-              tensors_data.emplace(name, Tensor(CPU));
               CAFFE_ENFORCE(
                   PyArray_Check(input.ptr()),
                   "Input must be of type numpy array.");
               PyArrayObject* array =
                   reinterpret_cast<PyArrayObject*>(input.ptr());
               TensorFeeder<CPUContext>().FeedTensor(
-                  DeviceOption(), array, &tensors_data.at(name));
-              tensors.insert(std::make_pair(name, &tensors_data.at(name)));
+                  DeviceOption(), array, &tensors_data[name]);
+              tensors.insert(std::make_pair(name, &tensors_data[name]));
             }
 
 
@@ -741,7 +740,8 @@ void addObjectMethods(py::module& m) {
             instance.RunMap(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
+              pyout.push_back(
+                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
             }
             return pyout;
           })
@@ -750,10 +750,7 @@ void addObjectMethods(py::module& m) {
           [](caffe2::onnx::Caffe2BackendRep& instance,
              std::vector<py::object> inputs) -> std::vector<py::object> {
             Predictor::TensorVector tensors;
-            std::vector<TensorCPU> tensors_data;
-            for (auto i = 0; i < inputs.size(); ++i) {
-              tensors_data.emplace_back(caffe2::CPU);
-            }
+            std::vector<TensorCPU> tensors_data(inputs.size());
             for (auto i = 0; i < inputs.size(); ++i) {
               auto input = inputs[i];
               CAFFE_ENFORCE(
@@ -769,7 +766,8 @@ void addObjectMethods(py::module& m) {
             instance.Run(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
+              pyout.push_back(
+                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
             }
             return pyout;
           });
@@ -860,10 +858,7 @@ void addObjectMethods(py::module& m) {
           [](Predictor& instance,
              std::vector<py::object> inputs) -> std::vector<py::object> {
             Predictor::TensorVector tensors;
-            std::vector<Tensor> tensors_data;
-            for (auto i = 0; i < inputs.size(); ++i) {
-              tensors_data.emplace_back(CPU);
-            }
+            std::vector<TensorCPU> tensors_data(inputs.size());
             for (auto i = 0; i < inputs.size(); ++i) {
               auto input = inputs[i];
               CAFFE_ENFORCE(
@@ -879,7 +874,8 @@ void addObjectMethods(py::module& m) {
             instance.run(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
+              pyout.push_back(
+                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
             }
             return pyout;
           })
@@ -892,21 +888,21 @@ void addObjectMethods(py::module& m) {
             for (const auto pair : inputs) {
               const auto& name = pair.first;
               const auto& input = pair.second;
-              tensors_data.emplace(name, Tensor(CPU));
               CAFFE_ENFORCE(
                   PyArray_Check(input.ptr()),
                   "Input must be of type numpy array.");
               PyArrayObject* array =
                   reinterpret_cast<PyArrayObject*>(input.ptr());
               TensorFeeder<CPUContext>().FeedTensor(
-                  DeviceOption(), array, &tensors_data.at(name));
-              tensors.insert(std::make_pair(name, &tensors_data.at(name)));
+                  DeviceOption(), array, &tensors_data[name]);
+              tensors.insert(std::make_pair(name, &tensors_data[name]));
             }
             std::vector<TensorCPU*> out;
             instance.run_map(tensors, &out);
             std::vector<py::object> pyout;
             for (auto t : out) {
-              pyout.push_back(TensorFetcher().FetchTensor(*t, true).obj);
+              pyout.push_back(
+                  TensorFetcher<CPUContext>().FetchTensor(*t, true).obj);
             }
             return pyout;
           });
diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h
index 894c420afa94b..f46972a05561c 100644
--- a/caffe2/python/pybind_state.h
+++ b/caffe2/python/pybind_state.h
@@ -90,20 +90,19 @@ static_assert(
 int CaffeToNumpyType(const TypeMeta& meta);
 const TypeMeta& NumpyTypeToCaffe(int numpy_type);
 
+template <class Context>
 class TensorFetcher : public BlobFetcherBase {
  public:
   pybind11::object Fetch(const Blob& blob) override {
-    return FetchTensor(blob.Get<Tensor>(), true).obj;
+    return FetchTensor(blob.Get<Tensor<Context>>(), true).obj;
   }
 
-  // Checks whether the data with type `meta` needs to be copied in the context
-  // of `tensor`
-  bool NeedsCopy(const Tensor* tensor, const TypeMeta& meta) const {
-    return tensor->GetStaticContext() != GetCPUStaticContext() ||
+  bool NeedsCopy(const TypeMeta& meta) const {
+    return !std::is_same<Context, CPUContext>::value ||
         CaffeToNumpyType(meta) == NPY_OBJECT;
   }
 
-  FetchedBlob FetchTensor(const Tensor& tensor, bool force_copy) {
+  FetchedBlob FetchTensor(const Tensor<Context>& tensor, bool force_copy) {
     FetchedBlob result;
     CAFFE_ENFORCE_GE(tensor.size(), 0, "Trying to fetch unitilized tensor");
     const int numpy_type = CaffeToNumpyType(tensor.meta());
@@ -116,7 +115,7 @@ class TensorFetcher : public BlobFetcherBase {
     for (const auto dim : tensor.dims()) {
       npy_dims.push_back(dim);
     }
-    result.copied = force_copy || NeedsCopy(&tensor, tensor.meta());
+    result.copied = force_copy || NeedsCopy(tensor.meta());
     void* outPtr;
     if (result.copied) {
       result.obj = py::reinterpret_steal<py::object>(
@@ -124,7 +123,7 @@ class TensorFetcher : public BlobFetcherBase {
       outPtr = static_cast<void*>(
           PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.obj.ptr())));
     } else {
-      outPtr = const_cast<Tensor&>(tensor).raw_mutable_data();
+      outPtr = const_cast<Tensor<Context>&>(tensor).raw_mutable_data();
       result.obj = py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData(
           tensor.ndim(), npy_dims.data(), numpy_type, outPtr));
     }
@@ -147,9 +146,10 @@ class TensorFetcher : public BlobFetcherBase {
     }
 
     if (result.copied) {
-      auto context = tensor.GetStaticContext()->CreateContext();
-      context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr);
-      context->FinishDeviceComputation();
+      Context context;
+      context.template CopyBytes<Context, CPUContext>(
+          tensor.nbytes(), tensor.raw_data(), outPtr);
+      context.FinishDeviceComputation();
     }
     return result;
   }
@@ -161,7 +161,7 @@ class TensorFeeder : public BlobFeederBase {
   void FeedTensor(
       const DeviceOption& option,
       PyArrayObject* original_array,
-      Tensor* tensor) {
+      Tensor<Context>* tensor) {
     PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
     auto g = MakeGuard([&]() { Py_XDECREF(array); });
 
@@ -220,7 +220,7 @@ class TensorFeeder : public BlobFeederBase {
             "instead of unicode strings.");
         break;
       default:
-        context.CopyBytesFromCPU(
+        context.template CopyBytes<CPUContext, Context>(
             tensor->size() * meta.itemsize(),
             static_cast<void*>(PyArray_DATA(array)),
             tensor->raw_mutable_data(meta));
@@ -230,10 +230,7 @@ class TensorFeeder : public BlobFeederBase {
 
   virtual void
   Feed(const DeviceOption& option, PyArrayObject* original_array, Blob* blob) {
-    FeedTensor(
-        option,
-        original_array,
-        blob->GetMutableTensor(Context::GetDeviceType()));
+    FeedTensor(option, original_array, blob->GetMutable<Tensor<Context>>());
   }
 };
 
@@ -319,26 +316,29 @@ class PythonOpBase : public Operator<Context> {
         const auto* blob = &InputBlob(i);
         // Allow CPU tensors in addition to operator context's tensors
         py::object py_obj;
-        if (blob->template IsType<Tensor>()) {
+        if (blob->template IsType<Tensor<CPUContext>>()) {
           if (use_dlpack) {
             DLPackWrapper<CPUContext> wrapper(
-                const_cast<Tensor*>(&blob->template Get<Tensor>()), cpu_option);
+                const_cast<Tensor<CPUContext>*>(
+                    &blob->template Get<Tensor<CPUContext>>()),
+                cpu_option);
             // copy wrapper
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                &blob->template Get<Tensor>(),
+                &blob->template Get<Tensor<CPUContext>>(),
                 py::return_value_policy::reference);
           }
         } else {
           if (use_dlpack) {
             DLPackWrapper<Context> wrapper(
-                const_cast<Tensor*>(&blob->template Get<Tensor>()),
+                const_cast<Tensor<Context>*>(
+                    &blob->template Get<Tensor<Context>>()),
                 this->device_option());
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                &blob->template Get<Tensor>(),
+                &blob->template Get<Tensor<Context>>(),
                 py::return_value_policy::reference);
           }
         }
@@ -365,31 +365,31 @@ class PythonOpBase : public Operator<Context> {
 
         // make sure output blob is initialized before creating the binding
         if (forced_cpu_outputs_.count(i)) {
-          blob->GetMutableTensor(Context::GetDeviceType());
+          blob->template GetMutable<Tensor<CPUContext>>();
         } else {
-          blob->GetMutableTensor(Context::GetDeviceType());
+          blob->template GetMutable<Tensor<Context>>();
         }
 
         py::object py_obj;
-        if (blob->template IsType<Tensor>()) {
+        if (blob->template IsType<Tensor<CPUContext>>()) {
           if (use_dlpack) {
             DLPackWrapper<CPUContext> wrapper(
-                blob->GetMutableTensor(Context::GetDeviceType()), cpu_option);
+                blob->template GetMutable<Tensor<CPUContext>>(), cpu_option);
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                blob->GetMutableTensor(Context::GetDeviceType()),
+                blob->template GetMutable<Tensor<CPUContext>>(),
                 py::return_value_policy::reference);
           }
         } else {
           if (use_dlpack) {
             DLPackWrapper<Context> wrapper(
-                blob->GetMutableTensor(Context::GetDeviceType()),
+                blob->template GetMutable<Tensor<Context>>(),
                 this->device_option());
             py_obj = py::cast(wrapper, py::return_value_policy::copy);
           } else {
             py_obj = py::cast(
-                blob->GetMutableTensor(Context::GetDeviceType()),
+                blob->template GetMutable<Tensor<Context>>(),
                 py::return_value_policy::reference);
           }
         }
diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h
index 37bf82e90bc30..1ba3f0fa24476 100644
--- a/caffe2/python/pybind_state_dlpack.h
+++ b/caffe2/python/pybind_state_dlpack.h
@@ -23,7 +23,7 @@ const TypeMeta& DLTypeToCaffe(const DLDataType& dl_type);
 template <class Context>
 class DLPackWrapper {
  public:
-  DLPackWrapper(Tensor* tensor, DeviceOption device_option)
+  DLPackWrapper(Tensor<Context>* tensor, DeviceOption device_option)
       : tensor(tensor), device_option(device_option) {}
 
   py::object data() {
@@ -120,7 +120,7 @@ class DLPackWrapper {
         });
   }
 
-  Tensor* tensor;
+  Tensor<Context>* tensor;
   DeviceOption device_option;
   DLManagedTensor managed_tensor;
 };
diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc
index 0a8b10aa54e93..9ceec10dbd71c 100644
--- a/caffe2/python/pybind_state_gpu.cc
+++ b/caffe2/python/pybind_state_gpu.cc
@@ -31,6 +31,7 @@ REGISTER_CUDA_OPERATOR(
     PythonDLPackGradient,
     PythonGradientOp<CUDAContext, true>);
 
+REGISTER_BLOB_FETCHER((TypeMeta::Id<TensorCUDA>()), TensorFetcher<CUDAContext>);
 REGISTER_BLOB_FEEDER(CUDA, TensorFeeder<CUDAContext>);
 
 namespace py = pybind11;
diff --git a/caffe2/python/pybind_state_hip.cc b/caffe2/python/pybind_state_hip.cc
index bb4b4c715c5a8..b770ea00001e3 100644
--- a/caffe2/python/pybind_state_hip.cc
+++ b/caffe2/python/pybind_state_hip.cc
@@ -20,6 +20,7 @@ REGISTER_HIP_OPERATOR(
 REGISTER_HIP_OPERATOR(PythonDLPack, PythonOp<HIPContext, true>);
 REGISTER_HIP_OPERATOR(PythonDLPackGradient, PythonGradientOp<HIPContext, true>);
 
+REGISTER_BLOB_FETCHER((TypeMeta::Id<TensorHIP>()), TensorFetcher<HIPContext>);
 REGISTER_BLOB_FEEDER(HIP, TensorFeeder<HIPContext>);
 
 namespace py = pybind11;
diff --git a/caffe2/python/pybind_state_int8.cc b/caffe2/python/pybind_state_int8.cc
index 1df33130acb05..683a4cee503ca 100644
--- a/caffe2/python/pybind_state_int8.cc
+++ b/caffe2/python/pybind_state_int8.cc
@@ -45,7 +45,8 @@ class Int8TensorFetcher : public BlobFetcherBase {
     void* ptr = static_cast<void*>(
         PyArray_DATA(reinterpret_cast<PyArrayObject*>(data_array.ptr())));
     CPUContext context;
-    context.CopyBytesSameDevice(src.t.nbytes(), src.t.raw_data(), ptr);
+    context.template CopyBytes<CPUContext, CPUContext>(
+        src.t.nbytes(), src.t.raw_data(), ptr);
     context.FinishDeviceComputation();
 
     auto result = pybind11::cast<pybind11::object>(
diff --git a/caffe2/queue/blobs_queue_db.h b/caffe2/queue/blobs_queue_db.h
index 317f371ac48cf..7d4ac146f05df 100644
--- a/caffe2/queue/blobs_queue_db.h
+++ b/caffe2/queue/blobs_queue_db.h
@@ -16,8 +16,8 @@ namespace {
 const std::string& GetStringFromBlob(Blob* blob) {
   if (blob->template IsType<string>()) {
     return blob->template Get<string>();
-  } else if (blob->template IsType<Tensor>()) {
-    return *blob->template Get<Tensor>().template data<string>();
+  } else if (blob->template IsType<Tensor<CPUContext>>()) {
+    return *blob->template Get<Tensor<CPUContext>>().template data<string>();
   } else {
     CAFFE_THROW("Unsupported Blob type");
   }
diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h
index 8e924176a02b0..4ed6acaa1b69a 100644
--- a/caffe2/queue/queue_ops.h
+++ b/caffe2/queue/queue_ops.h
@@ -146,7 +146,7 @@ class SafeDequeueBlobsOp final : public Operator<Context> {
       }
       for (int col = 0; col < size; ++col) {
         auto* out = this->Output(col);
-        const auto& in = blobPtrs_.at(col)->template Get<Tensor>();
+        const auto& in = blobPtrs_.at(col)->template Get<Tensor<Context>>();
         if (i == 0) {
           out->CopyFrom(in);
         } else {
diff --git a/caffe2/queue/rebatching_queue.cc b/caffe2/queue/rebatching_queue.cc
index cfb43a99f491b..6be252f44a11f 100644
--- a/caffe2/queue/rebatching_queue.cc
+++ b/caffe2/queue/rebatching_queue.cc
@@ -50,12 +50,12 @@ void concat(
         continue;
       }
 
-      context.CopyItemsToCPU(
+      context.CopyItems<CPUContext, CPUContext>(
           input.meta(),
           input.size(),
           input.raw_data() /* src */,
           destinations[j] /* dst */
-      );
+          );
 
       destinations[j] =
           (char*)destinations[j] + input.size() * input.itemsize();
@@ -84,8 +84,8 @@ std::vector<std::vector<TensorCPU>> split(
     CAFFE_ENFORCE_EQ(input.dims().at(0), outputSize);
 
     for (int i = 0; i < outputSize; ++i) {
-      outputs[i].push_back(Tensor(outputDims, CPU));
-      context.CopyItemsToCPU(
+      outputs[i].push_back(TensorCPU(outputDims));
+      context.CopyItems<CPUContext, CPUContext>(
           input.meta(),
           innerSize,
           (char*)input.raw_data() + i * innerSize * itemSize /* src */,
diff --git a/caffe2/queue/rebatching_queue_ops.h b/caffe2/queue/rebatching_queue_ops.h
index 5c9059c05b6eb..80749a42692e7 100644
--- a/caffe2/queue/rebatching_queue_ops.h
+++ b/caffe2/queue/rebatching_queue_ops.h
@@ -30,7 +30,7 @@ class EnqueueRebatchingQueueOp : public Operator<CPUContext> {
     auto& queue = Inputs()[0]->template Get<RebatchingQueuePtr>();
     CHECK(queue);
     CAFFE_ENFORCE_EQ(InputSize(), queue->numBlobs() + 1);
-    std::vector<const Tensor*> inputTensors;
+    std::vector<const TensorCPU*> inputTensors;
     inputTensors.reserve(InputSize() - 1);
     for (int i = 1; i < InputSize(); ++i) {
       inputTensors.push_back(&Input(i));
@@ -54,7 +54,7 @@ class DequeueRebatchingQueueOp : public Operator<CPUContext> {
     auto& queue = Inputs()[0]->template Get<RebatchingQueuePtr>();
     CHECK(queue);
 
-    std::vector<Tensor*> outputTensors;
+    std::vector<TensorCPU*> outputTensors;
     outputTensors.reserve(OutputSize());
     for (int i = 0; i < OutputSize(); ++i) {
       outputTensors.push_back(Output(i));
diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h
index c25509b3d00af..bb30247ca5f84 100644
--- a/caffe2/sgd/adam_op.h
+++ b/caffe2/sgd/adam_op.h
@@ -88,7 +88,7 @@ class AdamOp final : public Operator<Context> {
         epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5f)) {}
   bool RunOnDevice() override {
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(ITER, CPU));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<TensorCPU>(ITER));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(PARAM).size());
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENT_1).size());
@@ -98,7 +98,7 @@ class AdamOp final : public Operator<Context> {
     Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2));
 
     const auto iter =
-        OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
+        OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
 
     const auto t = iter + 1;
     const auto correction =
@@ -177,7 +177,7 @@ class SparseAdamOp final : public Operator<Context> {
   bool DoRunWithType() {
     const auto* lr = Input(LR).template data<T>();
     const auto iter =
-        OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
+        OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
 
     const auto t = iter + 1;
     const auto correction =
@@ -287,7 +287,7 @@ class RowWiseSparseAdamOp final : public Operator<Context> {
   bool DoRunWithType() {
     const auto* lr = Input(LR).template data<T>();
     const auto iter =
-        OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
+        OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
 
     const auto t = iter + 1;
     const auto correction =
diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu
index 8fdde749a4636..8eb1b8835c96d 100644
--- a/caffe2/sgd/adam_op_gpu.cu
+++ b/caffe2/sgd/adam_op_gpu.cu
@@ -129,7 +129,7 @@ bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
   auto N = Input(GRAD).size();
   auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim());
   const auto iter =
-    OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
+      OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
   const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) /
       (1.0f - std::pow(beta1_, iter + 1));
 
diff --git a/caffe2/sgd/fp16_momentum_sgd_op.h b/caffe2/sgd/fp16_momentum_sgd_op.h
index 556b8a21f0524..85a9d53396fcd 100644
--- a/caffe2/sgd/fp16_momentum_sgd_op.h
+++ b/caffe2/sgd/fp16_momentum_sgd_op.h
@@ -35,10 +35,9 @@ class FP16MomentumSGDUpdateOp final : public Operator<Context> {
         fp32_update_(OperatorBase::GetSingleArgument<int>("fp32_update", 0)) {}
 
   bool RunOnDevice() override {
-    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
diff --git a/caffe2/sgd/fp32_momentum_sgd_op.h b/caffe2/sgd/fp32_momentum_sgd_op.h
index d94de7b7ac262..25ca516eeeeea 100644
--- a/caffe2/sgd/fp32_momentum_sgd_op.h
+++ b/caffe2/sgd/fp32_momentum_sgd_op.h
@@ -31,10 +31,9 @@ class FP32MomentumSGDUpdateOp final : public Operator<Context> {
         nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
 
   bool RunOnDevice() override {
-    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
diff --git a/caffe2/sgd/iter_op.h b/caffe2/sgd/iter_op.h
index 91709f47f3453..13681d7db341d 100644
--- a/caffe2/sgd/iter_op.h
+++ b/caffe2/sgd/iter_op.h
@@ -38,20 +38,19 @@ class IterOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     if (InputSize() == 0) {
-      LOG(INFO) << "[Input size is zero]";
-      if (!OperatorBase::OutputIsType<Tensor>(0, CPU)) {
+      if (!OperatorBase::OutputIsType<TensorCPU>(0)) {
         // This is the first run; set the iter to start with 0.
         LOG(ERROR) << "You are using an old definition of IterOp that will "
                       "be deprecated soon. More specifically, IterOp now "
                       "requires an explicit in-place input and output.";
 
-        auto* output = OperatorBase::Output<Tensor>(0, CPU);
+        auto* output = OperatorBase::Output<TensorCPU>(0);
         VLOG(1) << "Initializing iter counter.";
         output->Resize(1);
         output->template mutable_data<int64_t>()[0] = 0;
       }
     }
-    IncrementIter(OperatorBase::Output<Tensor>(0, CPU));
+    IncrementIter(OperatorBase::Output<TensorCPU>(0));
     return true;
   }
 };
@@ -68,7 +67,7 @@ class AtomicIterOp final : public Operator<Context> {
   bool RunOnDevice() override {
     auto& mutex = OperatorBase::Input<std::unique_ptr<std::mutex>>(0);
     std::lock_guard<std::mutex> lg(*mutex);
-    IncrementIter(OperatorBase::Output<Tensor>(0, CPU));
+    IncrementIter(OperatorBase::Output<TensorCPU>(0));
     CAFFE_EVENT(stats_, num_iter);
     return true;
   }
diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h
index bd813ce653dfb..0a47b6c5fd6d5 100644
--- a/caffe2/sgd/learning_rate_op.h
+++ b/caffe2/sgd/learning_rate_op.h
@@ -27,12 +27,12 @@ class LearningRateOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     int64_t iter =
-        OperatorBase::Input<Tensor>(0, CPU).template data<int64_t>()[0];
+        OperatorBase::Input<TensorCPU>(0).template data<int64_t>()[0];
     T learning_rate = cur_base_lr_ * (*functor_)(iter);
     // Write to output.
     auto* output = Output(0);
     output->Resize(vector<TIndex>());
-    context_.template CopyFromCPU<T>(
+    context_.template Copy<T, CPUContext, Context>(
         1, &learning_rate, Output(0)->template mutable_data<T>());
     return true;
   }
diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h
index f3f75f642164e..23da3d420c82b 100644
--- a/caffe2/sgd/momentum_sgd_op.h
+++ b/caffe2/sgd/momentum_sgd_op.h
@@ -45,10 +45,9 @@ class MomentumSGDOp final : public Operator<Context> {
         nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
 
   bool RunOnDevice() override {
-    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
     CAFFE_ENFORCE(Input(LR).size() == 1);
     CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
@@ -85,10 +84,9 @@ class MomentumSGDUpdateOp final : public Operator<Context> {
         nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
 
   bool RunOnDevice() override {
-    auto device_type = Context::GetDeviceType();
     // Iter live on the CPU
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(GRAD, device_type));
-    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(MOMENTUM, device_type));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(GRAD));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor<Context>>(MOMENTUM));
     CAFFE_ENFORCE_EQ(Input(LR).size(), 1);
     CAFFE_ENFORCE_EQ(Input(GRAD).size(), Input(MOMENTUM).size());
     Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
diff --git a/caffe2/sgd/yellowfin_op.h b/caffe2/sgd/yellowfin_op.h
index 06ecc177c8b69..02403ea53692d 100644
--- a/caffe2/sgd/yellowfin_op.h
+++ b/caffe2/sgd/yellowfin_op.h
@@ -126,21 +126,21 @@ CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
 CAFFE2_YF_READ_INPUT(GRAD, grad)
 #undef CAFFE2_YF_READ_OUTPUT
 
-CAFFE_ENFORCE(OperatorBase::InputIsType<Tensor>(ITER, CPU));
-CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1);
-CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1);
-CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim());
-CAFFE_ENFORCE_EQ(param_tensor.ndim(), g_avg_tensor.ndim());
-CAFFE_ENFORCE_EQ(param_tensor.ndim(), g2_avg_tensor.ndim());
-CAFFE_ENFORCE_EQ(param_tensor.ndim(), grad_tensor.ndim());
-for (int i = 0; i < param_tensor.ndim(); ++i) {
-  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
-  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
-  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
-  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
+    CAFFE_ENFORCE(OperatorBase::InputIsType<TensorCPU>(ITER));
+    CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1);
+    CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1);
+    CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim());
+    CAFFE_ENFORCE_EQ(param_tensor.ndim(), g_avg_tensor.ndim());
+    CAFFE_ENFORCE_EQ(param_tensor.ndim(), g2_avg_tensor.ndim());
+    CAFFE_ENFORCE_EQ(param_tensor.ndim(), grad_tensor.ndim());
+    for (int i = 0; i < param_tensor.ndim(); ++i) {
+      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
+      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
+      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
+      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
     }
 
-    iter_ = OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
+    iter_ = OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
 
     D_ = param_tensor.size();
 
@@ -229,8 +229,8 @@ for (int i = 0; i < param_tensor.ndim(); ++i) {
   int D_;
 
 // Temporary memory on device, listed all variables used in calculations
-#define CAFFE2_YF_DEFINE_TENSOR(NAME)              \
-  Tensor NAME##_tensor_{Context::GetDeviceType()}; \
+#define CAFFE2_YF_DEFINE_TENSOR(NAME) \
+  Tensor<Context> NAME##_tensor_;     \
   T* NAME##_;
 
   CAFFE2_YF_DEFINE_TENSOR(aux_vector)
@@ -255,7 +255,7 @@ for (int i = 0; i < param_tensor.ndim(); ++i) {
   CAFFE2_YF_DEFINE_TENSOR(mu_deb)
   CAFFE2_YF_DEFINE_TENSOR(variance)
 
-  Tensor scratch_tensor_{Context::GetDeviceType()};
+  Tensor<Context> scratch_tensor_;
 
 #undef CAFFE2_YF_DEFINE_TENSOR
 
diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
index 616587a39fda1..6d42cf6f2bd5b 100644
--- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
+++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
@@ -438,9 +438,9 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
   }
 
   bool RunOnDeviceWithOrderNCHW() override {
-    const Tensor& X = Input(0);
+    const Tensor<CPUContext>& X = Input(0);
     auto& filter = Input(1);
-    Tensor* Y = Output(0);
+    Tensor<CPUContext>* Y = Output(0);
     const int N = X.dim32(0), C = X.dim32(1);
     CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
     const int M = filter.dim32(0);
@@ -536,7 +536,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
   }
 
  private:
-  Tensor bias_{CPU};
+  Tensor<CPUContext> bias_;
 };
 
 REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, DEPTHWISE_3x3, Depthwise3x3ConvOp);
diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
index 476930ce4f904..0a759c81aa9de 100644
--- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
+++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
@@ -19,7 +19,7 @@ void AddNoiseInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc
index 05c945106c52d..8e1a0b264c2e5 100644
--- a/caffe2/share/contrib/nnpack/conv_op.cc
+++ b/caffe2/share/contrib/nnpack/conv_op.cc
@@ -197,7 +197,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
   initNNPACK();
   pthreadpool_t pool = reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
 
-  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
+  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor<CPUContext>* buffer) {
     if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
       transformedFilters_.resize(group_);
 
@@ -231,11 +231,11 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
             (transformedFilterSize + sizeof(float) - 1) / sizeof(float);
 
         for (auto g = 0; g < group_; g++) {
-          transformedFilters_[g] = ws_->CreateBlob(
-                                          "__transformed_kernel_" +
-                                          to_string(__sync_fetch_and_add(
-                                              &precomputed_transform_id, 1)))
-                                       ->GetMutableTensor(CPU);
+          transformedFilters_[g] =
+              ws_->CreateBlob(
+                     "__transformed_kernel_" +
+                     to_string(__sync_fetch_and_add(&precomputed_transform_id, 1)))
+                  ->GetMutable<TensorCPU>();
           transformedFilters_[g]->Resize(transformedFilterElements);
 
           status = nnp_convolution_inference(
diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc
index ddc451264abca..c94faaa029c57 100644
--- a/caffe2/share/contrib/nnpack/nnpack_test.cc
+++ b/caffe2/share/contrib/nnpack/nnpack_test.cc
@@ -19,7 +19,7 @@ void AddNoiseInput(
   DeviceOption option;
   CPUContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = blob->GetMutable<TensorCPU>();
   tensor->Resize(shape);
 
   math::RandGaussian<float, CPUContext>(
diff --git a/caffe2/utils/filler.h b/caffe2/utils/filler.h
index 8312ed33ddd6f..a2aa32fb56db7 100644
--- a/caffe2/utils/filler.h
+++ b/caffe2/utils/filler.h
@@ -13,14 +13,14 @@ template <class Context_t>
 class TensorFiller {
  public:
   template <class Type>
-  void Fill(Tensor* tensor) const {
+  void Fill(Tensor<Context_t>* tensor) const {
     CAFFE_ENFORCE(context_, "context is null");
     CAFFE_ENFORCE(tensor, "tensor is null");
     auto min = static_cast<Type>(min_);
     auto max = static_cast<Type>(max_);
     CAFFE_ENFORCE_LE(min, max);
 
-    Tensor temp_tensor(shape_, Context_t::GetDeviceType());
+    Tensor<Context_t> temp_tensor(shape_);
     tensor->swap(temp_tensor);
     Type* data = tensor->template mutable_data<Type>();
     Context_t* context = static_cast<Context_t*>(context_);
diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc
index f962c20b58126..ae54faa4e628f 100644
--- a/caffe2/utils/hip/math_blas_hip_test.cc
+++ b/caffe2/utils/hip/math_blas_hip_test.cc
@@ -26,13 +26,13 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
   vector<int> shapeX{5, 10};
   vector<int> shapeW{10, 6};
   vector<int> shapeY{5, 6};
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
   tensorX->Resize(shapeX);
-  auto* tensorW = blobW->GetMutableTensor(HIP);
+  auto* tensorW = blobW->GetMutable<Tensor<HIPContext>>();
   tensorW->Resize(shapeW);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorX->size(), 50);
@@ -60,7 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -81,7 +81,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -102,7 +102,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -126,13 +126,13 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
   vector<int> shapeX{5, 10};
   vector<int> shapeW{6, 10};
   vector<int> shapeY{5, 6};
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
   tensorX->Resize(shapeX);
-  auto* tensorW = blobW->GetMutableTensor(HIP);
+  auto* tensorW = blobW->GetMutable<Tensor<HIPContext>>();
   tensorW->Resize(shapeW);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorX->size(), 50);
@@ -160,7 +160,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -181,7 +181,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -201,7 +201,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
@@ -225,13 +225,13 @@ TEST(MathROCBLASTest, GemvNoTrans) {
   vector<int> shapeA{5, 10};
   vector<int> shapeX{10};
   vector<int> shapeY{5};
-  auto* tensorA = blobA->GetMutableTensor(HIP);
+  auto* tensorA = blobA->GetMutable<Tensor<HIPContext>>();
   tensorA->Resize(shapeA);
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
   tensorX->Resize(shapeX);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorA->size(), 50);
@@ -256,7 +256,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -274,7 +274,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -292,7 +292,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -315,13 +315,13 @@ TEST(MathROCBLASTest, GemvTrans) {
   vector<int> shapeA{6, 10};
   vector<int> shapeX{6};
   vector<int> shapeY{10};
-  auto* tensorA = blobA->GetMutableTensor(HIP);
+  auto* tensorA = blobA->GetMutable<Tensor<HIPContext>>();
   tensorA->Resize(shapeA);
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = blobX->GetMutable<Tensor<HIPContext>>();
   tensorX->Resize(shapeX);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = blobY->GetMutable<Tensor<HIPContext>>();
   tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = blobY_host->GetMutable<Tensor<CPUContext>>();
   tensorY_host->Resize(shapeY);
 
   EXPECT_EQ(tensorA->size(), 60);
@@ -346,7 +346,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
@@ -364,7 +364,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
@@ -382,7 +382,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
+  tensorY_host->CopyFrom<HIPContext, HIPContext>(*tensorY, &context);
   context.FinishDeviceComputation();
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
diff --git a/caffe2/utils/hip/math_hip.cc b/caffe2/utils/hip/math_hip.cc
index 59e93a7c8b4d5..dfe5beb72df8e 100644
--- a/caffe2/utils/hip/math_hip.cc
+++ b/caffe2/utils/hip/math_hip.cc
@@ -705,7 +705,7 @@ DEFINE_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
       const int N,                                                      \
       const T* src,                                                     \
       T* dst,                                                           \
-      Tensor* scratch_ptr,                                              \
+      Tensor<HIPContext>* scratch_ptr,                                  \
       HIPContext* context) {                                            \
     size_t memRequired = 0;                                             \
     cub::DeviceReduce::func(                                            \
@@ -1385,7 +1385,7 @@ void Dot<float, HIPContext>(
   float result;
   ROCBLAS_ENFORCE(
       rocblas_sdot(context->rocblas_handle(), n, a, 1, b, 1, &result));
-  context->CopyFromCPU<float>(1, &result, y);
+  context->Copy<float, CPUContext, HIPContext>(1, &result, y);
 }
 
 template <>
@@ -1474,7 +1474,7 @@ void SumGenericIter(
     IterT it,
     T*& dest,
     HIPContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<HIPContext>* scratch_ptr) {
   size_t memRequired = 0;
   cub::DeviceReduce::Sum(
       nullptr, memRequired, it, dest, N, context->hip_stream());
@@ -1503,7 +1503,7 @@ void Sum<float, HIPContext>(
     const float* x,
     float* y,
     HIPContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<HIPContext>* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<float>(N, x, y, context, scratch_ptr);
   } else {
@@ -1526,7 +1526,7 @@ void Sum<int32_t, HIPContext>(
     const int32_t* x,
     int32_t* y,
     HIPContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<HIPContext>* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<int32_t>(N, x, y, context, scratch_ptr);
   } else {
@@ -1559,7 +1559,7 @@ struct FloatTransform {
       const T* x,                                                         \
       T* y,                                                               \
       HIPContext* context,                                                \
-      Tensor* scratch_ptr) {                                              \
+      Tensor<HIPContext>* scratch_ptr) {                                  \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {                \
       FloatTransform<T> transform;                                        \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \
@@ -1606,7 +1606,7 @@ void SumSqr<float, HIPContext>(
     const float* x,
     float* y,
     HIPContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<HIPContext>* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SqrTransform<float> transform;
     cub::TransformInputIterator<float, SqrTransform<float>, const float*> it(
@@ -1633,7 +1633,7 @@ void SumSqr<float, HIPContext>(
       const T* x,                                                     \
       T* y,                                                           \
       HIPContext* context,                                            \
-      Tensor* scratch_ptr) {                              \
+      Tensor<HIPContext>* scratch_ptr) {                              \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {            \
       FloatTransform<T> float_transform;                              \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*> \
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index b25dffc7a723b..75ae1778aa3c8 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -19,6 +19,7 @@ extern "C" {
 
 namespace caffe2 {
 
+template <class Context>
 class Tensor;
 
 // An empty class as a placeholder for a math function that has no specific
@@ -167,7 +168,7 @@ void ReduceMin(
     const int N,
     const T* x,
     T* y,
-    Tensor* scratch_ptr,
+    Tensor<Context>* scratch_ptr,
     Context* context);
 
 template <typename T, class Context>
@@ -175,7 +176,7 @@ void ReduceMax(
     const int N,
     const T* x,
     T* y,
-    Tensor* scratch_ptr,
+    Tensor<Context>* scratch_ptr,
     Context* context);
 
 template <typename T, class Context>
@@ -440,7 +441,7 @@ void Sum(
     const T* x,
     T* y,
     Context* context,
-    Tensor* scratch_ptr = nullptr);
+    Tensor<Context>* scratch_ptr = nullptr);
 
 // Sum of squares of vector x, and writes the result to a single value y.
 template <typename T, class Context>
@@ -449,7 +450,7 @@ void SumSqr(
     const T* x,
     T* y,
     Context* context,
-    Tensor* scratch_ptr = nullptr);
+    Tensor<Context>* scratch_ptr = nullptr);
 
 // Select does index selection of the rows a N*D matrix x, and gives the N
 // dimensional vector y that contains the selected data.
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
index e01af7bf2f88d..9290fcd4260eb 100644
--- a/caffe2/utils/math_cpu.cc
+++ b/caffe2/utils/math_cpu.cc
@@ -807,28 +807,28 @@ DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)
 // Eigen or via custom code.
 ////////////////////////////////////////////////////////////////////////////////
 
-#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
-  template <>                           \
-  void ReduceMin<T, CPUContext>(        \
-      const int N,                      \
-      const T* x,                       \
-      T* y,                             \
-      Tensor* /*scratch_ptr*/,          \
-      CPUContext* /*context*/) {        \
-    *y = *std::min_element(x, x + N);   \
+#define CAFFE2_SPECIALIZED_REDUCEMIN(T)    \
+  template <>                              \
+  void ReduceMin<T, CPUContext>(           \
+      const int N,                         \
+      const T* x,                          \
+      T* y,                                \
+      Tensor<CPUContext>* /*scratch_ptr*/, \
+      CPUContext* /*context*/) {           \
+    *y = *std::min_element(x, x + N);      \
   }
 CAFFE2_SPECIALIZED_REDUCEMIN(float)
 #undef CAFFE2_SPECIALIZED_REDUCEMIN
 
-#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
-  template <>                           \
-  void ReduceMax<T, CPUContext>(        \
-      const int N,                      \
-      const T* x,                       \
-      T* y,                             \
-      Tensor* /*scratch_ptr*/,          \
-      CPUContext* /*context*/) {        \
-    *y = *std::max_element(x, x + N);   \
+#define CAFFE2_SPECIALIZED_REDUCEMAX(T)    \
+  template <>                              \
+  void ReduceMax<T, CPUContext>(           \
+      const int N,                         \
+      const T* x,                          \
+      T* y,                                \
+      Tensor<CPUContext>* /*scratch_ptr*/, \
+      CPUContext* /*context*/) {           \
+    *y = *std::max_element(x, x + N);      \
   }
 CAFFE2_SPECIALIZED_REDUCEMAX(float)
 CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
@@ -1899,7 +1899,7 @@ void RandGaussian<float, CPUContext>(
       const T* x,                            \
       T* y,                                  \
       CPUContext* /* unused */,              \
-      Tensor* /* unused */) {                \
+      Tensor<CPUContext>* /* unused */) {    \
     *y = ConstEigenVectorMap<T>(x, N).sum(); \
   }
 
@@ -1915,7 +1915,7 @@ void SumSqr<float, CPUContext>(
     const float* x,
     float* y,
     CPUContext* /*context*/ /* unused */,
-    Tensor* /*scratch_ptr*/ /* unused */) {
+    Tensor<CPUContext>* /*scratch_ptr*/ /* unused */) {
   *y = ConstEigenVectorMap<float>(x, N).squaredNorm();
 }
 
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 40b75e5d5732a..94d3233886360 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -639,7 +639,7 @@ DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
       const int N,                                                      \
       const T* src,                                                     \
       T* dst,                                                           \
-      Tensor* scratch_ptr,                                              \
+      Tensor<CUDAContext>* scratch_ptr,                                 \
       CUDAContext* context) {                                           \
     size_t memRequired = 0;                                             \
     cub::DeviceReduce::func(                                            \
@@ -1627,7 +1627,7 @@ void Dot<float, CUDAContext>(
     CUDAContext* context) {
   float result;
   CUBLAS_ENFORCE(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, &result));
-  context->CopyFromCPU<float>(1, &result, y);
+  context->Copy<float, CPUContext, CUDAContext>(1, &result, y);
 }
 
 template <>
@@ -1713,7 +1713,7 @@ void SumGenericIter(
     IterT it,
     T*& dest,
     CUDAContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<CUDAContext>* scratch_ptr) {
   size_t memRequired = 0;
   cub::DeviceReduce::Sum(
       nullptr, memRequired, it, dest, N, context->cuda_stream());
@@ -1742,7 +1742,7 @@ void Sum<float, CUDAContext>(
     const float* x,
     float* y,
     CUDAContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<CUDAContext>* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<float>(N, x, y, context, scratch_ptr);
   } else {
@@ -1757,7 +1757,7 @@ void Sum<int32_t, CUDAContext>(
     const int32_t* x,
     int32_t* y,
     CUDAContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<CUDAContext>* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SumGenericIter<int32_t>(N, x, y, context, scratch_ptr);
   } else {
@@ -1782,7 +1782,7 @@ struct FloatTransform {
       const T* x,                                                         \
       T* y,                                                               \
       CUDAContext* context,                                               \
-      Tensor* scratch_ptr) {                                              \
+      Tensor<CUDAContext>* scratch_ptr) {                                 \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {                \
       FloatTransform<T> transform;                                        \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \
@@ -1814,7 +1814,7 @@ void SumSqr<float, CUDAContext>(
     const float* x,
     float* y,
     CUDAContext* context,
-    Tensor* scratch_ptr) {
+    Tensor<CUDAContext>* scratch_ptr) {
   if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
     SqrTransform<float> transform;
     cub::TransformInputIterator<float, SqrTransform<float>, const float*> it(
@@ -1833,7 +1833,7 @@ void SumSqr<float, CUDAContext>(
       const T* x,                                                       \
       T* y,                                                             \
       CUDAContext* context,                                             \
-      Tensor* scratch_ptr) {                                            \
+      Tensor<CUDAContext>* scratch_ptr) {                               \
     if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {              \
       FloatTransform<T> float_transform;                                \
       cub::TransformInputIterator<float, FloatTransform<T>, const T*>   \
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index eaf3ef1aac212..330f34181918c 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -41,9 +41,9 @@ void executeGpuBinaryOpTest(
   Blob* bloby = ws.CreateBlob("Y");
   Blob* bloby_host = ws.CreateBlob("Y_host");
 
-  auto* tensorx0 = blobx0->GetMutableTensor(CUDA);
-  auto* tensorx1 = blobx1->GetMutableTensor(CUDA);
-  auto* tensory = bloby->GetMutableTensor(CUDA);
+  auto* tensorx0 = blobx0->GetMutable<Tensor<CUDAContext>>();
+  auto* tensorx1 = blobx1->GetMutable<Tensor<CUDAContext>>();
+  auto* tensory = bloby->GetMutable<Tensor<CUDAContext>>();
 
   vector<int> shapex0_vector{shapex0};
   vector<int> shapex1_vector{shapex1};
@@ -71,8 +71,8 @@ void executeGpuBinaryOpTest(
   context.FinishDeviceComputation();
 
   // Copy result to CPU so we can inspect it
-  auto* tensory_host = bloby_host->GetMutableTensor(CPU);
-  tensory_host->CopyFrom(*tensory, &context);
+  auto* tensory_host = bloby_host->GetMutable<Tensor<CPUContext>>();
+  tensory_host->CopyFrom<CUDAContext, CUDAContext>(*tensory, &context);
   context.FinishDeviceComputation();
 
   for (int i = 0; i < shapey; ++i) {
@@ -94,7 +94,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   vector<int> shapex{33 * 9, 25};
   vector<int> shapey{33, 25};
 
-  auto* tensorx = blobx->GetMutableTensor(CUDA);
+  auto* tensorx = blobx->GetMutable<Tensor<CUDAContext>>();
   tensorx->Resize(shapex);
   int stripe = 33 * 25;
   vector<float> tot(33, 0.0);
@@ -110,7 +110,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
     }
   }
 
-  auto* tensory = bloby->GetMutableTensor(CUDA);
+  auto* tensory = bloby->GetMutable<Tensor<CUDAContext>>();
   tensory->Resize(shapey);
   math::Set<float, CUDAContext>(
       stripe, 0.0, tensory->mutable_data<float>(), &context);
@@ -125,8 +125,8 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   context.FinishDeviceComputation();
 
   // Copy result to CPU so we can inspect it
-  auto* tensory_host = bloby_host->GetMutableTensor(CPU);
-  tensory_host->CopyFrom(*tensory, &context);
+  auto* tensory_host = bloby_host->GetMutable<Tensor<CPUContext>>();
+  tensory_host->CopyFrom<CUDAContext, CUDAContext>(*tensory, &context);
   context.FinishDeviceComputation();
 
   for (int k = 0; k < 33; k++) {
@@ -149,7 +149,7 @@ TEST(MathUtilGPUTest, testReduceMin) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor aux(CUDA);
+        Tensor<CUDAContext> aux;
         math::ReduceMin<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 11.0f; });
@@ -165,7 +165,7 @@ TEST(MathUtilGPUTest, testReduceMin) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor aux(CUDA);
+        Tensor<CUDAContext> aux;
         math::ReduceMin<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 11.0f; });
@@ -184,7 +184,7 @@ TEST(MathUtilGPUTest, testReduceMax) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor aux(CUDA);
+        Tensor<CUDAContext> aux;
         math::ReduceMax<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 11.0f; });
@@ -200,7 +200,7 @@ TEST(MathUtilGPUTest, testReduceMax) {
          const float* /*src1*/,
          float* dst,
          CUDAContext* context) {
-        Tensor aux(CUDA);
+        Tensor<CUDAContext> aux;
         math::ReduceMax<float, CUDAContext>(N0, src0, dst, &aux, context);
       },
       [](int /*i*/) { return 17.0f; });
@@ -258,9 +258,9 @@ class GemmBatchedGPUTest
     Blob* X_blob = ws_.CreateBlob("X");
     Blob* W_blob = ws_.CreateBlob("W");
     Blob* Y_blob = ws_.CreateBlob("Y");
-    X_ = X_blob->GetMutableTensor(CUDA);
-    W_ = W_blob->GetMutableTensor(CUDA);
-    Y_ = Y_blob->GetMutableTensor(CUDA);
+    X_ = X_blob->GetMutable<Tensor<CUDAContext>>();
+    W_ = W_blob->GetMutable<Tensor<CUDAContext>>();
+    Y_ = Y_blob->GetMutable<Tensor<CUDAContext>>();
     X_->Resize(std::vector<TIndex>{3, 5, 10});
     W_->Resize(std::vector<TIndex>{3, 6, 10});
     Y_->Resize(std::vector<TIndex>{3, 5, 6});
@@ -326,7 +326,7 @@ class GemmBatchedGPUTest
   }
 
   void VerifyOutput(const float value) const {
-    Tensor Y_cpu(*Y_, CPU);
+    TensorCPU Y_cpu(*Y_);
     for (int i = 0; i < Y_cpu.size(); ++i) {
       EXPECT_FLOAT_EQ(value, Y_cpu.template data<float>()[i]);
     }
@@ -335,9 +335,9 @@ class GemmBatchedGPUTest
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor* X_ = nullptr;
-  Tensor* W_ = nullptr;
-  Tensor* Y_ = nullptr;
+  Tensor<CUDAContext>* X_ = nullptr;
+  Tensor<CUDAContext>* W_ = nullptr;
+  Tensor<CUDAContext>* Y_ = nullptr;
   bool trans_X_;
   bool trans_W_;
 };
@@ -381,8 +381,8 @@ class ReduceTensorGPUTest : public testing::Test {
     cuda_context_ = make_unique<CUDAContext>(option_);
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutableTensor(CUDA);
-    Y_ = blob_y->GetMutableTensor(CUDA);
+    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
+    Y_ = blob_y->GetMutable<Tensor<CUDAContext>>();
   }
 
   void SetUpData(
@@ -396,14 +396,14 @@ class ReduceTensorGPUTest : public testing::Test {
     X_->Resize(X_dims);
     Y_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->CopyFromCPU<float>(
+    cuda_context_->Copy<float, CPUContext, CUDAContext>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
+    auto* Y_host = blob_y_host->GetMutable<TensorCPU>();
+    Y_host->CopyFrom<CUDAContext, CUDAContext>(*Y_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
@@ -433,8 +433,8 @@ class ReduceTensorGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor* X_ = nullptr;
-  Tensor* Y_ = nullptr;
+  Tensor<CUDAContext>* X_ = nullptr;
+  Tensor<CUDAContext>* Y_ = nullptr;
 };
 
 TEST_F(ReduceTensorGPUTest, ReduceMinGPUTest) {
@@ -661,8 +661,8 @@ class BroadcastGPUTest : public testing::Test {
     cuda_context_ = make_unique<CUDAContext>(option_);
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutableTensor(CUDA);
-    Y_ = blob_y->GetMutableTensor(CUDA);
+    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
+    Y_ = blob_y->GetMutable<Tensor<CUDAContext>>();
   }
 
   void SetUpData(
@@ -672,14 +672,14 @@ class BroadcastGPUTest : public testing::Test {
     X_->Resize(X_dims);
     Y_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->CopyFromCPU<float>(
+    cuda_context_->Copy<float, CPUContext, CUDAContext>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
+    auto* Y_host = blob_y_host->GetMutable<TensorCPU>();
+    Y_host->CopyFrom<CUDAContext, CUDAContext>(*Y_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
@@ -707,8 +707,8 @@ class BroadcastGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor* X_ = nullptr;
-  Tensor* Y_ = nullptr;
+  Tensor<CUDAContext>* X_ = nullptr;
+  Tensor<CUDAContext>* Y_ = nullptr;
 };
 
 TEST_F(BroadcastGPUTest, BroadcastGPUFloatTest) {
@@ -737,9 +737,9 @@ class MomentsGPUTest : public testing::Test {
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_mean = ws_.CreateBlob("mean");
     Blob* blob_variance = ws_.CreateBlob("variance");
-    X_ = blob_x->GetMutableTensor(CUDA);
-    mean_ = blob_mean->GetMutableTensor(CUDA);
-    variance_ = blob_variance->GetMutableTensor(CUDA);
+    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
+    mean_ = blob_mean->GetMutable<Tensor<CUDAContext>>();
+    variance_ = blob_variance->GetMutable<Tensor<CUDAContext>>();
   }
 
   void SetUpData(
@@ -754,7 +754,7 @@ class MomentsGPUTest : public testing::Test {
     mean_->Resize(Y_dims);
     variance_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->CopyFromCPU<float>(
+    cuda_context_->Copy<float, CPUContext, CUDAContext>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
@@ -762,11 +762,12 @@ class MomentsGPUTest : public testing::Test {
       const std::vector<float>& mean_data,
       const std::vector<float>& variance_data) {
     Blob* blob_mean_host = ws_.CreateBlob("mean_host");
-    auto* mean_host = blob_mean_host->GetMutableTensor(CPU);
-    mean_host->CopyFrom(*mean_, cuda_context_.get());
+    auto* mean_host = blob_mean_host->GetMutable<TensorCPU>();
+    mean_host->CopyFrom<CUDAContext, CUDAContext>(*mean_, cuda_context_.get());
     Blob* blob_variance_host = ws_.CreateBlob("variance_host");
-    auto* variance_host = blob_variance_host->GetMutableTensor(CPU);
-    variance_host->CopyFrom(*variance_, cuda_context_.get());
+    auto* variance_host = blob_variance_host->GetMutable<TensorCPU>();
+    variance_host->CopyFrom<CUDAContext, CUDAContext>(
+        *variance_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
 
     ASSERT_EQ(mean_data.size(), mean_host->size());
@@ -801,9 +802,9 @@ class MomentsGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor* X_ = nullptr;
-  Tensor* mean_ = nullptr;
-  Tensor* variance_ = nullptr;
+  Tensor<CUDAContext>* X_ = nullptr;
+  Tensor<CUDAContext>* mean_ = nullptr;
+  Tensor<CUDAContext>* variance_ = nullptr;
 };
 
 TEST_F(MomentsGPUTest, MomentsGPUFloatTest) {
@@ -864,8 +865,8 @@ class TransposeGPUTest : public testing::Test {
     cuda_context_ = make_unique<CUDAContext>(option_);
     Blob* blob_x = ws_.CreateBlob("X");
     Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutableTensor(CUDA);
-    Y_ = blob_y->GetMutableTensor(CUDA);
+    X_ = blob_x->GetMutable<Tensor<CUDAContext>>();
+    Y_ = blob_y->GetMutable<Tensor<CUDAContext>>();
   }
 
   void SetUpData(
@@ -880,14 +881,14 @@ class TransposeGPUTest : public testing::Test {
     X_->Resize(X_dims);
     Y_->Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_->size());
-    cuda_context_->CopyFromCPU<float>(
+    cuda_context_->Copy<float, CPUContext, CUDAContext>(
         X_data.size(), X_data.data(), X_->mutable_data<float>());
   }
 
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
+    auto* Y_host = blob_y_host->GetMutable<TensorCPU>();
+    Y_host->CopyFrom<CUDAContext, CUDAContext>(*Y_, cuda_context_.get());
     cuda_context_->FinishDeviceComputation();
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
@@ -915,8 +916,8 @@ class TransposeGPUTest : public testing::Test {
   Workspace ws_;
   DeviceOption option_;
   std::unique_ptr<CUDAContext> cuda_context_;
-  Tensor* X_ = nullptr;
-  Tensor* Y_ = nullptr;
+  Tensor<CUDAContext>* X_ = nullptr;
+  Tensor<CUDAContext>* Y_ = nullptr;
 };
 
 TEST_F(TransposeGPUTest, TransposeGPUFloatTest) {
diff --git a/caffe2/utils/math_test.cc b/caffe2/utils/math_test.cc
index ed08aedf1954c..6d3444553d51f 100644
--- a/caffe2/utils/math_test.cc
+++ b/caffe2/utils/math_test.cc
@@ -16,9 +16,9 @@ namespace caffe2 {
 TEST(MathTest, GemmNoTransNoTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  Tensor X(std::vector<int>{5, 10}, CPU);
-  Tensor W(std::vector<int>{10, 6}, CPU);
-  Tensor Y(std::vector<int>{5, 6}, CPU);
+  TensorCPU X(std::vector<int>{5, 10});
+  TensorCPU W(std::vector<int>{10, 6});
+  TensorCPU Y(std::vector<int>{5, 6});
   EXPECT_EQ(X.size(), 50);
   EXPECT_EQ(W.size(), 60);
   math::Set<float, CPUContext>(
@@ -91,9 +91,9 @@ TEST(MathTest, GemmNoTransNoTrans) {
 TEST(MathTest, GemmNoTransTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  Tensor X(std::vector<int>{5, 10}, CPU);
-  Tensor W(std::vector<int>{6, 10}, CPU);
-  Tensor Y(std::vector<int>{5, 6}, CPU);
+  TensorCPU X(std::vector<int>{5, 10});
+  TensorCPU W(std::vector<int>{6, 10});
+  TensorCPU Y(std::vector<int>{5, 6});
   EXPECT_EQ(X.size(), 50);
   EXPECT_EQ(W.size(), 60);
   math::Set<float, CPUContext>(
@@ -243,9 +243,9 @@ class GemmBatchedTest
 
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
-  Tensor X_{CPU};
-  Tensor W_{CPU};
-  Tensor Y_{CPU};
+  TensorCPU X_;
+  TensorCPU W_;
+  TensorCPU Y_;
   bool trans_X_;
   bool trans_W_;
 };
@@ -278,9 +278,9 @@ INSTANTIATE_TEST_CASE_P(
 TEST(MathTest, GemvNoTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  Tensor A(std::vector<int>{5, 10}, CPU);
-  Tensor X(std::vector<int>{10}, CPU);
-  Tensor Y(std::vector<int>{5}, CPU);
+  TensorCPU A(std::vector<int>{5, 10});
+  TensorCPU X(std::vector<int>{10});
+  TensorCPU Y(std::vector<int>{5});
   EXPECT_EQ(A.size(), 50);
   EXPECT_EQ(X.size(), 10);
   math::Set<float, CPUContext>(
@@ -344,9 +344,9 @@ TEST(MathTest, GemvNoTrans) {
 TEST(MathTest, GemvTrans) {
   DeviceOption option;
   CPUContext cpu_context(option);
-  Tensor A(std::vector<int>{6, 10}, CPU);
-  Tensor X(std::vector<int>{6}, CPU);
-  Tensor Y(std::vector<int>{10}, CPU);
+  TensorCPU A(std::vector<int>{6, 10});
+  TensorCPU X(std::vector<int>{6});
+  TensorCPU Y(std::vector<int>{10});
   EXPECT_EQ(A.size(), 60);
   EXPECT_EQ(X.size(), 6);
   math::Set<float, CPUContext>(
@@ -445,7 +445,7 @@ class ReduceTensorTest : public testing::Test {
     X_.Resize(X_dims);
     Y_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->CopyFromCPU<float>(
+    cpu_context_->Copy<float, CPUContext, CPUContext>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     reduce_func(
         X_dims.size(),
@@ -463,8 +463,8 @@ class ReduceTensorTest : public testing::Test {
 
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
-  Tensor X_{CPU};
-  Tensor Y_{CPU};
+  TensorCPU X_;
+  TensorCPU Y_;
 };
 
 TEST_F(ReduceTensorTest, ReduceMinTest) {
@@ -679,7 +679,7 @@ class BroadcastTest : public testing::Test {
     X_.Resize(X_dims);
     Y_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->CopyFromCPU<float>(
+    cpu_context_->Copy<float, CPUContext, CPUContext>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     math::Broadcast<float, CPUContext>(
         X_dims.size(),
@@ -698,8 +698,8 @@ class BroadcastTest : public testing::Test {
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
 
-  Tensor X_{CPU};
-  Tensor Y_{CPU};
+  TensorCPU X_;
+  TensorCPU Y_;
 };
 
 TEST_F(BroadcastTest, BroadcastFloatTest) {
@@ -735,7 +735,7 @@ class MomentsTest : public testing::Test {
     mean_.Resize(Y_dims);
     variance_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->CopyFromCPU<float>(
+    cpu_context_->Copy<float, CPUContext, CPUContext>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     math::Moments<float, CPUContext>(
         X_dims.size(),
@@ -759,9 +759,9 @@ class MomentsTest : public testing::Test {
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
 
-  Tensor X_{CPU};
-  Tensor mean_{CPU};
-  Tensor variance_{CPU};
+  TensorCPU X_;
+  TensorCPU mean_;
+  TensorCPU variance_;
 };
 
 TEST_F(MomentsTest, MomentsFloatTest) {
@@ -828,7 +828,7 @@ class TransposeTest : public testing::Test {
     X_.Resize(X_dims);
     Y_.Resize(Y_dims);
     ASSERT_EQ(X_data.size(), X_.size());
-    cpu_context_->CopyFromCPU<float>(
+    cpu_context_->Copy<float, CPUContext, CPUContext>(
         X_data.size(), X_data.data(), X_.mutable_data<float>());
     math::Transpose<float, CPUContext>(
         X_dims.size(),
@@ -846,8 +846,8 @@ class TransposeTest : public testing::Test {
   DeviceOption option_;
   std::unique_ptr<CPUContext> cpu_context_;
 
-  Tensor X_{CPU};
-  Tensor Y_{CPU};
+  TensorCPU X_;
+  TensorCPU Y_;
 };
 
 TEST_F(TransposeTest, TransposeFloatTest) {
diff --git a/caffe2/utils/smart_tensor_printer.cc b/caffe2/utils/smart_tensor_printer.cc
index b7fa7ef0d3365..feb669e4bbae0 100644
--- a/caffe2/utils/smart_tensor_printer.cc
+++ b/caffe2/utils/smart_tensor_printer.cc
@@ -33,7 +33,7 @@ struct ProxyPrinter {
         char>>::call(this, tensor->meta());
   }
 
-  const Tensor* tensor;
+  const Tensor<CPUContext>* tensor;
   TensorPrinter* tensorPrinter;
 };
 }
@@ -52,7 +52,7 @@ SmartTensorPrinter::SmartTensorPrinter(
     int limit)
     : tensorPrinter_(tensor_name, file_name, limit) {}
 
-void SmartTensorPrinter::Print(const Tensor& tensor) {
+void SmartTensorPrinter::Print(const Tensor<CPUContext>& tensor) {
   ProxyPrinter printer;
 
   printer.tensor = &tensor;
@@ -71,7 +71,7 @@ SmartTensorPrinter& SmartTensorPrinter::DefaultTensorPrinter() {
 #endif
 }
 
-void SmartTensorPrinter::PrintTensor(const Tensor& tensor) {
+void SmartTensorPrinter::PrintTensor(const Tensor<CPUContext>& tensor) {
   DefaultTensorPrinter().Print(tensor);
 }
 }
diff --git a/caffe2/utils/smart_tensor_printer.h b/caffe2/utils/smart_tensor_printer.h
index 224f7d91e0e98..f99226d696438 100644
--- a/caffe2/utils/smart_tensor_printer.h
+++ b/caffe2/utils/smart_tensor_printer.h
@@ -27,17 +27,19 @@ class SmartTensorPrinter {
       const std::string& file_name,
       int limit);
 
-  void Print(const Tensor& tensor);
+  void Print(const Tensor<CPUContext>& tensor);
 
-  void PrintMeta(const Tensor& tensor) {
+  template <class Context>
+  void PrintMeta(const Tensor<Context>& tensor) {
     tensorPrinter_.PrintMeta(tensor);
   }
 
   // Uses a default constructed SmartTensorPrinter
-  static void PrintTensor(const Tensor& tensor);
+  static void PrintTensor(const Tensor<CPUContext>& tensor);
 
   // Uses a default constructed SmartTensorPrinter
-  void PrintTensorMeta(const Tensor& tensor) {
+  template <class Context>
+  void PrintTensorMeta(const Tensor<Context>& tensor) {
     DefaultTensorPrinter().PrintMeta(tensor);
   }
 
diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc
index 82a59ad60aa95..d5681e2e0b07f 100644
--- a/caffe2/utils/smart_tensor_printer_test.cc
+++ b/caffe2/utils/smart_tensor_printer_test.cc
@@ -30,7 +30,7 @@ void printTensorAndCheck(const std::vector<T>& values) {
   testing::internal::CaptureStderr();
   CPUContext cpuContext;
 
-  Tensor tensor(
+  Tensor<CPUContext> tensor(
       std::vector<TIndex>{static_cast<TIndex>(values.size())},
       values,
       &cpuContext);
diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h
index 3034e1bd4adbe..216b039501d33 100644
--- a/caffe2/video/video_input_op.h
+++ b/caffe2/video/video_input_op.h
@@ -52,14 +52,14 @@ class VideoInputOp final : public PrefetchOperator<Context> {
 
   const db::DBReader* reader_;
   CPUContext cpu_context_;
-  Tensor prefetched_clip_rgb_{CPU};
-  Tensor prefetched_clip_of_{CPU};
-  Tensor prefetched_label_{CPU};
-  Tensor prefetched_video_id_{CPU};
-  Tensor prefetched_clip_rgb_on_device_{Context::GetDeviceType()};
-  Tensor prefetched_clip_of_on_device_{Context::GetDeviceType()};
-  Tensor prefetched_label_on_device_{Context::GetDeviceType()};
-  Tensor prefetched_video_id_on_device_{Context::GetDeviceType()};
+  TensorCPU prefetched_clip_rgb_;
+  TensorCPU prefetched_clip_of_;
+  TensorCPU prefetched_label_;
+  TensorCPU prefetched_video_id_;
+  Tensor<Context> prefetched_clip_rgb_on_device_;
+  Tensor<Context> prefetched_clip_of_on_device_;
+  Tensor<Context> prefetched_label_on_device_;
+  Tensor<Context> prefetched_video_id_on_device_;
   int batch_size_;
   int clip_per_video_;
   std::vector<float> mean_rgb_;
@@ -826,8 +826,7 @@ template <class Context>
 bool VideoInputOp<Context>::CopyPrefetched() {
   int index = 0;
   if (get_rgb_) {
-    auto* clip_rgb_output =
-        OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
+    auto* clip_rgb_output = OperatorBase::Output<Tensor<Context>>(index++);
     if (std::is_same<Context, CPUContext>::value) {
       clip_rgb_output->CopyFrom(prefetched_clip_rgb_, &context_);
     } else {
@@ -835,24 +834,21 @@ bool VideoInputOp<Context>::CopyPrefetched() {
     }
   }
   if (get_optical_flow_) {
-    auto* clip_of_output =
-        OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
+    auto* clip_of_output = OperatorBase::Output<Tensor<Context>>(index++);
     if (std::is_same<Context, CPUContext>::value) {
       clip_of_output->CopyFrom(prefetched_clip_of_, &context_);
     } else {
       clip_of_output->CopyFrom(prefetched_clip_of_on_device_, &context_);
     }
   }
-  auto* label_output =
-      OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
+  auto* label_output = OperatorBase::Output<Tensor<Context>>(index++);
   if (std::is_same<Context, CPUContext>::value) {
     label_output->CopyFrom(prefetched_label_, &context_);
   } else {
     label_output->CopyFrom(prefetched_label_on_device_, &context_);
   }
   if (get_video_id_) {
-    auto* video_id_output =
-        OperatorBase::Output<Tensor>(index, Context::GetDeviceType());
+    auto* video_id_output = OperatorBase::Output<Tensor<Context>>(index);
     if (std::is_same<Context, CPUContext>::value) {
       video_id_output->CopyFrom(prefetched_video_id_, &context_);
     } else {
diff --git a/modules/detectron/group_spatial_softmax_op.h b/modules/detectron/group_spatial_softmax_op.h
index 2109aca13fe80..6bced40dc0532 100644
--- a/modules/detectron/group_spatial_softmax_op.h
+++ b/modules/detectron/group_spatial_softmax_op.h
@@ -68,7 +68,7 @@ class GroupSpatialSoftmaxGradientOp final : public Operator<Context> {
  protected:
   int num_classes_;
   StorageOrder order_;
-  Tensor sum_probs_{Context::GetDeviceType()};
+  Tensor<Context> sum_probs_;
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/select_smooth_l1_loss_op.h b/modules/detectron/select_smooth_l1_loss_op.h
index 131be9e0993c5..04908ef3af3ad 100644
--- a/modules/detectron/select_smooth_l1_loss_op.h
+++ b/modules/detectron/select_smooth_l1_loss_op.h
@@ -45,7 +45,7 @@ class SelectSmoothL1LossOp final : public Operator<Context> {
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
   int dim_; // dimension for 1 anchor prediction
-  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
+  Tensor<Context> buff_; // Buffer for element-wise differences
 };
 
 template <typename T, class Context>
@@ -69,7 +69,7 @@ class SelectSmoothL1LossGradientOp final : public Operator<Context> {
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
   int dim_; // dimension for 1 anchor prediction
-  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
+  Tensor<Context> buff_; // Buffer for element-wise differences
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.h b/modules/detectron/sigmoid_cross_entropy_loss_op.h
index bb0e923ddb93d..34acd6886a716 100644
--- a/modules/detectron/sigmoid_cross_entropy_loss_op.h
+++ b/modules/detectron/sigmoid_cross_entropy_loss_op.h
@@ -44,9 +44,9 @@ class SigmoidCrossEntropyLossOp final : public Operator<Context> {
  protected:
   float scale_;
   int normalize_;
-  Tensor losses_{Context::GetDeviceType()};
-  Tensor counts_{Context::GetDeviceType()};
-  Tensor normalizer_{Context::GetDeviceType()};
+  Tensor<Context> losses_;
+  Tensor<Context> counts_;
+  Tensor<Context> normalizer_;
 };
 
 template <typename T, class Context>
@@ -69,8 +69,8 @@ class SigmoidCrossEntropyLossGradientOp final : public Operator<Context> {
  protected:
   float scale_;
   int normalize_;
-  Tensor counts_{Context::GetDeviceType()};
-  Tensor normalizer_{Context::GetDeviceType()};
+  Tensor<Context> counts_;
+  Tensor<Context> normalizer_;
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/sigmoid_focal_loss_op.h b/modules/detectron/sigmoid_focal_loss_op.h
index 2a07abc5afe72..d59df8f4b2fa1 100644
--- a/modules/detectron/sigmoid_focal_loss_op.h
+++ b/modules/detectron/sigmoid_focal_loss_op.h
@@ -47,8 +47,8 @@ class SigmoidFocalLossOp final : public Operator<Context> {
   int num_classes_;
   float gamma_;
   float alpha_;
-  Tensor losses_{Context::GetDeviceType()};
-  Tensor counts_{Context::GetDeviceType()};
+  Tensor<Context> losses_;
+  Tensor<Context> counts_;
 };
 
 template <typename T, class Context>
@@ -74,8 +74,8 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> {
   int num_classes_;
   float gamma_;
   float alpha_;
-  Tensor counts_{Context::GetDeviceType()};
-  Tensor weights_{Context::GetDeviceType()}; // unignored weights
+  Tensor<Context> counts_;
+  Tensor<Context> weights_; // unignored weights
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/smooth_l1_loss_op.h b/modules/detectron/smooth_l1_loss_op.h
index 013645ebc08ad..283be2eb73134 100644
--- a/modules/detectron/smooth_l1_loss_op.h
+++ b/modules/detectron/smooth_l1_loss_op.h
@@ -44,7 +44,7 @@ class SmoothL1LossOp final : public Operator<Context> {
  protected:
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
-  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
+  Tensor<Context> buff_; // Buffer for element-wise differences
 };
 
 template <typename T, class Context>
@@ -67,7 +67,7 @@ class SmoothL1LossGradientOp final : public Operator<Context> {
  protected:
   float beta_; // Transition point from L1 to L2 loss
   float scale_; // Scale the loss by scale_
-  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
+  Tensor<Context> buff_; // Buffer for element-wise differences
 };
 
 } // namespace caffe2
diff --git a/modules/detectron/softmax_focal_loss_op.h b/modules/detectron/softmax_focal_loss_op.h
index ac9b0e39a0780..98750dd189bf1 100644
--- a/modules/detectron/softmax_focal_loss_op.h
+++ b/modules/detectron/softmax_focal_loss_op.h
@@ -52,7 +52,7 @@ class SoftmaxFocalLossOp final : public Operator<Context> {
   float alpha_;
   int num_classes_;
   StorageOrder order_;
-  Tensor losses_{Context::GetDeviceType()};
+  Tensor<Context> losses_;
 };
 
 template <typename T, class Context>
@@ -83,7 +83,7 @@ class SoftmaxFocalLossGradientOp final : public Operator<Context> {
   float alpha_;
   int num_classes_;
   StorageOrder order_;
-  Tensor buff_{Context::GetDeviceType()};
+  Tensor<Context> buff_;
 };
 
 } // namespace caffe2