From 2158f4a9c891f8a52c13a08b28ee55341b6ac8c4 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Mon, 10 Sep 2018 11:29:40 -0700 Subject: [PATCH 001/237] add export import test to TestJitGenerated (#10982) Summary: Checking assertExportImport for all of the generated test jit tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10982 Differential Revision: D9636935 Pulled By: eellison fbshipit-source-id: f3f1ce77d454848098f2ac7e0fa18bf8564890be --- test/test_jit.py | 28 ++++++++++++++++++---------- torch/csrc/jit/export.cpp | 5 +++-- torch/csrc/jit/import.cpp | 2 ++ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 5eb7d4649bd763..342087e277292c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -351,8 +351,6 @@ def allSum(vs): return ge - -class TestJit(JitTestCase): def assertExportImport(self, trace, inputs): graph = trace if isinstance(trace, torch._C.Graph) else trace.graph() m = torch.jit.ScriptModule() @@ -361,6 +359,9 @@ def assertExportImport(self, trace, inputs): self.assertEqual(m.forward(*inputs), m_import.forward(*inputs)) + +class TestJit(JitTestCase): + def test_simple(self): x = torch.tensor([0.4], requires_grad=True) y = torch.tensor([0.7], requires_grad=True) @@ -7164,10 +7165,12 @@ def new_fn(*tensors_): return new_fn, [arg for arg in args if isinstance(arg, torch.Tensor)] -def create_traced_fn(fn): +# create a trace function from input fn +def create_traced_fn(self, fn): def traced_fn(*inputs, **kwargs): fn_tensors, inputs_tensors = partial_apply_nontensors(fn, inputs, **kwargs) traced = torch.jit.trace(fn_tensors, inputs_tensors) + self.assertExportImport(traced.graph, inputs_tensors) return traced(*inputs_tensors) return traced_fn @@ -7177,7 +7180,10 @@ def the_method({}): ''' -def create_script_fn(method_name, func_type, output_process_fn): +# create a script function from (name, func_type, output_process_fn), +# returns a function takes in (args, kwargs) and runs the compiled function and +# then applies the post process fn to the outputs +def create_script_fn(self, method_name, func_type, output_process_fn): def script_fn(*args, **kwargs): formals = [] tensors = [] @@ -7204,6 +7210,8 @@ def script_fn(*args, **kwargs): script = script_template.format(', '.join(formals), call) CU = torch.jit.CompilationUnit(script) + self.assertExportImport(CU.the_method.graph, tensors) + return output_process_fn(CU.the_method(*tensors)) return script_fn @@ -7272,7 +7280,7 @@ def clone_inputs(requires_grad): self.assertTrue(torch.allclose(g2, g2_test, atol=5e-4, rtol=1e-4)) -class TestJitGenerated(TestCase): +class TestJitGenerated(JitTestCase): pass @@ -7560,12 +7568,12 @@ def fn(*inputs, **kwargs): if not is_inplace and name not in EXCLUDE_GRADCHECK and not exclude_tensor_method(name, test_name): if test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(fn), + check_against_reference(self, create_traced_fn(self, fn), fn, (self_variable,) + args_variable, kwargs_variable) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(name, 'method', output_process_fn), + create_script_fn(self, name, 'method', output_process_fn), fn, (self_variable,) + args_variable, kwargs_variable) # functional interface tests @@ -7578,11 +7586,11 @@ def fn(*inputs, **kwargs): f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(fn), fn, f_args_variable, 
kwargs_variable) + check_against_reference(self, create_traced_fn(self, fn), fn, f_args_variable, kwargs_variable) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(name, 'functional', output_process_fn), + create_script_fn(self, name, 'functional', output_process_fn), fn, f_args_variable, kwargs_variable) check(name) @@ -7620,7 +7628,7 @@ def fn(*inputs, **kwargs): if test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(name, 'nn_functional', output_process_fn), + create_script_fn(self, name, 'nn_functional', output_process_fn), fn, f_args_variable, kwargs_variable) post_add_test(test_name, skipTestIf, do_test) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index e5e6cf1960df60..c4ef14f191f0ee 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -557,8 +557,9 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("IntType"); } else if (kind == TypeKind::NoneType) { type_proto->set_denotation("NoneType"); - } - else { + } else if (kind == TypeKind::GeneratorType) { + type_proto->set_denotation("GeneratorType"); + } else { throw std::runtime_error("unexpected type kind"); } } diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index ba6fe9fe0a27a4..f868f1ff555ad5 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -258,6 +258,8 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return IntType::get(); } else if (kind == "NoneType") { return NoneType::get(); + } else if (kind == "GeneratorType") { + return GeneratorType::get(); } else { throw std::runtime_error("unexpected string for type kind"); } From 040d75d455cb2963de9a147001f11f63459c0620 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 10 Sep 2018 12:30:42 -0700 Subject: [PATCH 002/237] Add option to use CUDA memory leak testing as a context manager (#11380) Summary: cc SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/11380 Reviewed By: ezyang Differential Revision: D9705877 Pulled By: zou3519 fbshipit-source-id: 02470c25236f57fa02f4ac9d7ed63d38a6355db2 --- test/common.py | 56 ++++++++++++++++++++++++++++++++--------------- test/test_cuda.py | 7 ++++++ 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/test/common.py b/test/common.py index e65795967deb7a..04986fcf5af57b 100644 --- a/test/common.py +++ b/test/common.py @@ -128,14 +128,6 @@ def dec(fn): return dec -def get_cuda_memory_usage(): - # we don't need CUDA synchronize because the statistics are not tracked at - # actual freeing, but at when marking the block as free. - num_devices = torch.cuda.device_count() - gc.collect() - return tuple(torch.cuda.memory_allocated(i) for i in range(num_devices)) - - def suppress_warnings(fn): @wraps(fn) def wrapper(*args, **kwargs): @@ -215,6 +207,38 @@ def is_iterable(obj): return False +class CudaMemoryLeakCheck(): + def __init__(self, testcase, name=None): + self.name = testcase.id() if name is None else name + self.testcase = testcase + + # initialize context & RNG to prevent false positive detections + # when the test is the first to initialize those + from common_cuda import initialize_cuda_context_rng + initialize_cuda_context_rng() + + @staticmethod + def get_cuda_memory_usage(): + # we don't need CUDA synchronize because the statistics are not tracked at + # actual freeing, but at when marking the block as free. 
+ num_devices = torch.cuda.device_count() + gc.collect() + return tuple(torch.cuda.memory_allocated(i) for i in range(num_devices)) + + def __enter__(self): + self.befores = self.get_cuda_memory_usage() + + def __exit__(self, exec_type, exec_value, traceback): + # Don't check for leaks if an exception was thrown + if exec_type is not None: + return + afters = self.get_cuda_memory_usage() + for i, (before, after) in enumerate(zip(self.befores, afters)): + self.testcase.assertEqual( + before, after, '{} leaked {} bytes CUDA memory on device {}'.format( + self.name, after - before, i)) + + class TestCase(unittest.TestCase): precision = 1e-5 maxDiff = None @@ -232,12 +256,12 @@ def __init__(self, method_name='runTest'): from common_cuda import TEST_CUDA fullname = self.id().lower() # class_name.method_name if TEST_CUDA and ('gpu' in fullname or 'cuda' in fullname): - # initialize context & RNG to prevent false positive detections - # when the test is the first to initialize those - from common_cuda import initialize_cuda_context_rng - initialize_cuda_context_rng() setattr(self, method_name, self.wrap_with_cuda_memory_check(test_method)) + def assertLeaksNoCudaTensors(self, name=None): + name = self.id() if name is None else name + return CudaMemoryLeakCheck(self, name) + def wrap_with_cuda_memory_check(self, method): # Assumes that `method` is the tested function in `self`. # NOTE: Python Exceptions (e.g., unittest.Skip) keeps objects in scope @@ -247,12 +271,8 @@ def wrap_with_cuda_memory_check(self, method): # call in try-finally and always do the check. @wraps(method) def wrapper(self, *args, **kwargs): - befores = get_cuda_memory_usage() - method(*args, **kwargs) - afters = get_cuda_memory_usage() - for i, (before, after) in enumerate(zip(befores, afters)): - self.assertEqual(before, after, '{} leaked {} bytes CUDA memory on device {}'.format( - self.id(), after - before, i)) + with self.assertLeaksNoCudaTensors(): + method(*args, **kwargs) return types.MethodType(wrapper, self) def setUp(self): diff --git a/test/test_cuda.py b/test/test_cuda.py index 1ca7155dd098c3..71e4a8220c6e68 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1955,6 +1955,13 @@ def leak_gpu1(): with self.assertRaisesRegex(AssertionError, r"leaked \d+ bytes CUDA memory on device 1"): leak_gpu1() + def test_cuda_memory_leak_detection_propagates_errors(self): + with self.assertRaisesRegex(RuntimeError, r"The size of tensor a \(3\) must match"): + with self.assertLeaksNoCudaTensors(): + x = torch.randn(3, 1, device='cuda') + y = torch.randn(2, 1, device='cuda') + z = x + y + def load_ignore_file(): from os.path import join, dirname From ce6906b05116a0ef9dfd3e52284c0fcf77186129 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:22 -0700 Subject: [PATCH 003/237] Narrowing Blob (#11167) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11167 Narrow the Blob API as preparation for merging Blob/IValue - get rid of templated IsType and Operator::InputIsType / OutputIsType - Use 'using' instead of 'typedef' for DestroyCall (just for readability) Reviewed By: ezyang Differential Revision: D9623916 fbshipit-source-id: 952f0b0cf5a525094b02e8d2798dd57a56a9e1d8 --- .../contrib/tensorrt/tensorrt_tranformer.cc | 4 ++-- caffe2/core/blob.h | 24 +++++++------------ caffe2/core/blob_gpu_test.cc | 6 ++--- caffe2/core/blob_test.cc | 16 ++++++------- caffe2/core/operator.h | 22 +++++------------ caffe2/ideep/operators/concat_split_op.cc | 2 +- 
.../ideep/operators/operator_fallback_ideep.h | 2 +- caffe2/ideep/operators/utility_ops.cc | 2 +- caffe2/mkl/operators/operator_fallback_mkl.h | 2 +- caffe2/mkl/operators/packed_fc_op.cc | 2 +- caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm | 2 +- caffe2/mpi/mpi_ops.h | 2 +- caffe2/observers/profile_observer_gpu.cc | 8 +++---- caffe2/operators/ensure_cpu_output_op.h | 4 ++-- caffe2/operators/if_op.h | 2 +- caffe2/operators/onnxifi_op.cc | 2 +- caffe2/operators/operator_fallback_gpu.h | 4 ++-- caffe2/operators/string_ops_test.cc | 2 +- caffe2/operators/stylizer_ops.cc | 2 +- caffe2/operators/utility_ops.h | 8 +++---- caffe2/operators/while_op.h | 2 +- caffe2/predictor/predictor.cc | 2 +- caffe2/sgd/adam_op.h | 2 +- caffe2/sgd/fp16_momentum_sgd_op.h | 4 ++-- caffe2/sgd/fp32_momentum_sgd_op.h | 4 ++-- caffe2/sgd/iter_op.h | 2 +- caffe2/sgd/momentum_sgd_op.h | 8 +++---- caffe2/sgd/yellowfin_op.h | 2 +- 28 files changed, 64 insertions(+), 80 deletions(-) diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 716ff7a70814e0..3612d8b46f1f8d 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (blob->template IsType(CPU)) { + if (blob->IsTensorType(CPU)) { const auto& cpu_tensor = blob->template Get(); CPUTensorToTensorProto(cpu_tensor, t); - } else if (blob->template IsType(CUDA)) { + } else if (blob->IsTensorType(CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index f085ee23995bd7..1df7bc9beafcb8 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -23,9 +23,9 @@ namespace caffe2 { * properly when the blob is deallocated or re-allocated with a new type. A blob * could contain anything, although the most common case is to contain a Tensor. */ -class CAFFE2_API Blob { +class CAFFE2_API Blob final { public: - typedef void (*DestroyCall)(void*); + using DestroyCall = void(void*); /** * Initializes an empty Blob. @@ -63,14 +63,8 @@ class CAFFE2_API Blob { return meta_.Match(); } - // TODO(jerryzh): Remove template - template - bool IsType(DeviceType device_type) const { - static_assert( - std::is_same::value, - "IsType(DeviceType) only available on " - "Tensor types."); - bool is_match = meta_.Match(); + bool IsTensorType(DeviceType device_type) const { + bool is_match = meta_.Match(); auto* tensor = static_cast(pointer_); if (is_match && tensor && tensor->GetDeviceType() == device_type) { return true; @@ -149,7 +143,7 @@ class CAFFE2_API Blob { } inline Tensor* GetMutableTensor(DeviceType device_type) { - if (IsType(device_type)) { + if (IsTensorType(device_type)) { return static_cast(pointer_); } else { VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() @@ -178,7 +172,7 @@ class CAFFE2_API Blob { } inline void* - Reset(void* allocated, const TypeMeta& meta, const DestroyCall& destroy) { + Reset(void* allocated, const TypeMeta& meta, DestroyCall* destroy) { if (pointer_ && destroy_) { destroy_(pointer_); } @@ -192,8 +186,8 @@ class CAFFE2_API Blob { * Releases the ownership, if any, this Blob has on the underlying pointer. 
* The user is then responsible for freeing the data if needed */ - inline DestroyCall Release() { - DestroyCall d = destroy_; + inline DestroyCall* Release() { + DestroyCall* d = destroy_; destroy_ = nullptr; return d; } @@ -289,7 +283,7 @@ class CAFFE2_API Blob { } TypeMeta meta_; void* pointer_ = nullptr; - DestroyCall destroy_ = nullptr; + DestroyCall* destroy_ = nullptr; AT_DISABLE_COPY_AND_ASSIGN(Blob); }; diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 0f0fcb54906dcd..29bf0c3bc52ae6 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -148,7 +148,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CUDA)); \ + EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(blob.Deserialize(serialized)); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString())); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 97d17c6e5924f3..8103071c81ee26 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); - EXPECT_TRUE(blob.IsType(CPU)); + EXPECT_TRUE(blob.IsTensorType(CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); 
EXPECT_EQ(new_tensor.dim(0), 2); @@ -724,7 +724,7 @@ TEST(TensorTest, float16) { } Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(new_blob->IsType(CPU)); + EXPECT_TRUE(new_blob->IsTensorType(CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 2def93f0b51d08..e75681ff3a9df6 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -143,36 +143,26 @@ class CAFFE2_API OperatorBase : public Observable { inline bool InputIsType(int idx) { static_assert( !std::is_same::value, - "You should use InputIsType(int, DeviceType) for " + "You should use InputIsTensorType(int, DeviceType) for " "Tensor."); return inputs_.at(idx)->template IsType(); } - template - inline bool InputIsType(int idx, DeviceType device_type) { - static_assert( - std::is_same::value, - "InputIsType(idx, DeviceType) only available on " - "Tensor types."); - return inputs_.at(idx)->template IsType(device_type); + inline bool InputIsTensorType(int idx, DeviceType device_type) { + return inputs_.at(idx)->IsTensorType(device_type); } template inline bool OutputIsType(int idx) { static_assert( !std::is_same::value, - "You should use OutputIsType(int, DeviceType) for " + "You should use OutputIsTensorType(int, DeviceType) for " "Tensor."); return outputs_.at(idx)->template IsType(); } - template - inline bool OutputIsType(int idx, DeviceType type) { - static_assert( - std::is_same::value, - "OutputIsType(idx, DeviceType) only available on " - "Tensor types."); - return outputs_.at(idx)->template IsType(type); + inline bool OutputIsTensorType(int idx, DeviceType type) { + return outputs_.at(idx)->IsTensorType(type); } inline int InputSize() const { diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 25d4e16d2f9e7a..311c6446184a87 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,7 +33,7 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 9ae2323442120a..08e6de2ae3f0dc 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 63bd0da7cb5cb6..626568a989b939 100644 --- 
a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (input_blob.template IsType(CPU)) { + if (input_blob.IsTensorType(CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h index ce4c85d2c231e0..6d9713b74612d8 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mkl/operators/packed_fc_op.cc b/caffe2/mkl/operators/packed_fc_op.cc index 1f3231dc521f5d..0ed93cf061070c 100644 --- a/caffe2/mkl/operators/packed_fc_op.cc +++ b/caffe2/mkl/operators/packed_fc_op.cc @@ -49,7 +49,7 @@ class PackedFCOp final : public Operator { // Check out what is the passed in format. const MKLPackedMatrix* packed_matrix = nullptr; - if (OperatorBase::InputIsType(1, CPU)) { + if (OperatorBase::InputIsTensorType(1, CPU)) { const auto& W = Input(1); CAFFE_ENFORCE_EQ(W.ndim(), 2); CAFFE_ENFORCE_EQ(W.dim32(0), N); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 755e1b5a57b8a9..2238d7af08dda6 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType(CPU) || + if (!noiseBlob->IsTensorType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h index b84de851a9e948..8657c107ed0f33 100644 --- a/caffe2/mpi/mpi_ops.h +++ b/caffe2/mpi/mpi_ops.h @@ -36,7 +36,7 @@ class MPIBroadcastOp final : public Operator { bool RunOnDevice() override { MPI_Comm comm = OperatorBase::Input(0).comm(); CAFFE_ENFORCE( - OperatorBase::OutputIsType(0, Context::GetDeviceType()), + OperatorBase::OutputIsTensorType(0, Context::GetDeviceType()), "Output is of wrong type."); auto* output = Output(0); // Make sure that output is already allocated. 
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index 5d40916be09346..bf4e20b7904711 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -26,12 +26,12 @@ void ProfileOperatorObserver::Dump() const { LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type() << " op#" << getId() << " ---------"; for (int i = 0; i < subject_->InputSize(); ++i) { - if (subject_->InputIsType(i, CPU)) { + if (subject_->InputIsTensorType(i, CPU)) { const auto& tensor = subject_->Input(i, CPU); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor); - } else if (subject_->InputIsType(i, CUDA)) { + } else if (subject_->InputIsTensorType(i, CUDA)) { const auto& tensor = subject_->Input(i, CUDA); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); @@ -46,12 +46,12 @@ void ProfileOperatorObserver::Dump() const { } for (int o = 0; o < subject_->OutputSize(); ++o) { - if (subject_->OutputIsType(o, CPU)) { + if (subject_->OutputIsTensorType(o, CPU)) { auto* tensor = subject_->Output(o, CPU); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor); - } else if (subject_->OutputIsType(o, CUDA)) { + } else if (subject_->OutputIsTensorType(o, CUDA)) { auto* tensor = subject_->Output(o, CUDA); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h index 08207644f7f094..3b8cb439f12066 100644 --- a/caffe2/operators/ensure_cpu_output_op.h +++ b/caffe2/operators/ensure_cpu_output_op.h @@ -15,9 +15,9 @@ class EnsureCPUOutputOp : public Operator { : Operator(operator_def, ws) {} bool RunOnDevice() override { - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return CopyWithContext(); - } else if (this->template InputIsType(0, Context::GetDeviceType())) { + } else if (this->InputIsTensorType(0, Context::GetDeviceType())) { // CUDA Context will go this branch return CopyWithContext(); } else { diff --git a/caffe2/operators/if_op.h b/caffe2/operators/if_op.h index cff2a620ef4694..e76fea0ee8f21a 100644 --- a/caffe2/operators/if_op.h +++ b/caffe2/operators/if_op.h @@ -32,7 +32,7 @@ class IfOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in If operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 4e9f6f2ac280f1..aee7fff4bc3391 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - blob->template IsType(CPU), + blob->IsTensorType(CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index b0ee9611a69042..8ef39e7c0e78d1 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -64,7 +64,7 @@ class GPUFallbackOpEx final : public Operator { bool RunOnDevice() override { bool need_sync = false; for (int i = 0; i < 
InputSize(); ++i) { - if (this->template InputIsType(i, CUDA)) { + if (this->InputIsTensorType(i, CUDA)) { local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( Input(i), &context_); need_sync = true; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index ece70ffd2425e1..c9ba13efb50258 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(output->IsType(CPU)); + EXPECT_TRUE(output->IsTensorType(CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index 8f1e0895a28596..a6d395fe9ba647 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,7 +82,7 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!noiseBlob->IsType(CPU)) { + if (!noiseBlob->IsTensorType(CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. auto* t = noiseBlob->GetMutableTensor(CPU); diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index c9564dfa74a86a..634281428bb898 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -91,8 +91,8 @@ class PrintOp final : public Operator { return true; } - if (!this->template InputIsType(0, Context::GetDeviceType()) && - !this->template InputIsType(0, CPU)) { + if (!this->InputIsTensorType(0, Context::GetDeviceType()) && + !this->InputIsTensorType(0, CPU)) { LOG(INFO) << "Blob of type: " << OperatorBase::Inputs().at(0)->meta().name(); return true; @@ -113,7 +113,7 @@ class PrintOp final : public Operator { unsigned char, std::string>; - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return DispatchHelper::call( this, this->template Input(0, CPU)); } else { @@ -129,7 +129,7 @@ class PrintOp final : public Operator { // will handle memory deallocation itself so no smart pointer is needed. 
const TensorCPU* tensor; Tensor tensor_copy_if_needed(CPU); - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { tensor = &this->template Input(0, CPU); } else { tensor_copy_if_needed.CopyFrom(Input(0), &context_); diff --git a/caffe2/operators/while_op.h b/caffe2/operators/while_op.h index 258862b690e4a6..3e90341bcdd7ef 100644 --- a/caffe2/operators/while_op.h +++ b/caffe2/operators/while_op.h @@ -35,7 +35,7 @@ class WhileOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in While operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index cb1c9028d5c12b..84dac93753d37a 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,7 +10,7 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - blob->template IsType(CPU), "Blob is not a CPU Tensor: ", name); + blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); } TensorCPU* getTensor(Workspace* ws, const std::string& name) { diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index dadf7f4ee22015..3cd301a8223009 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -88,7 +88,7 @@ class AdamOp final : public Operator { epsilon_(this->template GetSingleArgument("epsilon", 1e-5f)) {} bool RunOnDevice() override { // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(PARAM).size()); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENT_1).size()); diff --git a/caffe2/sgd/fp16_momentum_sgd_op.h b/caffe2/sgd/fp16_momentum_sgd_op.h index 7b1c68634de228..a8f3ce75c7fc2c 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.h +++ b/caffe2/sgd/fp16_momentum_sgd_op.h @@ -37,8 +37,8 @@ class FP16MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/fp32_momentum_sgd_op.h b/caffe2/sgd/fp32_momentum_sgd_op.h index 75907a63501da9..57ea18a097b099 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.h +++ b/caffe2/sgd/fp32_momentum_sgd_op.h @@ -33,8 +33,8 @@ class FP32MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/iter_op.h 
b/caffe2/sgd/iter_op.h index 91709f47f3453a..22ec8d252c455f 100644 --- a/caffe2/sgd/iter_op.h +++ b/caffe2/sgd/iter_op.h @@ -39,7 +39,7 @@ class IterOp final : public Operator { bool RunOnDevice() override { if (InputSize() == 0) { LOG(INFO) << "[Input size is zero]"; - if (!OperatorBase::OutputIsType(0, CPU)) { + if (!OperatorBase::OutputIsTensorType(0, CPU)) { // This is the first run; set the iter to start with 0. LOG(ERROR) << "You are using an old definition of IterOp that will " "be deprecated soon. More specifically, IterOp now " diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h index 6e79d5dbedc7da..c3f25c84c9b8aa 100644 --- a/caffe2/sgd/momentum_sgd_op.h +++ b/caffe2/sgd/momentum_sgd_op.h @@ -47,8 +47,8 @@ class MomentumSGDOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); @@ -87,8 +87,8 @@ class MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE_EQ(Input(LR).size(), 1); CAFFE_ENFORCE_EQ(Input(GRAD).size(), Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/yellowfin_op.h b/caffe2/sgd/yellowfin_op.h index 249f638bfac03d..94150413df1750 100644 --- a/caffe2/sgd/yellowfin_op.h +++ b/caffe2/sgd/yellowfin_op.h @@ -126,7 +126,7 @@ CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory) CAFFE2_YF_READ_INPUT(GRAD, grad) #undef CAFFE2_YF_READ_OUTPUT -CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); +CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim()); From 09292f2c03fa37eaa67651114d6b1b5943ec53d5 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:24 -0700 Subject: [PATCH 004/237] Some improvements to IValue (#11238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11238 - when moving an IValue, free the old value instead of keeping it allocated - making classes final - moving std::string - making ConstantList const Reviewed By: ezyang Differential Revision: D9644700 fbshipit-source-id: ab7228368e4f00f664ba54e1242b0307d91c5e7e --- torch/csrc/jit/ivalue.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index c8475b7ff86183..0cb821e99a505b 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -13,14 +13,14 @@ template using Shared = c10::intrusive_ptr; // string -struct TORCH_API ConstantString : c10::intrusive_ptr_target { +struct TORCH_API ConstantString final : c10::intrusive_ptr_target { private: const std::string str_; public: ConstantString(std::string 
str) : str_(std::move(str)) {} - static c10::intrusive_ptr create(const std::string str_) { - return c10::make_intrusive(str_); + static c10::intrusive_ptr create(std::string str_) { + return c10::make_intrusive(std::move(str_)); } const std::string & string() const { return str_; @@ -34,9 +34,9 @@ struct TORCH_API ConstantString : c10::intrusive_ptr_target { // non-mutable list template -struct TORCH_API ConstantList : c10::intrusive_ptr_target { +struct TORCH_API ConstantList final : c10::intrusive_ptr_target { private: - std::vector elements_; + const std::vector elements_; public: ConstantList(std::vector elements_) : elements_(std::move(elements_)) {} @@ -67,7 +67,7 @@ using DoubleList = ConstantList; #define TORCH_FORALL_TAGS(_) \ _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) -struct TORCH_API IValue { +struct TORCH_API IValue final { IValue() : payload(0) , tag(Tag::None) @@ -89,12 +89,12 @@ struct TORCH_API IValue { } } IValue & operator=(IValue && rhs) & noexcept { - rhs.swap(*this); + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None return *this; } IValue & operator=(IValue const & rhs) & { - IValue(rhs).swap(*this); - return *this; + IValue(rhs).swap(*this); + return *this; } void swap(IValue & rhs) noexcept { std::swap(payload, rhs.payload); @@ -173,7 +173,7 @@ struct TORCH_API IValue { IValue(c10::intrusive_ptr v); IValue(std::vector v); IValue(at::ArrayRef v) - : IValue(std::vector(v.begin(), v.end())) {} + : IValue(v.vec()) {} bool isIntList() const { return Tag::IntList == tag; } c10::intrusive_ptr toIntList() && { JIT_ASSERT(isIntList()); @@ -190,7 +190,7 @@ struct TORCH_API IValue { // ConstantString IValue(c10::intrusive_ptr v); - IValue(const std::string& v); + IValue(std::string v); bool isString() const { return Tag::String == tag; } c10::intrusive_ptr toString() && { JIT_ASSERT(isString()); @@ -369,8 +369,8 @@ inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { as_intrusive_ptr = v.release(); } -inline IValue::IValue(const std::string& v) -: IValue(ConstantString::create(v)) {} +inline IValue::IValue(std::string v) +: IValue(ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::DoubleList), is_intrusive_ptr(true) { From 252f93df091ad4c282f38312a42d0865a47e2de6 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:27 -0700 Subject: [PATCH 005/237] Improve Tensor() constructor (#11258) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11258 The two intrusive_ptr constructors in Tensor can be combined into one implementation that does both, moving and copying. 
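For reference, a minimal standalone sketch of the by-value-plus-move idiom this relies on (using std::shared_ptr and a toy Handle type of my own so it compiles on its own; the diff below applies the same idea to the c10::intrusive_ptr member of Tensor):

```cpp
#include <memory>
#include <utility>

// One constructor taking the smart pointer by value covers both call patterns:
// lvalue arguments copy-construct the parameter, rvalue arguments move-construct
// it, and the body then moves the parameter into the member either way.
struct Handle {
  Handle(std::shared_ptr<int> impl) : impl_(std::move(impl)) {}
  std::shared_ptr<int> impl_;
};

int main() {
  auto p = std::make_shared<int>(42);
  Handle copied(p);                        // lvalue caller: one copy into the parameter, then a move
  Handle moved(std::make_shared<int>(7));  // rvalue caller: moved all the way through
  return copied.impl_ && moved.impl_ ? 0 : 1;
}
```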
Reviewed By: ezyang Differential Revision: D9652088 fbshipit-source-id: 5efca02654ba305c99c20bbeb83551469d17a51d --- aten/src/ATen/templates/Tensor.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 77c18466404c83..b96b2cf13d86be 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -51,9 +51,7 @@ struct AT_API Tensor { c10::raw::intrusive_ptr::incref(tensor_impl); } } - Tensor(const c10::intrusive_ptr& ptr) - : tensor_impl_(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) + Tensor(c10::intrusive_ptr ptr) : tensor_impl_(std::move(ptr)) {} Tensor(const Tensor&) = default; From b0c139727112423fdf1f989b85da0e3e5af28650 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:29 -0700 Subject: [PATCH 006/237] Fix intrusive_ptr move/copy for different NullType's (#11260) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11260 This is needed to make something like this work: intrusive_ptr a = make_intrusive(...); Reviewed By: ezyang Differential Revision: D9652089 fbshipit-source-id: 19c65e98460ccb27bc69e36d7e558cb9d6e67615 --- aten/src/ATen/core/intrusive_ptr.h | 73 +++--------- aten/src/ATen/core/intrusive_ptr_test.cpp | 115 ++++++++++++++++++- aten/src/ATen/native/sparse/SparseTensor.cpp | 2 +- torch/csrc/autograd/variable.h | 10 +- 4 files changed, 133 insertions(+), 67 deletions(-) diff --git a/aten/src/ATen/core/intrusive_ptr.h b/aten/src/ATen/core/intrusive_ptr.h index 65c6b5e702f2a0..961915555a3756 100644 --- a/aten/src/ATen/core/intrusive_ptr.h +++ b/aten/src/ATen/core/intrusive_ptr.h @@ -119,6 +119,15 @@ struct AT_CORE_EXPORT intrusive_target_default_null_type final { return nullptr; } }; + +template +TTarget* assign_ptr_(TTarget* rhs) { + if (FromNullType::singleton() == rhs) { + return ToNullType::singleton(); + } else { + return rhs; + } +} } // namespace detail template @@ -191,17 +200,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr(intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -212,17 +214,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr( const intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -240,13 +235,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. 
intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr move assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -261,13 +249,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = rhs; swap(tmp); return *this; @@ -464,17 +445,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( weak_intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -486,17 +460,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( const weak_intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -515,13 +482,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -537,13 +497,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. 
weak_intrusive_ptr copy assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = rhs; swap(tmp); return *this; diff --git a/aten/src/ATen/core/intrusive_ptr_test.cpp b/aten/src/ATen/core/intrusive_ptr_test.cpp index df98563f1ff1fc..4d0701ebe164d7 100644 --- a/aten/src/ATen/core/intrusive_ptr_test.cpp +++ b/aten/src/ATen/core/intrusive_ptr_test.cpp @@ -59,6 +59,23 @@ class ChildDestructableMock final : public DestructableMock { ChildDestructableMock(bool* resourcesReleased, bool* wasDestructed) : DestructableMock(resourcesReleased, wasDestructed) {} }; +class NullType1 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType1::singleton_; +class NullType2 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType2::singleton_; +static_assert(NullType1::singleton() != NullType2::singleton(), ""); } // namespace static_assert( @@ -262,6 +279,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -359,6 +389,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -420,6 +463,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -482,6 +537,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, SwapFunction) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -1520,9 +1587,9 @@ weak_intrusive_ptr make_weak_only(Args&&... 
args) { auto intrusive = make_intrusive(std::forward(args)...); return weak_intrusive_ptr(intrusive); } -template -weak_intrusive_ptr make_invalid_weak() { - return weak_intrusive_ptr(intrusive_ptr()); +template > +weak_intrusive_ptr make_invalid_weak() { + return weak_intrusive_ptr(intrusive_ptr()); } } // namespace @@ -1752,6 +1819,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { @@ -1930,6 +2008,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { @@ -2014,6 +2103,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { @@ -2097,6 +2196,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST(WeakIntrusivePtrTest, SwapFunction) { IntrusiveAndWeak obj1 = make_weak_intrusive(); IntrusiveAndWeak obj2 = make_weak_intrusive(); diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index c96e577ba9d47a..b09feb0dc1e7dd 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -68,7 +68,7 @@ SparseTensor new_sparse(const SparseType& dtype) { } else { type_id = SparseCPUTensorId(); } - return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType()).release(), /* retain */ false); + return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType())); } /*** Helper methods ***/ diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 5a59afda5a8640..c5057711edfcd7 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -266,6 +266,7 @@ struct TORCH_API Variable : public at::Tensor { //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Variable(Variable::Impl* self, bool retain); + Variable(c10::intrusive_ptr self); Impl* get() const; }; @@ -417,7 +418,7 @@ inline Variable make_variable_view( Edge gradient_edge = Edge()) { if (data.defined()) { return Variable(c10::make_intrusive( - std::move(base), std::move(data), std::move(gradient_edge)).release(), false); + 
std::move(base), std::move(data), std::move(gradient_edge))); } return Variable(); } @@ -427,7 +428,7 @@ inline Variable make_variable(at::Tensor data, bool requires_grad = false) { !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - return Variable(c10::make_intrusive(data, requires_grad).release(), false); + return Variable(c10::make_intrusive(data, requires_grad)); } return Variable(); } @@ -437,7 +438,7 @@ inline Variable make_variable(at::Tensor data, Edge gradient_edge) { !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - return Variable(c10::make_intrusive(data, false, std::move(gradient_edge)).release(), false); + return Variable(c10::make_intrusive(data, false, std::move(gradient_edge))); } return Variable(); } @@ -597,6 +598,9 @@ inline PyObject* Variable::pyobj() const noexcept { inline Variable::Variable(Variable::Impl* self, bool retain) : at::Tensor(self, retain) {} +inline Variable::Variable(c10::intrusive_ptr self) + : at::Tensor(std::move(self)) {} + inline Variable::Impl* Variable::get() const { AT_CHECK(defined(), "Called Variable::get() on an undefined Variable"); return static_cast(tensor_impl_.get()); From 198ade74f929bb4f0d8b743a517aaeeaa219cadf Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:34 -0700 Subject: [PATCH 007/237] Remove manual refcounting from Tensor class (#11294) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11294 The Tensor(ptr, retain) constructor is error prone and circumvents the intrusive_ptr safety. This diff removes that and pushes the responsibility to callers. Step by step, manual refcounting can be pushed back and possibly eliminated in the end. 
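As a sketch of what call sites look like after this change (a hypothetical wrap_retained helper of my own, mirroring the retainTensorImpl / unsafeTensorFromTH changes in the diff below; the exact template arguments are my reconstruction and assume the usual ATen headers):

```cpp
#include <ATen/ATen.h>

// Instead of the removed Tensor(impl, /*retain=*/true), the caller now states its
// intent explicitly: bump the refcount to keep its own reference alive, then let
// intrusive_ptr::reclaim take ownership of exactly one reference.
static at::Tensor wrap_retained(at::TensorImpl* impl) {
  c10::raw::intrusive_ptr::incref(impl);  // caller keeps its reference
  return at::Tensor(
      c10::intrusive_ptr<at::TensorImpl, at::UndefinedTensor>::reclaim(impl));
}
```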
Reviewed By: ezyang Differential Revision: D9663476 fbshipit-source-id: 7f010e5e47b137a9575960201c5bf5d552c5c2f5 --- aten/src/ATen/core/C++17.h | 2 ++ aten/src/ATen/function_wrapper.py | 6 +++-- aten/src/ATen/templates/Tensor.h | 12 +++------- aten/src/ATen/templates/TypeDefault.cpp | 6 ++++- .../src/THC/generic/THCTensorMathPointwise.cu | 23 ++++++++++++------- torch/csrc/autograd/variable.cpp | 4 +++- torch/csrc/autograd/variable.h | 4 ---- torch/csrc/jit/ivalue.h | 9 ++++++-- 8 files changed, 39 insertions(+), 27 deletions(-) diff --git a/aten/src/ATen/core/C++17.h b/aten/src/ATen/core/C++17.h index d8440ceea0c21a..5964a9dea36b2f 100644 --- a/aten/src/ATen/core/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -82,6 +82,7 @@ template using remove_cv_t = std::remove_cv_t; template using result_of_t = std::result_of_t; template using decay_t = std::decay_t; template using remove_const_t = std::remove_const_t; +template using remove_pointer_t = std::remove_pointer_t; #else template using conditional_t = typename std::conditional::type; template using enable_if_t = typename std::enable_if::type; @@ -91,6 +92,7 @@ template using remove_cv_t = typename std::remove_cv::type; template using result_of_t = typename std::result_of::type; template using decay_t = typename std::decay::type; template using remove_const_t = typename std::remove_const::type; +template using remove_pointer_t = typename std::remove_pointer::type; #endif diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index e87a7bb88f8eb4..85b99469cdee15 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -1299,7 +1299,7 @@ def allocate_arg(env, arg, output_count): .format(name, name)) return [ 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor({}, false);'.format(name, tensor_arg), + 'auto {} = Tensor(c10::intrusive_ptr::reclaim({}));'.format(name, tensor_arg), ] def resize_arg(arg): @@ -1507,7 +1507,9 @@ def emit_body(env, option): else "" wrapped_tensor = CodeTemplate(ALLOC_WRAP[ret['type']]).substitute( env, arguments=[call]) - return_tensor = "return Tensor((${wrapped_tensor})${maybe_scalar},false);" + return_tensor = ( + "return Tensor(" + + "c10::intrusive_ptr::reclaim((${wrapped_tensor})${maybe_scalar}));") body.append(CodeTemplate(return_tensor).substitute( env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) # return the same underlying Tensor type for both real and accreal; this ensures diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index b96b2cf13d86be..e54c6e4b62fc40 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -41,18 +41,12 @@ namespace at { // special care must be taken to handle this. 
struct AT_API Tensor { Tensor(){}; - Tensor(TensorImpl* tensor_impl, bool retain) - : tensor_impl_(c10::intrusive_ptr::reclaim( - tensor_impl)) { - if (tensor_impl == nullptr) { + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } } - Tensor(c10::intrusive_ptr ptr) - : tensor_impl_(std::move(ptr)) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 5e614edc57f216..54c6c282dcb95d 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -107,7 +107,11 @@ Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) co return Storage(scalarType(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { - return Tensor(static_cast(th_pointer), retain); + auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); + if (retain && tensor_impl.get() != UndefinedTensor::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl.get()); + } + return Tensor(std::move(tensor_impl)); } Storage TypeDefault::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain && th_pointer) { diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 9192d6c9f9a1d5..2d69e60a40fed3 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -228,35 +228,42 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, #endif +namespace { +c10::intrusive_ptr retainTensorImpl(THCTensor* self) { + c10::raw::intrusive_ptr::incref(self); + return c10::intrusive_ptr::reclaim(self); +} +} + THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::add_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::add_out(out, retainTensorImpl(src1), retainTensorImpl(src2), alpha); } THC_API void THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::sub_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::sub_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2)), alpha); } THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::mul_out(out, at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } THC_API void @@ -370,8 +377,8 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, scalar_t value, THCTens THC_API void THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::div_out(out, 
at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::div_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } THC_API void diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index a3bfd2f7749a60..3808aef75c4d57 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -108,7 +108,9 @@ std::shared_ptr Variable::Impl::get_grad_accumulator() { if (result) return result; - result = std::make_shared(Variable(this, true)); + c10::raw::intrusive_ptr::incref(this); + auto intrusive_from_this = c10::intrusive_ptr::reclaim(this); + result = std::make_shared(Variable(std::move(intrusive_from_this))); grad_accumulator_ = result; return result; } diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index c5057711edfcd7..bd2e475645975a 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -265,7 +265,6 @@ struct TORCH_API Variable : public at::Tensor { // Private Methods //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Variable(Variable::Impl* self, bool retain); Variable(c10::intrusive_ptr self); Impl* get() const; }; @@ -595,9 +594,6 @@ inline PyObject* Variable::pyobj() const noexcept { // Private Methods //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -inline Variable::Variable(Variable::Impl* self, bool retain) - : at::Tensor(self, retain) {} - inline Variable::Variable(c10::intrusive_ptr self) : at::Tensor(std::move(self)) {} diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 0cb821e99a505b..7d9bea6e678eec 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -118,13 +118,18 @@ struct TORCH_API IValue final { bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(as_tensor_impl, /*retain=*/false); + at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); clearToNone(); return t; } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - return at::Tensor(as_tensor_impl, /*retain=*/true); + JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != at::UndefinedTensor::singleton())); + auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(tensor_impl.get()); + } + return at::Tensor(std::move(tensor_impl)); } // Tuple From ea0ee77c61beacfe67c0209de4e142e22ce136f1 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Mon, 10 Sep 2018 12:36:38 -0700 Subject: [PATCH 008/237] Fix katex math rendering (#11472) Summary: I'm 80% sure that this fixes the math bug. But I can't repro locally so I don't know. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11472 Differential Revision: D9755328 Pulled By: SsnL fbshipit-source-id: 130be664d3c6ceee3c0c166c1a86fc9ec3b79d74 --- docs/source/conf.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 1b4d8d6ff83195..cbcdf707f2fdb4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,22 +55,6 @@ 'sphinxcontrib.katex', ] -# katex (mathjax replacement) macros -# -# - -katex_macros = r''' -"\\op": "\\operatorname{{#1}}", -"\\i": "\\mathrm{i}", -"\\e": "\\mathrm{e}^{#1}", -"\\w": "\\omega", -"\\vec": "\\mathbf{#1}", -"\\x": "\\vec{x}", -"\\d": "\\operatorname{d}\\!{}", -"\\dirac": "\\operatorname{\\delta}\\left(#1\\right)", -"\\scalarprod": "\\left\\langle#1,#2\\right\\rangle", -''' - # katex options # # @@ -78,10 +62,9 @@ katex_options = r''' delimiters : [ {left: "$$", right: "$$", display: true}, - {left: "\\(", right: "\\)", display: true}, + {left: "\\(", right: "\\)", display: false}, {left: "\\[", right: "\\]", display: true} -], -strict : false +] ''' napoleon_use_ivar = True From 18e5fd36c2d8b55520aee0998258a61dc696602a Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 10 Sep 2018 13:42:51 -0700 Subject: [PATCH 009/237] Normalize gradients before reduction in DistributedDataParallelC10d (#11109) Summary: Normalizing by the world size before the reduction is less likely to cause overflow in FP16 training. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11109 Differential Revision: D9594708 Pulled By: myleott fbshipit-source-id: 93ab53cb782ee1cbe1264e529b333490a0940338 --- test/test_c10d.py | 31 +++++++++++++++++++++++++++ torch/nn/parallel/distributed_c10d.py | 4 +++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/test/test_c10d.py b/test/test_c10d.py index 755982246ac4aa..4e804ee1635ac4 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -722,6 +722,37 @@ def test_sync_params_with_buffers(self): for i, buffer in enumerate(device_data): self.assertEqual(buffer, target[i]) + @skip_if_not_multigpu + @skip_if_not_nccl + def test_fp16(self): + store = c10d.TCPStore('localhost', self.port, self.rank == 0) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + gpus = gpus_for_rank(self.world_size)[self.rank] + model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half() + nn.init.constant_(model.weight, 1) + ddp_model = distributed_c10d._DistributedDataParallelC10d( + model, + device_ids=[gpus[0]], + process_group=process_group, + bucket_cap_mb=1, + ) + + # Input 2**15, so that the gradients will overflow with a + # world_size of 2, unless we normalize the gradient by the + # world_size before the reduction + input = torch.Tensor([[2**15]]).cuda(gpus[0]).half() + + # Step model + ddp_model.train() + output = ddp_model(input) + loss = output.sum() + loss.backward() + + self.assertFalse( + any(torch.isinf(p.grad).any() for p in ddp_model.parameters()) + ) + if __name__ == '__main__': assert not torch.cuda._initialized, "test_distributed must not have initialized CUDA context on main process" diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 78a3bbfc2c79fa..61acf2c8784f04 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -332,6 +332,9 @@ def _queue_reduction(self, bucket_idx): if len(self.device_ids) > 1: nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) + # divide by the 
number of processes here to reduce chances of overflow + grads_batch_coalesced[0] /= self.process_group.size() + # now work on the first gpu reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], self.allreduce_opts) @@ -345,7 +348,6 @@ def _sync_reduction_works(self): # wait will let current stream wait on the c10d reduction stream self.reduction_works[bucket_idx].wait() - self.buckets_coalesced[bucket_idx] /= self.process_group.size() grads_batch_reduced = _unflatten_dense_tensors( self.buckets_coalesced[bucket_idx], grads_batch[0]) From 35008e0a1a95420033ddad2ce60c5e3450bda71c Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 14:04:14 -0700 Subject: [PATCH 010/237] Add flags to fix half comparison and test (#11395) Summary: The controller you requested could not be found. found there are some issues when using comparison operators for half types when certain THC header are included. I was able to reproduce and added a test. I also fix the issue by adding the proper definitions to avoid this issue. Reported in https://github.com/pytorch/pytorch/pull/10301#issuecomment-416773333 Related: https://github.com/pytorch/tutorials/pull/292 soumith fmassa Pull Request resolved: https://github.com/pytorch/pytorch/pull/11395 Differential Revision: D9725102 Pulled By: goldsborough fbshipit-source-id: 630425829046bbebea3409bb792a9d62c91f41ad --- .gitignore | 1 + test/cpp_extensions/half_support.cpp | 0 test/cpp_extensions/half_support.cu | 19 +++++++++++++ test/test_cpp_extensions.py | 41 ++++++++++++++++++++++++++++ torch/utils/cpp_extension.py | 10 +++++-- 5 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 test/cpp_extensions/half_support.cpp create mode 100644 test/cpp_extensions/half_support.cu diff --git a/.gitignore b/.gitignore index da783554fe1120..110046e09abf94 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ test/data/legacy_modules.t7 test/data/legacy_serialized.pt test/data/linear.pt test/htmlcov +test/cpp_extensions/install/ third_party/build/ tools/shared/_utils_internal.py torch.egg-info/ diff --git a/test/cpp_extensions/half_support.cpp b/test/cpp_extensions/half_support.cpp new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/test/cpp_extensions/half_support.cu b/test/cpp_extensions/half_support.cu new file mode 100644 index 00000000000000..a3621bfe7c55fb --- /dev/null +++ b/test/cpp_extensions/half_support.cu @@ -0,0 +1,19 @@ +#include + +#include + +template +__global__ void half_test_kernel(const T* input, U* output) { + if (input[0] < input[1] || input[0] >= input[1]) { + output[0] = 123; + } +} + +at::Tensor half_test(at::Tensor input) { + auto output = at::empty(1, input.options().dtype(at::kFloat)); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "half_test", [&] { + half_test_kernel + <<<1, 1>>>(input.data(), output.data()); + }); + return output; +} diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index a5312cd0383008..f24571e6aad899 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -274,6 +274,47 @@ def test_complex_registration(self): torch.empty(2, 2, dtype=torch.complex64) + @unittest.skipIf(not TEST_CUDA, "CUDA not found") + def test_half_support(self): + ''' + Checks for an issue with operator< ambiguity for half when certain + THC headers are included. + + See https://github.com/pytorch/pytorch/pull/10301#issuecomment-416773333 + for the corresponding issue. 
+ ''' + cuda_source = ''' + #include + + template + __global__ void half_test_kernel(const T* input, U* output) { + if (input[0] < input[1] || input[0] >= input[1]) { + output[0] = 123; + } + } + + at::Tensor half_test(at::Tensor input) { + auto output = at::empty(1, input.options().dtype(at::kFloat)); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "half_test", [&] { + half_test_kernel<<<1, 1>>>( + input.data(), + output.data()); + }); + return output; + } + ''' + + module = torch.utils.cpp_extension.load_inline( + name='half_test_extension', + cpp_sources='at::Tensor half_test(at::Tensor input);', + cuda_sources=cuda_source, + functions=['half_test'], + verbose=True) + + x = torch.randn(3, device='cuda', dtype=torch.half) + result = module.half_test(x) + self.assertEqual(result[0], 123) + if __name__ == '__main__': common.run_tests() diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 0af802625087c5..43edbd2f8bd055 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -69,6 +69,12 @@ def _find_cuda_home(): # it the below pattern. BUILT_FROM_SOURCE_VERSION_PATTERN = re.compile(r'\d+\.\d+\.\d+\w+\+\w+') +COMMON_NVCC_FLAGS = [ + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', +] + def is_binary_build(): return not BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__) @@ -165,7 +171,7 @@ def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): self.compiler.set_executable('compiler_so', nvcc) if isinstance(cflags, dict): cflags = cflags['nvcc'] - cflags += ['--compiler-options', "'-fPIC'"] + cflags = COMMON_NVCC_FLAGS + ['--compiler-options', "'-fPIC'"] + cflags elif isinstance(cflags, dict): cflags = cflags['cxx'] # NVCC does not allow multiple -std to be passed, so we avoid @@ -831,7 +837,7 @@ def _write_ninja_file(path, flags = ['cflags = {}'.format(' '.join(cflags))] if with_cuda: - cuda_flags = common_cflags + cuda_flags = common_cflags + COMMON_NVCC_FLAGS if sys.platform == 'win32': cuda_flags = _nt_quote_args(cuda_flags) else: From 70d93f4777be108beea1bb0bff02e6213b49ea9a Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Mon, 10 Sep 2018 14:38:32 -0700 Subject: [PATCH 011/237] Check for maximum numel in NCCL broadcasting (#11466) Summary: NCCL1 uses `int` as its numerical type for fields like `count`, which makes broadcasting tensors larger than `2 << 31 - 1` impossible, and raises opaque error `invalid arguments`. NCCL2 greatly increase the limit on many platforms by using `size_t`. This patch statically detects this type, and raises properly if the broadcast tensor exceeds the limit. No test because I don't think our test suite should broadcast big tensors. 
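For reference, a self-contained sketch of the compile-time trick the check relies on (reading off the type of a function's second parameter). `fake_bcast` below is only a stand-in with an NCCL1-style signature; the real code applies the same trait to `ncclBcast` itself:

```cpp
// Standalone sketch of how torch/csrc/cuda/nccl.cpp detects the "count" type
// of the broadcast API at compile time. fake_bcast stands in for ncclBcast.
#include <iostream>
#include <limits>
#include <type_traits>

void fake_bcast(void* buff, int count, int root);  // never called, only inspected

// Yields the (decayed) type of the second parameter of a function type.
template <typename T>
struct GetSecondArgType;

template <typename R, typename Arg0, typename Arg1, typename... Rest>
struct GetSecondArgType<R(Arg0, Arg1, Rest...)> {
  using type = typename std::decay<Arg1>::type;
};

using count_t = GetSecondArgType<decltype(fake_bcast)>::type;
constexpr auto count_max = std::numeric_limits<count_t>::max();

int main() {
  static_assert(std::is_same<count_t, int>::value,
                "with an int count we are limited to 2^31 - 1 elements");
  std::cout << "largest broadcastable element count: " << count_max << "\n";
  // The patch then checks tensor.numel() against count_max before calling
  // ncclBcast and raises a readable error instead of "invalid arguments".
  return 0;
}
```

Because the limit is derived from the API's own signature, the guard stays correct on NCCL2 builds (where the count type is wider) without any per-version #ifdefs.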
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11466 Differential Revision: D9754753 Pulled By: SsnL fbshipit-source-id: 73506450cae047e06b5b225b39efdb42d5d26685 --- torch/csrc/cuda/comm.cpp | 3 +++ torch/csrc/cuda/nccl.cpp | 27 +++++++++++++++++++++++++++ torch/csrc/cuda/nccl.h | 2 ++ 3 files changed, 32 insertions(+) diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 5ca60c7c62a2a0..716a1d30c3c9cd 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -72,6 +72,9 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu [&](const at::Tensor& t) { return t.get_device() == devices[0]; })) { throw std::runtime_error("all tensors must be on devices[0]"); } +#ifdef USE_NCCL + buffer_size = std::min(torch::cuda::nccl::get_max_count(), buffer_size); +#endif tensor_list2d outputs(devices.size()); outputs[0] = tensors.vec(); diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index f69e69e633b2e6..ab0fecfd761c24 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -5,6 +5,8 @@ #include #include +#include +#include #include #include #include @@ -177,6 +179,28 @@ std::uint64_t version() { #endif } +namespace { + // NCCL changed the numerical type used for count between NCCL1 and NCCL2. + // So we use the following struct, which gets the type of the second argument + // of T, if T is a function type, with ncclBcast, to get that type statically + // and programmatically. + + template + struct GetSecondArgType; + + template + struct GetSecondArgType { + typedef typename std::decay::type type; + }; + + constexpr auto count_max = std::numeric_limits::type>::max(); +} + +size_t get_max_count() { + return count_max; +} + + void broadcast(TensorList tensors, const stream_list& streams, const comm_list& user_comms) { #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; @@ -192,6 +216,9 @@ void broadcast(TensorList tensors, const stream_list& streams, const comm_list& device_guard.set_index(tensors[i].get_device()); // TODO: use current stream const auto stream = (streams.empty() || !streams[i]) ? nullptr : THCStream_stream(streams[i]); + AT_CHECK(static_cast(numel) <= static_cast(count_max), + "Broadcast tensor has ", numel, " elements, which exceeds the " + "maximum NCCL supports (", count_max, ")"); CHECK(ncclBcast(tensors[i].data_ptr(), numel, data_type, 0, comms[i], stream)); } #else diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index 57bb5952454e38..919363474b9f9f 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -47,4 +47,6 @@ void broadcast(at::TensorList tensors, const stream_list& streams = {}, const comm_list& user_comms = {}); +size_t get_max_count(); + }}} From 3e665cc29bbf429fef742450528b2272a3bc17af Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Mon, 10 Sep 2018 15:01:07 -0700 Subject: [PATCH 012/237] Improve support for tracing sizes, add more tracer warnings (#11288) Summary: Many constructors like `torch.zeros` or `torch.randn` didn't support size tracing correctly which is fixed by this pass. Same issue has been fixed in legacy tensor constructors. Additionally, new tensor constructors, which do not participate in tracing (most notably `torch.tensor`, `torch.as_tensor` and `torch.from_numpy`) raise a warning when they are used. Finally, entering a traceable operation disables the tracing in its body. 
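A condensed, standalone sketch of what that last point amounts to (toy names — `TracingState`, `g_tracing_state`, `some_op` are invented here; the generated PRE/POST_RECORD_TRACE code below does the real version): stash the thread-local tracing state, null it while the op body runs, then restore it to record outputs:

```cpp
// Toy version of the "disable tracing inside a traced op" pattern.
#include <cassert>
#include <memory>

struct TracingState { int recorded_nodes = 0; };
thread_local std::shared_ptr<TracingState> g_tracing_state;

bool is_tracing() { return g_tracing_state != nullptr; }

int some_op(int x) {
  std::shared_ptr<TracingState> stashed;
  if (is_tracing()) {
    stashed = g_tracing_state;
    stashed->recorded_nodes++;   // record this call as a single node...
    g_tracing_state = nullptr;   // ...and hide the tracer from the body
  }
  int result = x * 2;            // any "ops" called here see no tracer
  assert(!is_tracing());
  if (stashed) {
    g_tracing_state = std::move(stashed);  // restore to record outputs
  }
  return result;
}

int main() {
  g_tracing_state = std::make_shared<TracingState>();
  some_op(21);
  assert(g_tracing_state && g_tracing_state->recorded_nodes == 1);
  return 0;
}
```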
This is needed because zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/11288 Reviewed By: ezyang Differential Revision: D9751183 Pulled By: apaszke fbshipit-source-id: 51444a39d76a3e164adc396c432fd5ee3c8d5f7f --- ...ript.test_index_put_trace_with_view.expect | 4 +-- ...t.test_index_put_trace_without_view.expect | 4 +-- test/onnx/test_pytorch_onnx_caffe2.py | 8 ++--- test/test_jit.py | 6 ++-- tools/autograd/gen_variable_type.py | 31 ++++++------------- .../templates/python_torch_functions.cpp | 5 +++ torch/csrc/autograd/python_variable.cpp | 2 ++ torch/csrc/jit/tracer.cpp | 11 ++++--- torch/csrc/jit/tracer.h | 2 ++ torch/csrc/utils/python_arg_parser.h | 2 +- torch/csrc/utils/tensor_new.cpp | 8 ++--- torch/onnx/symbolic.py | 19 ++++++++++++ 12 files changed, 61 insertions(+), 41 deletions(-) diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 37f08643f139a4..cc03d3d5296d08 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -7,6 +7,6 @@ graph(%0 : Double(100) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) %8 : Dynamic[] = prim::ListConstruct(%7) - %20 : Double(100) = aten::index_put(%0, %8, %5) - return (%20); + %9 : Double(100) = aten::index_put(%0, %8, %5) + return (%9); } diff --git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 772308223b454b..c72506796064b2 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -4,6 +4,6 @@ graph(%0 : Double(100) %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) %5 : Dynamic[] = prim::ListConstruct(%4) - %17 : Double(100) = aten::index_put(%0, %5, %2) - return (%17); + %6 : Double(100) = aten::index_put(%0, %5, %2) + return (%6); } diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index f8038d2465dbc6..1a9e9cdbff1442 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -346,11 +346,11 @@ def test_rnn_init_predict_split(self): mp = onnx.ModelProto.FromString(do_export(model, input, export_params=self.embed_params)[0]) prepared = c2.prepare(mp, device='CPU') if self.embed_params: - assert len(prepared.init_net.op) == 1038 - assert len(prepared.predict_net.op) == 101 + assert len(prepared.init_net.op) == 1019 + assert len(prepared.predict_net.op) == 142 else: - assert len(prepared.init_net.op) == 27 - assert len(prepared.predict_net.op) == 1112 + assert len(prepared.init_net.op) == 8 + assert len(prepared.predict_net.op) == 1153 def test_alexnet(self): state_dict = model_zoo.load_url(model_urls['alexnet'], progress=False) diff --git a/test/test_jit.py b/test/test_jit.py index 342087e277292c..32ccf392f60b93 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -916,6 +916,7 @@ def f(x, y): torch.jit.verify(f, (x, y), loss_fn=lambda z, w: z * w, devices=[]) + @suppress_warnings def test_constant(self): x = torch.randn(2, 2, requires_grad=True) @@ -6197,7 +6198,7 @@ def test_index_put(target, indices, rhs): target[indices] = rhs return target - self.assertExpected(str(test_index_put.graph)) + self.assertExpectedGraph(test_index_put.graph) def test_index_put_trace_without_view(self): @_trace(torch.rand(100), torch.tensor([1, 2, 3, 
4]), torch.rand(4)) @@ -6205,7 +6206,7 @@ def test_index_put(target, indices, rhs): target[indices] = rhs return target - self.assertExpected(str(test_index_put.graph)) + self.assertExpectedGraph(test_index_put.graph) def test_annotated_script_fn(self): @torch.jit.script @@ -6895,6 +6896,7 @@ def forward(self, x): net = Net(upscale_factor=4) self.checkTrace(net, (torch.rand(5, 1, 64, 64),)) + @suppress_warnings def test_time_sequence_prediction(self): class Sequence(torch.jit.ScriptModule): def __init__(self): diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index d6bcb0821e83c0..86dd5fcb98bda0 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -128,13 +128,15 @@ PRE_RECORD_TRACE = CodeTemplate("""\ torch::jit::Node* node = nullptr; +std::shared_ptr tracer_state; if (jit::tracer::isTracing()) { - auto& graph = jit::tracer::getTracingState()->graph; - node = graph->create(jit::aten::${trace_name}, /*outputs=*/0); + tracer_state = jit::tracer::getTracingState(); + node = tracer_state->graph->create(jit::aten::${trace_name}, /*outputs=*/0); jit::tracer::recordSourceLocation(node); ${add_trace_inputs} - graph->appendNode(node); + tracer_state->graph->appendNode(node); ${inplace_guard} + jit::tracer::setTracingState(nullptr); } """) @@ -145,35 +147,20 @@ ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${input}", ${input});""") POST_RECORD_TRACE = CodeTemplate("""\ -if (jit::tracer::isTracing()) { +if (tracer_state) { + jit::tracer::setTracingState(std::move(tracer_state)); ${record_trace_outputs} } """) -RECORD_ATTRIBUTE = CodeTemplate("""\ -setattr(trace_info.n, jit::attr::${attr_name}, ${name});""") - -RECORD_POSITIONAL_ATTRIBUTE = CodeTemplate("""\ -setposattr(trace_info.n, ${i}, "${name}", ${name});""") - -POSITIONAL_ATTR_NYI = """\ -throw std::runtime_error("Can't have size-dependent arguments to functions that " - "take variable number of tensor arguments"); -""" - def should_trace(declaration): - # Operations involving Generator, Storage, Type are not traceable - # at the moment - if any(arg['simple_type'] in {'Generator', 'Storage', 'ScalarType', 'Type', 'optional'} - for arg in declaration['arguments']): + # Operations involving Storage or Type are not traceable at the moment + if any(arg['simple_type'] in {'Storage', 'Type'} for arg in declaration['arguments']): return False # We can't trace functions which don't have any Tensor or TensorList returns if 'Tensor' not in declaration['return_type']: return False - tensor_args = [arg for arg in declaration['arguments'] if arg['simple_type'] in {'Tensor', 'TensorList'}] - if len(tensor_args) == 0: - return False name = declaration['name'] base_name = name[:-1] if declaration['inplace'] else name[:-4] if name.endswith('_out') else name if base_name in DONT_RECORD_TRACE: diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 144771290d1c89..a7174b52e52b88 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -19,6 +19,7 @@ #include "torch/csrc/utils/tensor_layouts.h" #include "torch/csrc/utils/tensor_new.h" #include "torch/csrc/utils/tensor_numpy.h" +#include "torch/csrc/jit/tracer.h" #include "torch/csrc/autograd/generated/variable_factories.h" #include @@ -320,6 +321,7 @@ static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* static PyObject * THPVariable_as_tensor(PyObject* 
self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.as_tensor"); return THPVariable_Wrap(torch::utils::as_tensor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -327,6 +329,7 @@ static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.from_numpy"); auto data = torch::utils::tensor_from_numpy(arg); return THPVariable_Wrap(make_variable(std::move(data), /*requires_grad=*/false)); END_HANDLE_TH_ERRORS @@ -351,6 +354,7 @@ static PyObject * THPVariable__promote_types(PyObject* self, PyObject* args, PyO static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.sparse_coo_tensor"); return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -358,6 +362,7 @@ static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.tensor"); return THPVariable_Wrap(torch::utils::tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index b1c2d85592d6dd..5666da28135f1f 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -22,6 +22,7 @@ #include "torch/csrc/utils/python_strings.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/tensor_new.h" +#include "torch/csrc/jit/tracer.h" #include @@ -125,6 +126,7 @@ static void THPVariable_dealloc(THPVariable* self) static PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.Tensor"); auto& default_type = torch::tensors::get_default_tensor_type(); auto tensor = torch::utils::legacy_tensor_ctor(default_type, args, kwargs); return THPVariable_NewWithVar(type, std::move(tensor)); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index d2c4ef9f0da5a7..03e31440ef5c95 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -26,8 +26,9 @@ void genericAddInput(Node *n, T value) { n->addInput(v); } -void badArgType() { - AT_ERROR("Found an unsupported argument type in the JIT tracer. File a bug report."); +template +void badArgType(const T& v) { + AT_ERROR("Found an unsupported argument type in the JIT tracer: ", at::demangle_type(), ". 
File a bug report."); } thread_local std::shared_ptr tracing_state; @@ -39,8 +40,10 @@ void addInputs(Node *n, const char * name, bool value) { detail::g void addInputs(Node *n, const char * name, double value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Scalar& value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Tensor& value) { n->addInput(getValueTrace(value)); } -void addInputs(Node *n, const char * name, const std::string& value) { detail::badArgType(); } -void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(); } +void addInputs(Node *n, const char * name, const std::string& value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::Generator * value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::TensorList value) { Graph *g = n->owningGraph(); diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 73adda456c39f1..37cd9b332923ee 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -172,6 +172,8 @@ TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& val TORCH_API void addInputs(Node *n, const char * name, const std::string& value); TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); +TORCH_API void addInputs(Node *n, const char * name, at::Generator * value); +TORCH_API void addInputs(Node *n, const char * name, at::ScalarType value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index a44ce9b3ed0221..86453a79e30dc2 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -292,7 +292,7 @@ inline std::vector PythonArgs::intlistWithDefault(int i, std::vector parsed_args; auto r = parser.parse(args, kwargs, parsed_args); @@ -525,7 +525,7 @@ Tensor new_empty(const Type& type, PyObject* args, PyObject* kwargs) { Tensor new_full(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new_full(IntList size, Scalar fill_value, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", - }); + }, /*traceable=*/true); ParsedArgs<5> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); @@ -539,7 +539,7 @@ Tensor new_full(const Type& type, PyObject* args, PyObject* kwargs) { Tensor new_ones(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new_ones(IntList size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", - }); + }, /*traceable=*/true); ParsedArgs<4> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); @@ -553,7 +553,7 @@ Tensor new_ones(const Type& type, PyObject* args, PyObject* kwargs) { Tensor new_zeros(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new_zeros(IntList size, *, ScalarType dtype=None, Device? 
device=None, bool requires_grad=False)", - }); + }, /*traceable=*/true); ParsedArgs<4> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 23e95e282c4408..03e93125f40755 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -936,6 +936,25 @@ def zeros_like(g, input): return g.op("Sub", input, input).setType(input.type().contiguous()) +scalar_type_to_onnx = [ + cast_pytorch_to_onnx["Byte"], + cast_pytorch_to_onnx["Char"], + cast_pytorch_to_onnx["Short"], + cast_pytorch_to_onnx["Int"], + cast_pytorch_to_onnx["Long"], + cast_pytorch_to_onnx["Half"], + cast_pytorch_to_onnx["Float"], + cast_pytorch_to_onnx["Double"], +] + + +@parse_args('v', 'i', 'i', 'v') +def zeros(g, shape, scalar_type, layout, device): + # NOTE: no way to set device in ONNX, so we ignore it + return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type], + input_as_shape_i=1, value_f=0) + + def full_like(g, input, fill_value): # TODO: a more efficient implementation (ConstantFill?) return add(g, zeros_like(g, input), fill_value, g.op("Constant", value_t=torch.tensor(1))) From a0d4106c075ba8e00ed867364ea1f730d2735e90 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 15:24:47 -0700 Subject: [PATCH 013/237] Integrate custom op tests with CI (#10611) Summary: This PR is stacked on https://github.com/pytorch/pytorch/pull/10610, and only adds changes in one file `.jenkins/pytorch/test.sh`, where we now build the custom op tests and run them. I'd also like to take this PR to discuss whether the [`TorchConfig.cmake`](https://github.com/pytorch/pytorch/blob/master/cmake/TorchConfig.cmake.in) I made is robust enough (we will also see in the CI) orionr Yangqing dzhulgakov what do you think? Also ezyang for CI changes Pull Request resolved: https://github.com/pytorch/pytorch/pull/10611 Differential Revision: D9597627 Pulled By: goldsborough fbshipit-source-id: f5af8164c076894f448cef7e5b356a6b3159f8b3 --- .gitignore | 2 + .jenkins/pytorch/build.sh | 10 ++++ .jenkins/pytorch/macos-test.sh | 22 +++++++++ .jenkins/pytorch/test.sh | 19 +++++++- cmake/TorchConfig.cmake.in | 58 +++++++++++++++--------- setup.py | 34 +++++++++----- test/custom_operator/CMakeLists.txt | 9 ++++ test/custom_operator/model.py | 13 +++++- test/custom_operator/op.h | 2 +- test/custom_operator/test_custom_ops.cpp | 10 ++-- test/custom_operator/test_custom_ops.py | 4 +- torch/CMakeLists.txt | 11 +++-- torch/op.h | 1 + 13 files changed, 150 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 110046e09abf94..e3e7acb8e310cb 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ docs/cpp/html/ docs/cpp/api/ test/.coverage test/cpp/api/mnist +test/custom_operator/model.pt test/data/gpu_tensors.pt test/data/legacy_modules.t7 test/data/legacy_serialized.pt @@ -66,6 +67,7 @@ torch/lib/protoc torch/lib/tmp_install torch/lib/torch_shm_manager torch/lib/python* +torch/share/ torch/version.py # IPython notebook checkpoints diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3ffed384b081b7..f108c37cdd6ed1 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -138,4 +138,14 @@ if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then pushd ../cpp-build/caffe2 WERROR=1 VERBOSE=1 DEBUG=1 python $BUILD_LIBTORCH_PY popd + + # Build custom operator tests. 
+ CUSTOM_OP_BUILD="$PWD/../custom-op-build" + CUSTOM_OP_TEST="$PWD/test/custom_operator" + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + mkdir "$CUSTOM_OP_BUILD" + pushd "$CUSTOM_OP_BUILD" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake "$CUSTOM_OP_TEST" + make VERBOSE=1 + popd fi diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 87e0476e418ba5..16d34342c544c8 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -78,13 +78,35 @@ test_cpp_api() { "$CPP_BUILD"/caffe2/bin/test_api } +test_custom_script_ops() { + echo "Testing custom script operators" + pushd test/custom_operator + # Build the custom operator library. + rm -rf build && mkdir build + pushd build + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake .. + make VERBOSE=1 + popd + + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. + build/test_custom_ops ./model.pt + popd +} + + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_all test_cpp_api + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_all elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then test_cpp_api + test_custom_script_ops fi fi diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 05bd71602b9783..bc22fa7a36070a 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -97,7 +97,7 @@ test_aten() { # put the dynamic libraries somewhere were the dynamic linker can find them. # This is a bit of a hack. if [[ "$BUILD_ENVIRONMENT" == *ppc64le* ]]; then - SUDO=sudo + SUDO=sudo fi ${SUDO} ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin @@ -140,12 +140,28 @@ test_libtorch() { fi } +test_custom_script_ops() { + if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Testing custom script operators" + CUSTOM_OP_BUILD="$PWD/../custom-op-build" + pushd test/custom_operator + cp -r "$CUSTOM_OP_BUILD" build + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. 
+ build/test_custom_ops ./model.pt + popd + fi +} + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_nn test_python_all_except_nn test_aten test_torchvision test_libtorch + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_nn @@ -154,5 +170,6 @@ else test_aten test_torchvision test_libtorch + test_custom_script_ops fi fi diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index a14b2e1b0e8b44..f177a9c2c3504d 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -17,32 +17,48 @@ # # torch_add_custom_op_library( ) -SET(TORCH_ROOT "${CMAKE_CURRENT_LIST_DIR}/../") +if ($ENV{TORCH_INSTALL_PREFIX}) + set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) +else() + # Assume we are in /share/cmake/Torch/TorchConfig.cmake + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(TORCH_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) +endif() -set(TORCH_INCLUDE_DIRS - "${TORCH_ROOT}" - "${TORCH_ROOT}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src/TH" -) +# Include directories. +set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") -find_library(TORCH_LIBRARY torch PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) -find_library(CAFFE2_LIBRARY caffe2 PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) +# Library dependencies. +find_package(Caffe2 REQUIRED) +find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +set(TORCH_LIBRARIES ${TORCH_LIBRARY} ${Caffe2_MAIN_LIBS}) if (@USE_CUDA@) - find_package(CUDA REQUIRED) - find_library(CAFFE2_CUDA_LIBRARY caffe2_gpu PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) - set(TORCH_CUDA_LIBRARIES -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 cuda nvrtc cudart nvToolsExt) - list(APPEND TORCH_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + if(MSVC) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + if ($ENV{NVTOOLEXT_HOME}) + set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + list(APPEND TORCH_INCLUDE_DIRS "${NVTOOLEXT_HOME}/include") + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + else() + set(TORCH_CUDA_LIBRARIES + ${CUDA_CUDA_LIB} + ${CUDA_NVRTC_LIB} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so + ${CUDA_LIBRARIES}) + endif() + list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() -set(TORCH_LIBRARIES - ${TORCH_LIBRARY} - ${CAFFE2_LIBRARY} - ${CAFFE2_CUDA_LIBRARY} - ${TORCH_CUDA_LIBRARIES}) - # Creates a shared library with the correct include directories # and linker flags set to include Torch header files and link with Torch # libraries. Also sets the C++ standard version to C++11. 
All options @@ -51,5 +67,5 @@ function(torch_add_custom_op_library name source_files) add_library(${name} SHARED ${source_files}) target_include_directories(${name} PUBLIC "${TORCH_INCLUDE_DIRS}") target_link_libraries(${name} "${TORCH_LIBRARIES}") - target_compile_options(${name} PUBLIC -std=c++11) + set_property(TARGET ${name} PROPERTY CXX_STANDARD 11) endfunction(torch_add_custom_op_library) diff --git a/setup.py b/setup.py index 3cc5c0c1642a0c..34d2e7e5402840 100644 --- a/setup.py +++ b/setup.py @@ -443,11 +443,10 @@ def check_file(f): # we need to find a better way to do this. # More information can be found in conversation thread of PR #5772 - self.copy_tree('torch/csrc', 'torch/lib/include/torch/csrc/') + self.copy_tree('torch/lib/tmp_install/share', 'torch/share') self.copy_tree('third_party/pybind11/include/pybind11/', 'torch/lib/include/pybind11') self.copy_file('torch/csrc/torch.h', 'torch/lib/include/torch/torch.h') - self.copy_file('torch/op.h', 'torch/lib/include/torch/op.h') build_dep_cmds = {} @@ -1136,27 +1135,38 @@ def make_relative_rpath(path): 'lib/*.h', 'lib/include/ATen/*.h', 'lib/include/ATen/core/*.h', - 'lib/include/ATen/detail/*.h', - 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', + 'lib/include/ATen/cuda/*.h', + 'lib/include/ATen/cuda/detail/*.cuh', 'lib/include/ATen/cuda/detail/*.h', 'lib/include/ATen/cudnn/*.h', - 'lib/include/ATen/cuda/detail/*.cuh', + 'lib/include/ATen/detail/*.h', + 'lib/include/caffe2/utils/*.h', + 'lib/include/torch/*.h', + 'lib/include/torch/csrc/*.h', + 'lib/include/torch/csrc/api/include/torch/detail/ordered_dict.h', + 'lib/include/torch/csrc/autograd/*.h', + 'lib/include/torch/csrc/autograd/generated/*.h', + 'lib/include/torch/csrc/cuda/*.h', + 'lib/include/torch/csrc/jit/*.h', + 'lib/include/torch/csrc/jit/generated/*.h', + 'lib/include/torch/csrc/jit/passes/*.h', + 'lib/include/torch/csrc/jit/script/*.h', + 'lib/include/torch/csrc/utils/*.h', 'lib/include/pybind11/*.h', 'lib/include/pybind11/detail/*.h', 'lib/include/TH/*.h*', 'lib/include/TH/generic/*.h*', - 'lib/include/THC/*.h*', 'lib/include/THC/*.cuh', + 'lib/include/THC/*.h*', 'lib/include/THC/generic/*.h', 'lib/include/THCUNN/*.cuh', 'lib/include/THNN/*.h', - 'lib/include/torch/csrc/*.h', - 'lib/include/torch/csrc/autograd/*.h', - 'lib/include/torch/csrc/jit/*.h', - 'lib/include/torch/csrc/utils/*.h', - 'lib/include/torch/csrc/cuda/*.h', - 'lib/include/torch/torch.h', + 'share/cmake/ATen/*.cmake', + 'share/cmake/Caffe2/*.cmake', + 'share/cmake/Caffe2/public/*.cmake', + 'share/cmake/Gloo/*.cmake', + 'share/cmake/Torch/*.cmake', ], 'caffe2': [ rel_site_packages + '/caffe2/**/*.py' diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index 114cfda92d83d6..14f605d4f5b3f7 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -4,6 +4,15 @@ project(custom_ops) find_package(Torch REQUIRED) +# This convenience function will create a shared library target, configure +# the right include directories and link against the right libraries. 
It is +# exactly equivalent to the following lines: +# +# add_library(custom_ops SHARED op.cpp) +# target_include_directories(custom_ops PUBLIC "${TORCH_INCLUDE_DIRS}") +# target_link_libraries(custom_ops "${TORCH_LIBRARIES}") +# set_property(TARGET custom_ops PROPERTY CXX_STANDARD 11) +# torch_add_custom_op_library(custom_ops op.cpp) add_executable(test_custom_ops test_custom_ops.cpp) diff --git a/test/custom_operator/model.py b/test/custom_operator/model.py index 6f91bd81c577cd..c803c18d467e13 100644 --- a/test/custom_operator/model.py +++ b/test/custom_operator/model.py @@ -1,9 +1,20 @@ import argparse import os.path +import sys import torch +SHARED_LIBRARY_EXTENSIONS = {'linux': 'so', 'darwin': 'dylib', 'win32': 'dll'} + + +def get_custom_op_library_path(): + extension = SHARED_LIBRARY_EXTENSIONS[sys.platform] + path = os.path.abspath('build/libcustom_ops.{}'.format(extension)) + assert os.path.exists(path), path + return path + + class Model(torch.jit.ScriptModule): def __init__(self): super(Model, self).__init__() @@ -20,7 +31,7 @@ def main(): parser.add_argument("--export-script-module-to", required=True) options = parser.parse_args() - torch.ops.load_library(os.path.abspath('build/libcustom_ops.so')) + torch.ops.load_library(get_custom_op_library_path()) model = Model() model.save(options.export_script_module_to) diff --git a/test/custom_operator/op.h b/test/custom_operator/op.h index d45123d3afccb4..b1c4b5b1647f0b 100644 --- a/test/custom_operator/op.h +++ b/test/custom_operator/op.h @@ -3,7 +3,7 @@ #include #include -std::vector custom_op( +TORCH_API std::vector custom_op( at::Tensor tensor, double scalar, int64_t repeat); diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index d653d0e2ccf9e0..47e6f3618aad7c 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -22,9 +22,12 @@ void get_operator_from_registry_and_execute() { std::vector output; torch::jit::pop(stack, output); + const auto manual = custom_op(torch::ones(5), 2.0, 3); + assert(output.size() == 3); - for (const auto& tensor : output) { - assert(tensor.allclose(torch::ones(5) * 2)); + for (size_t i = 0; i < output.size(); ++i) { + assert(output[i].allclose(torch::ones(5) * 2)); + assert(output[i].allclose(manual[i])); } } @@ -71,10 +74,9 @@ void test_argument_checking_for_serialized_modules( module->forward({}); assert(false); } catch (const at::Error& error) { - std::cout << error.what_without_backtrace() << std::endl; assert( std::string(error.what_without_backtrace()) - .find("custom::op() is missing value for argument 'tensor'") == 0); + .find("forward() is missing value for argument 'input'") == 0); } } diff --git a/test/custom_operator/test_custom_ops.py b/test/custom_operator/test_custom_ops.py index f7e6b2fa7b9e5f..ab6c958acd55fc 100644 --- a/test/custom_operator/test_custom_ops.py +++ b/test/custom_operator/test_custom_ops.py @@ -5,12 +5,12 @@ import torch -from model import Model +from model import Model, get_custom_op_library_path class TestCustomOperators(unittest.TestCase): def setUp(self): - self.library_path = os.path.abspath('build/libcustom_ops.so') + self.library_path = get_custom_op_library_path() torch.ops.load_library(self.library_path) def test_custom_library_is_loaded(self): diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index d5d0ebc663915d..fb0b76e8aeb91f 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -26,7 +26,7 @@ if(NOT TORCH_INSTALL_BIN_DIR) endif() if(NOT 
TORCH_INSTALL_INCLUDE_DIR) - set(TORCH_INSTALL_INCLUDE_DIR include/libtorch) + set(TORCH_INSTALL_INCLUDE_DIR include) endif() if(NOT TORCH_INSTALL_LIB_DIR) @@ -287,8 +287,7 @@ if (MSVC) target_link_libraries(torch onnx onnx_library) endif() -target_link_libraries(torch - caffe2_library) +target_link_libraries(torch caffe2_library) find_package(OpenMP) if(OPENMP_FOUND) @@ -395,6 +394,8 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch FILES_MATCHING PATTERN "*.h") +install(FILES "${TORCH_SRC_DIR}/op.h" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) install(TARGETS torch RUNTIME DESTINATION "${TORCH_INSTALL_BIN_DIR}" @@ -473,3 +474,7 @@ configure_file( ${TORCH_ROOT}/cmake/TorchConfig.cmake.in ${PROJECT_BINARY_DIR}/TorchConfig.cmake @ONLY) +install(FILES + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + DESTINATION share/cmake/Torch) diff --git a/torch/op.h b/torch/op.h index bf6f2aa080c133..9033857d672925 100644 --- a/torch/op.h +++ b/torch/op.h @@ -3,6 +3,7 @@ #include #include #include +#include #include From 0b78ae86c59e7372f0b9d4cadcd0e50bc76f8667 Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Mon, 10 Sep 2018 15:30:08 -0700 Subject: [PATCH 014/237] Cleanup byte swapping utilities to generate optimal code on the platforms we care about. (#11394) Summary: While the use of memcpy as part of the byte swapping sequence looks funky, all major compilers recognize and optimize this pattern reliably, resulting in essentially optimal code generation. For example, decodeUInt32LE goes from this on iOS arm64: > ldrb w8, [x0, #3] > ldrb w9, [x0, #2] > bfi w8, w9, #8, #8 > ldrb w9, [x0, #1] > bfi w8, w9, #16, #8 > ldrb w9, [x0] > bfi w8, w9, #24, #8 > mov x0, x8 > ret To this: > ldr w8, [x0] > rev w0, w8 > ret Pull Request resolved: https://github.com/pytorch/pytorch/pull/11394 Reviewed By: SsnL Differential Revision: D9728659 Pulled By: resistor fbshipit-source-id: 9afbd4adfad1d1fb7b01f1179e6707ee21fa726f --- torch/csrc/byte_order.cpp | 106 +++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 29 deletions(-) diff --git a/torch/csrc/byte_order.cpp b/torch/csrc/byte_order.cpp index 8bc1ff10f98c32..7818ca2191b966 100644 --- a/torch/csrc/byte_order.cpp +++ b/torch/csrc/byte_order.cpp @@ -2,34 +2,94 @@ #include +#if defined(_MSC_VER) +#include +#endif + +static inline void swapBytes16(void *ptr) +{ + uint16_t output; + memcpy(&output, ptr, sizeof(uint16_t)); +#if defined(_MSC_VER) && !defined(_DEBUG) + output = _byteswap_ushort(output); +#elif defined(__llvm__) || defined(__GNUC__) && !defined(__ICC) + output = __builtin_bswap16(output); +#else + uint16_t Hi = output >> 8; + uint16_t Lo = output << 8; + output = Hi | Lo; +#endif + memcpy(ptr, &output, sizeof(uint16_t)); +} + +static inline void swapBytes32(void *ptr) +{ + uint32_t output; + memcpy(&output, ptr, sizeof(uint32_t)); +#if defined(_MSC_VER) && !defined(_DEBUG) + output = _byteswap_ulong(output); +#elif defined(__llvm__) || defined(__GNUC__) && !defined(__ICC) + output = __builtin_bswap32(output); +#else + uint32_t Byte0 = output & 0x000000FF; + uint32_t Byte1 = output & 0x0000FF00; + uint32_t Byte2 = output & 0x00FF0000; + uint32_t Byte3 = output & 0xFF000000; + output = (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24); +#endif + memcpy(ptr, &output, sizeof(uint32_t)); +} + +static inline void swapBytes64(void *ptr) +{ + uint64_t output; + memcpy(&output, ptr, sizeof(uint64_t)); +#if defined(_MSC_VER) && 
!defined(_DEBUG) + output = _byteswap_uint64(output); +#elif defined(__llvm__) || defined(__GNUC__) && !defined(__ICC) + output = __builtin_bswap64(output); +#else + uint64_t Hi = SwapByteOrder_32(uint32_t(value)); + uint32_t Lo = SwapByteOrder_32(uint32_t(value >> 32)); + return (Hi << 32) | Lo; +#endif + memcpy(ptr, &output, sizeof(uint64_t)); +} + static inline uint16_t decodeUInt16LE(const uint8_t *data) { - return (data[0]<<0) | (data[1]<<8); + uint16_t output; + memcpy(&output, data, sizeof(uint16_t)); + return output; } static inline uint16_t decodeUInt16BE(const uint8_t *data) { - return (data[1]<<0) | (data[0]<<8); + uint16_t output = decodeUInt16LE(data); + swapBytes16(&output); + return output; } static inline uint32_t decodeUInt32LE(const uint8_t *data) { - return (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); + uint32_t output; + memcpy(&output, data, sizeof(uint32_t)); + return output; } static inline uint32_t decodeUInt32BE(const uint8_t *data) { - return (data[3]<<0) | (data[2]<<8) | (data[1]<<16) | (data[0]<<24); + uint32_t output = decodeUInt32LE(data); + swapBytes32(&output); + return output; } static inline uint64_t decodeUInt64LE(const uint8_t *data) { - return (((uint64_t)data[0])<< 0) | (((uint64_t)data[1])<< 8) | - (((uint64_t)data[2])<<16) | (((uint64_t)data[3])<<24) | - (((uint64_t)data[4])<<32) | (((uint64_t)data[5])<<40) | - (((uint64_t)data[6])<<48) | (((uint64_t)data[7])<<56); + uint64_t output; + memcpy(&output, data, sizeof(uint64_t)); + return output; } static inline uint64_t decodeUInt64BE(const uint8_t *data) { - return (((uint64_t)data[7])<< 0) | (((uint64_t)data[6])<< 8) | - (((uint64_t)data[5])<<16) | (((uint64_t)data[4])<<24) | - (((uint64_t)data[3])<<32) | (((uint64_t)data[2])<<40) | - (((uint64_t)data[1])<<48) | (((uint64_t)data[0])<<56); + uint64_t output = decodeUInt64LE(data); + swapBytes64(&output); + return output; } THPByteOrder THP_nativeByteOrder() @@ -92,24 +152,12 @@ void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, } } -template -static void swapBytes(uint8_t *ptr) -{ - uint8_t tmp; - for (size_t i = 0; i < size / 2; i++) { - tmp = ptr[i]; - ptr[i] = ptr[size-i]; - ptr[size-i] = tmp; - } -} - - void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len) { memcpy(dst, src, sizeof(int16_t) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes16(dst); dst += sizeof(int16_t); } } @@ -120,7 +168,7 @@ void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order, memcpy(dst, src, sizeof(int32_t) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes32(dst); dst += sizeof(int32_t); } } @@ -131,7 +179,7 @@ void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order, memcpy(dst, src, sizeof(int64_t) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes64(dst); dst += sizeof(int64_t); } } @@ -142,7 +190,7 @@ void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, s memcpy(dst, src, sizeof(float) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes32(dst); dst += sizeof(float); } } @@ -153,7 +201,7 @@ void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, memcpy(dst, src, sizeof(double) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) 
{ - swapBytes(dst); + swapBytes64(dst); dst += sizeof(double); } } From f2f43ad2dab024ef4010b63ec2faa64e9affcdac Mon Sep 17 00:00:00 2001 From: Mingda Li Date: Mon, 10 Sep 2018 15:39:01 -0700 Subject: [PATCH 015/237] Add new LengthsSplit operator (#10974) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10974 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10291 This new operator will do the following: Given a LENGTHS vector and n_splits, output a "split" LENGTHS vector where: 1. Each length in input vector is split into n_splits values (thus output vector should have LENGTHS.size(0) * n_splits elements) 2. The new lengths in output should be evenly split, and if the length is not divisible by n_splits, then order new values in descending order. (e.g. n_splits = 3, length = 5 -> 2 2 1) 3. If n_splits > some element in the array, its split elements will contain 0s. (e.g. n_splits = 3, length = 2 - > 1 1 0) Reviewed By: bddppq, chocjy Differential Revision: D9013119 fbshipit-source-id: 82bf3371ec08c41fc3379177f0007afc142e0d84 --- caffe2/operators/length_split_op.cc | 37 +++++ caffe2/operators/length_split_op.h | 75 +++++++++ .../operator_test/length_split_op_test.py | 151 ++++++++++++++++++ 3 files changed, 263 insertions(+) create mode 100644 caffe2/operators/length_split_op.cc create mode 100644 caffe2/operators/length_split_op.h create mode 100644 caffe2/python/operator_test/length_split_op_test.py diff --git a/caffe2/operators/length_split_op.cc b/caffe2/operators/length_split_op.cc new file mode 100644 index 00000000000000..7c342d154491b1 --- /dev/null +++ b/caffe2/operators/length_split_op.cc @@ -0,0 +1,37 @@ +#include "caffe2/operators/length_split_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(LengthsSplit, LengthsSplitOp); + +OPERATOR_SCHEMA(LengthsSplit) + .NumInputs(1, 2) + .NumOutputs(1) + .ScalarType(TensorProto::INT32) + .SetDoc(R"DOC( +Given input vector LENGTHS, and input n_split, LengthsSplit returns +a single output vector. It "splits" each length into n_split values which add +up to the original length. It will attempt to do equal splits, and if not possible, +it orders larger values first. If the n_split is larger than the length, zero +padding will be applied. + +e.g. LENGTHS = [9 4 5] + n_split = 3 + Y = [3 3 3 2 1 1 2 2 1] + +e.g. 
LENGTHS = [2, 1, 2] + n_split = 3 + Y = [1 1 0 1 0 0 1 1 0] +)DOC") + .Arg("n_split", "Number of splits for each element in LENGTHS") + .Input(0, "LENGTHS", "Mx1 Input tensor denoting INT32 lengths") + .Input( + 1, + "n_split", + "(Optional) Number of splits for each element in LENGTHS (overrides argument)") + .Output(0, "Y", "(M*n_split)x1 Output vector denoting split lengths"); + +// TODO: Write gradient for this when needed +GRADIENT_NOT_IMPLEMENTED_YET(LengthsSplit); + +} // namespace caffe2 diff --git a/caffe2/operators/length_split_op.h b/caffe2/operators/length_split_op.h new file mode 100644 index 00000000000000..d8c98bf085c8a2 --- /dev/null +++ b/caffe2/operators/length_split_op.h @@ -0,0 +1,75 @@ +#ifndef CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ +#define CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ + +#include "caffe2/core/common_omp.h" +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class LengthsSplitOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + LengthsSplitOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + n_split_(OperatorBase::GetSingleArgument("n_split", 0)) { + if (InputSize() == 1) { + // If not specified, then must have this argument + CAFFE_ENFORCE( + OperatorBase::HasArgument("n_split"), + "Argument `n_split` is missing and was not specified as input."); + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + } + } + ~LengthsSplitOp() {} + + bool RunOnDevice() override { + const auto& L = Input(0); + CAFFE_ENFORCE_EQ(L.ndim(), 1, "Input `LENGTHS` should be a 1D vector."); + + if (InputSize() > 1) { + // We potentially have n_split specified as inputs as well + CAFFE_ENFORCE( + Input(1).ndim() == 1 && Input(1).size() == 1, + "Input `n_split` should be a vector of size 1."); + + const auto& input1 = Input(1); + context_.template CopyItems( + input1.meta(), 1, input1.raw_data(), &n_split_); + } + + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + const auto M = L.size(); + + auto* Y = Output(0); + Y->Resize(M * n_split_); + + const int32_t* Ldata = L.template data(); + int32_t* Ydata = Y->template mutable_data(); + + for (int i = 0; i < M; i++) { + int32_t mod = Ldata[i] % n_split_; + int32_t res = + mod != 0 ? math::divUp(Ldata[i], n_split_) : Ldata[i] / n_split_ + 1; + for (int j = 0; j < n_split_; j++) { + Ydata[(i * n_split_) + j] = mod-- > 0 ? 
res : res - 1; + } + } + return true; + } + + private: + int32_t n_split_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py new file mode 100644 index 00000000000000..08ec2638886291 --- /dev/null +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -0,0 +1,151 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +class TestLengthSplitOperator(hu.HypothesisTestCase): + + def _length_split_op_ref(self, input_lengths, n_split_array): + output = [] + n_split = n_split_array[0] + for x in input_lengths: + mod = x % n_split + val = x // n_split + 1 + for _ in range(n_split): + if mod > 0: + output.append(val) + mod -= 1 + else: + output.append(val - 1) + return [np.array(output).astype(np.int32)] + + @given(**hu.gcs_cpu_only) + def test_length_split_edge(self, gc, dc): + input_lengths = np.array([3, 4, 5]).astype(np.int32) + n_split_ = np.array([5]).astype(np.int32) + # Expected output: + # [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + @given(**hu.gcs_cpu_only) + def test_length_split_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split = 3 + # Expected output: + # [3, 3, 3, 2, 1, 1, 2, 2, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths'], + ['Y'], n_split=n_split + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths], + reference=lambda x : self._length_split_op_ref(x, [n_split]), + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths], [0]) + + @given(**hu.gcs_cpu_only) + def test_length_split_override_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split_ignored = 2 + n_split_used = np.array([3]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], n_split=n_split_ignored + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_used], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_used], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_even_divide(self, m, n_split, gc, dc): + # multiples of n_split + input_lengths = np.random.randint(100, size=m).astype(np.int32) * n_split + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, 
[input_lengths, n_split_], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_random(self, m, n_split, gc, dc): + input_lengths = np.random.randint(100, size=m).astype(np.int32) + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + +if __name__ == "__main__": + import unittest + unittest.main() From 3ad67c60f078f62f3d67e7228cf7b8a1a0f6e3e8 Mon Sep 17 00:00:00 2001 From: James Reed Date: Mon, 10 Sep 2018 16:53:25 -0700 Subject: [PATCH 016/237] Traceable explicit Variable instantiation (#11463) Summary: There's a bunch of legacy code where people are explicitly instantiating Variable, and these call-sites have thus far been untraceable (appearing as prim::Constant nodes with the tensor value at the time of tracing). This makes it so that the new variable inherits the traced Value* from the tensor it's being constructed from Pull Request resolved: https://github.com/pytorch/pytorch/pull/11463 Differential Revision: D9756529 Pulled By: jamesr66a fbshipit-source-id: da99c6a7621957a305f2699ec9cb9def69b1b2d7 --- test/test_jit.py | 9 +++++++++ torch/csrc/autograd/python_legacy_variable.cpp | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/test/test_jit.py b/test/test_jit.py index 32ccf392f60b93..382daa03ec1614 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1676,6 +1676,15 @@ def forward(self, x, lengths, hiddens): torch.randn(5, 3, 10), torch.LongTensor([3, 3, 2]), torch.randn(2, 3, 20), torch.randn(2, 3, 20) self.assertEqual(traced(x, lengths, (h0, c0)), imported(x, lengths, (h0, c0))) + def test_trace_variable_instantiation(self): + def random_foo(x): + return Variable(Variable(x) + 1.0) + + random_foo_traced = torch.jit.trace(random_foo, (torch.rand(3, 4),)) + + x = torch.rand(5, 6) + self.assertEqual(random_foo(x), random_foo_traced(x)) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp index 339e58cde4e56c..3ba7ff94bc1fd7 100644 --- a/torch/csrc/autograd/python_legacy_variable.cpp +++ b/torch/csrc/autograd/python_legacy_variable.cpp @@ -6,6 +6,7 @@ #include "torch/csrc/autograd/python_function.h" #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/tensor/python_tensor.h" +#include "torch/csrc/jit/tracer.h" using namespace at; @@ -67,6 +68,12 @@ static PyObject *THPVariable_pynew(PyTypeObject* type, PyObject *args, PyObject var.set_name(name); } + if (jit::tracer::isTracing() && data && data != Py_None && THPVariable_Check(data)) { + if (auto *v = jit::tracer::getValueTrace(((THPVariable*)data)->cdata)) { + jit::tracer::setValueTrace(var, v); + } + } + return THPVariable_Wrap(std::move(var)); END_HANDLE_TH_ERRORS } From 3e49a694660711a20acb3dd4b5c41fc8679ee48b Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 10 Sep 2018 17:07:37 -0700 Subject: [PATCH 017/237] Resolve ambiguity when including both caffe2 and aten registries (#11411) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11411 Simple fix Reviewed By: goldsborough Differential Revision: D9730371 
fbshipit-source-id: f841327c01faa13cfb6b7fc6e279b8fc50fad1db
---
 caffe2/core/registry.h | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h
index c7f3a7af539d51..f353e8d6601fab 100644
--- a/caffe2/core/registry.h
+++ b/caffe2/core/registry.h
@@ -17,21 +17,13 @@
 #include 
 #include 
 
+#include 
+
 #include "caffe2/core/common.h"
 #include "caffe2/core/typeid.h"
 
 namespace caffe2 {
 
-template 
-inline void PrintOffendingKey(const KeyType& /*key*/) {
-  printf("[key type printing not supported]\n");
-}
-
-template <>
-inline void PrintOffendingKey(const string& key) {
-  printf("Offending key: %s.\n", key.c_str());
-}
-
 /**
  * @brief A template class that allows one to register classes by keys.
  *
@@ -59,7 +51,7 @@ class CAFFE2_API Registry {
     std::lock_guard lock(register_mutex_);
     if (registry_.count(key) != 0) {
       printf("Key already registered.\n");
-      PrintOffendingKey(key);
+      at::PrintOffendingKey(key);
       std::exit(1);
     }
     registry_[key] = creator;

From e1e69446f6c3419c9a587a692c571bf77f9cc047 Mon Sep 17 00:00:00 2001
From: Orion Reblitz-Richardson
Date: Mon, 10 Sep 2018 17:07:52 -0700
Subject: [PATCH 018/237] Lockdown NO_TEST=1 for tests even more (#11415)

Summary:
Skip the torch tests as well when the NO_TEST=1 environment variable is set.

Also remove the separate ATen code path for building without Caffe2, since ATen will always be built with Caffe2.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/11415

Reviewed By: soumith

Differential Revision: D9758179

Pulled By: orionr

fbshipit-source-id: e3e3327364fccdc57a703aeaad8c4f30452973fb
---
 aten/src/ATen/CMakeLists.txt | 34 ++++++++++++++++++----------------
 tools/build_libtorch.py      |  1 +
 torch/CMakeLists.txt         |  6 +++---
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index f6d296dfe79e45..d65211e2269d98 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -405,30 +405,32 @@ else()
 endif()
 
 if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
-  foreach(test_src ${ATen_CPU_TEST_SRCS})
-    get_filename_component(test_name ${test_src} NAME_WE)
-    add_executable(${test_name} "${test_src}")
-    target_include_directories(
-      ${test_name} PRIVATE $)
-    target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE})
-    target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE})
-    target_link_libraries(${test_name} ATen_cpu)
-    add_test(NAME ${test_name} COMMAND $)
-    install(TARGETS ${test_name} DESTINATION test)
-  endforeach()
-
-  if(USE_CUDA OR USE_ROCM)
-    foreach(test_src ${ATen_CUDA_TEST_SRCS})
+  if (BUILD_TEST)
+    foreach(test_src ${ATen_CPU_TEST_SRCS})
       get_filename_component(test_name ${test_src} NAME_WE)
-      torch_cuda_based_add_executable(${test_name} "${test_src}")
+      add_executable(${test_name} "${test_src}")
       target_include_directories(
         ${test_name} PRIVATE $)
       target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE})
       target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE})
-      target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda)
+      target_link_libraries(${test_name} ATen_cpu)
       add_test(NAME ${test_name} COMMAND $)
       install(TARGETS ${test_name} DESTINATION test)
     endforeach()
+
+    if(USE_CUDA OR USE_ROCM)
+      foreach(test_src ${ATen_CUDA_TEST_SRCS})
+        get_filename_component(test_name ${test_src} NAME_WE)
+        torch_cuda_based_add_executable(${test_name} "${test_src}")
+        
target_include_directories( + ${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION test) + endforeach() + endif() endif() # Make sure these don't get built by parent diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index 46b73687a12064..8428282ab7b9f8 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -12,6 +12,7 @@ args = parser.parse_args() os.environ['BUILD_TORCH'] = 'ON' + os.environ['BUILD_TEST'] = 'ON' os.environ['ONNX_NAMESPACE'] = 'onnx_torch' os.environ['PYTORCH_PYTHON'] = sys.executable diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index fb0b76e8aeb91f..614dbf3937742d 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -9,7 +9,7 @@ else() option(USE_CUDA "Use CUDA" ON) endif() -option(BUILD_TORCH_TEST "Build torch test binaries" ON) +option(BUILD_TEST "Build torch test binaries" ON) option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) # TODO: Unify with version from setup.py @@ -403,7 +403,7 @@ install(TARGETS torch ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") # JIT Tests. TODO: Put into test/cpp/jit folder -if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) +if (BUILD_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) @@ -416,7 +416,7 @@ if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() endif() -if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) +if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") add_executable(test_api From a17528277678b0d6dc76ffce1c4940c0b995abc2 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Mon, 10 Sep 2018 17:11:16 -0700 Subject: [PATCH 019/237] Flags for LMDB, LevelDB, and Caffe2 ops (#11462) Summary: Add flags for LMDB and LevelDB, default `OFF`. These can be enabled with ``` USE_LMDB=1 USE_LEVELDB=1 python setup.py build_deps ``` Also add a flag to build Caffe2 ops, which is default `ON`. 
Disable with ``` NO_CAFFE2_OPS=1 python setup.py build_deps ``` cc Yangqing soumith pjh5 mingzhe09088 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11462 Reviewed By: soumith Differential Revision: D9758156 Pulled By: orionr fbshipit-source-id: 95fd206d72fdf44df54fc5d0aeab598bff900c63 --- .jenkins/caffe2/build.sh | 2 +- CMakeLists.txt | 1 + caffe2/CMakeLists.txt | 6 ++++-- caffe2/contrib/aten/CMakeLists.txt | 2 +- caffe2/share/contrib/CMakeLists.txt | 2 +- cmake/Dependencies.cmake | 2 +- cmake/Summary.cmake | 1 + scripts/onnx/install-develop.sh | 2 +- scripts/onnx/install.sh | 2 +- setup.py | 18 ++++++++++++++++-- tools/build_pytorch_libs.bat | 3 +++ tools/build_pytorch_libs.sh | 3 +++ tools/setup_helpers/build.py | 3 +++ 13 files changed, 37 insertions(+), 10 deletions(-) diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index d9b2a2e096a1e5..ffcbbc136b50d5 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -226,7 +226,7 @@ else export MAX_JOBS=`expr $(nproc) - 1` fi - USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user + USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user # This is to save test binaries for testing cp -r torch/lib/tmp_install $INSTALL_PREFIX diff --git a/CMakeLists.txt b/CMakeLists.txt index b7f56f96e87f3a..8dc6dc146ffb79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) +option(BUILD_CAFFE2_OPS "Build Caffe2 operators" ON) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." 
ON diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 32183e212c923c..8672f836d89c69 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -86,8 +86,10 @@ add_subdirectory(mobile) add_subdirectory(mpi) add_subdirectory(observers) add_subdirectory(onnx) -add_subdirectory(operators) -add_subdirectory(operators/rnn) +if (BUILD_CAFFE2_OPS) + add_subdirectory(operators) + add_subdirectory(operators/rnn) +endif() add_subdirectory(opt) add_subdirectory(perfkernels) add_subdirectory(python) diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt index 92eb671e019cb7..add3918d4c3373 100644 --- a/caffe2/contrib/aten/CMakeLists.txt +++ b/caffe2/contrib/aten/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT BUILD_ATEN_MOBILE) +if(NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) # Add source generated by Codegen.cmake and pass to parent list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc) list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc) diff --git a/caffe2/share/contrib/CMakeLists.txt b/caffe2/share/contrib/CMakeLists.txt index 01af2c0616dfce..0fc3a4186f0189 100644 --- a/caffe2/share/contrib/CMakeLists.txt +++ b/caffe2/share/contrib/CMakeLists.txt @@ -1,4 +1,4 @@ -if (USE_NNPACK) +if (USE_NNPACK AND BUILD_CAFFE2_OPS) add_subdirectory(nnpack) endif() if (USE_ZSTD) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 67b65c0dcba74a..fed64398276078 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -770,7 +770,7 @@ if (USE_NNAPI AND NOT ANDROID) caffe2_update_option(USE_NNAPI OFF) endif() -if (NOT BUILD_ATEN_MOBILE) +if (NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) list(APPEND Caffe2_DEPENDENCY_LIBS aten_op_header_gen) if (USE_CUDA) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 846f4d6154b8ba..16d18ac7634d0d 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -38,6 +38,7 @@ function (caffe2_print_configuration_summary) message(STATUS " Python includes : ${PYTHON_INCLUDE_DIRS}") message(STATUS " Python site-packages: ${PYTHON_SITE_PACKAGES}") endif() + message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") diff --git a/scripts/onnx/install-develop.sh b/scripts/onnx/install-develop.sh index 02888b11901aa6..24bb4bd5808a1c 100755 --- a/scripts/onnx/install-develop.sh +++ b/scripts/onnx/install-develop.sh @@ -15,4 +15,4 @@ pip install -e "$tp2_dir/onnx" # Install caffe2 and pytorch pip install -r "$top_dir/caffe2/requirements.txt" pip install -r "$top_dir/requirements.txt" -USE_OPENCV=1 BUILD_BINARY=1 python setup.py build_deps develop +USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py build_deps develop diff --git a/scripts/onnx/install.sh b/scripts/onnx/install.sh index 7d4a3139d2e002..29da7e17baa00a 100755 --- a/scripts/onnx/install.sh +++ b/scripts/onnx/install.sh @@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx" # Install caffe2 and pytorch pip install -r "$top_dir/caffe2/requirements.txt" pip install -r "$top_dir/requirements.txt" -USE_OPENCV=1 BUILD_BINARY=1 python setup.py install +USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py install diff --git a/setup.py b/setup.py index 34d2e7e5402840..7edd89cbd7fcb6 100644 --- a/setup.py +++ b/setup.py @@ -46,12 +46,21 @@ # disables use of system-wide nccl (we will use our submoduled # copy in 
third_party/nccl) # +# NO_CAFFE2_OPS +# disable Caffe2 operators build +# # USE_GLOO_IBVERBS # toggle features related to distributed support # # USE_OPENCV # enables use of OpenCV for additional operators # +# USE_LEVELDB +# enables use of LevelDB for storage +# +# USE_LMBD +# enables use of LMDB for storage +# # BUILD_BINARY # enables the additional binaries/ build # @@ -144,11 +153,13 @@ def hotpatch_var(var, prefix='USE_'): list(map(hotpatch_var, use_env_vars)) # Also hotpatch a few with BUILD_* equivalent -build_env_vars = ['BINARY', 'TEST'] +build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS'] [hotpatch_var(v, 'BUILD_') for v in build_env_vars] from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION -from tools.setup_helpers.build import BUILD_BINARY, BUILD_TEST, USE_OPENCV +from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST, + BUILD_CAFFE2_OPS, USE_LEVELDB, + USE_LMDB, USE_OPENCV) from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) @@ -376,7 +387,10 @@ def build_libs(libs): my_env["BUILD_PYTHON"] = "ON" my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF" my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF" + my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF" my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF" + my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF" + my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF" my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF" try: diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 80d79dc4364260..34d7ea1ac0d206 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -180,10 +180,13 @@ goto:eof -DBUILD_BINARY=%BUILD_BINARY% ^ -DBUILD_TEST=OFF ^ -DINSTALL_TEST=%INSTALL_TEST% ^ + -DBUILD_CAFFE2_OPS=%BUILD_CAFFE2_OPS% ^ -DONNX_NAMESPACE=%ONNX_NAMESPACE% ^ -DUSE_CUDA=%USE_CUDA% ^ -DUSE_CUDNN=OFF ^ -DUSE_NNPACK=%USE_NNPACK% ^ + -DUSE_LEVELDB=%USE_LEVELDB% ^ + -DUSE_LMDB=%USE_LMDB% ^ -DUSE_OPENCV=%USE_OPENCV% ^ -DUSE_GLOG=OFF ^ -DUSE_GFLAGS=OFF ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index ba0bde0d577347..d09dc756d82d8e 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -263,11 +263,14 @@ function build_caffe2() { -DBUILD_BINARY=$BUILD_BINARY \ -DBUILD_TEST=$BUILD_TEST \ -DINSTALL_TEST=$INSTALL_TEST \ + -DBUILD_CAFFE2_OPS=$BUILD_CAFFE2_OPS \ -DONNX_NAMESPACE=$ONNX_NAMESPACE \ -DUSE_CUDA=$USE_CUDA \ -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ -DUSE_ROCM=$USE_ROCM \ -DUSE_NNPACK=$USE_NNPACK \ + -DUSE_LEVELDB=$USE_LEVELDB \ + -DUSE_LMDB=$USE_LMDB \ -DUSE_OPENCV=$USE_OPENCV \ -DUSE_GLOG=OFF \ -DUSE_GFLAGS=OFF \ diff --git a/tools/setup_helpers/build.py b/tools/setup_helpers/build.py index 82364bd7394c51..09e26cb5aa1b20 100644 --- a/tools/setup_helpers/build.py +++ b/tools/setup_helpers/build.py @@ -2,4 +2,7 @@ BUILD_BINARY = check_env_flag('BUILD_BINARY') BUILD_TEST = not check_negative_env_flag('BUILD_TEST') +BUILD_CAFFE2_OPS = not check_negative_env_flag('BUILD_CAFFE2_OPS') +USE_LEVELDB = check_env_flag('USE_LEVELDB') +USE_LMDB = check_env_flag('USE_LMDB') USE_OPENCV = check_env_flag('USE_OPENCV') From 9cfdf0d677b2d7f7db83b56fce42fdfd6e5f26c3 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 18:19:18 -0700 Subject: [PATCH 020/237] Document the Embedding module (#11469) Summary: ebetica soumith ezyang Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/11469 Differential Revision: D9757547 Pulled By: goldsborough fbshipit-source-id: a95673abe949bb81d716dbc03c5c3e2a11cc15d3 --- .../api/include/torch/nn/modules/embedding.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/torch/csrc/api/include/torch/nn/modules/embedding.h b/torch/csrc/api/include/torch/nn/modules/embedding.h index bc33f8df74f75c..c2a701eb3ede24 100644 --- a/torch/csrc/api/include/torch/nn/modules/embedding.h +++ b/torch/csrc/api/include/torch/nn/modules/embedding.h @@ -10,12 +10,16 @@ namespace torch { namespace nn { +/// Options for the `Embedding` module. struct EmbeddingOptions { EmbeddingOptions(int64_t count, int64_t dimension); + /// The number of embeddings (number of rows in the table). TORCH_ARG(int64_t, count); + /// The size of each embedding vector (number of columns in the table). TORCH_ARG(int64_t, dimension); }; +/// Performs a lookup in a fixed size embedding table. class EmbeddingImpl : public torch::nn::Cloneable { public: EmbeddingImpl(int64_t count, int64_t dimension) @@ -23,12 +27,23 @@ class EmbeddingImpl : public torch::nn::Cloneable { explicit EmbeddingImpl(EmbeddingOptions options); void reset() override; - Tensor forward(Tensor); + /// Performs a lookup on the embedding table stored in `weight` using the + /// `indices` supplied and returns the result. + Tensor forward(Tensor indices); + + /// The `Options` used to configure this `Embedding` module. + /// Changes to `EmbeddingOptions` *after construction* have no effect. EmbeddingOptions options; + + /// The embedding table. Tensor weight; }; +/// A `ModuleHolder` subclass for `EmbeddingImpl`. +/// See the documentation for `EmbeddingImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Embedding); } // namespace nn From dd8defeb3f02f6307021cf690438a912265a5437 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 19:41:46 -0700 Subject: [PATCH 021/237] Document the Functional module (#11460) Summary: Document the `Functional` module in the C++ API. ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11460 Differential Revision: D9757555 Pulled By: goldsborough fbshipit-source-id: 15f8bf6d60bd26f3f4e69fb8e414e186e3c220ee --- test/cpp/api/modules.cpp | 1 + .../api/include/torch/nn/modules/functional.h | 45 ++++++++++++++++++- torch/csrc/api/src/nn/modules/functional.cpp | 4 +- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 95f38d15a4c822..2da1e1c45bf6f6 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -226,6 +226,7 @@ TEST_CASE("modules") { REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); was_called = false; + // Use the call operator overload here. output = functional(torch::ones(5, torch::requires_grad())); REQUIRE(was_called); REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); diff --git a/torch/csrc/api/include/torch/nn/modules/functional.h b/torch/csrc/api/include/torch/nn/modules/functional.h index 4e234a8ad3fc80..44d11d1ffa6ee0 100644 --- a/torch/csrc/api/include/torch/nn/modules/functional.h +++ b/torch/csrc/api/include/torch/nn/modules/functional.h @@ -6,16 +6,51 @@ #include #include +#include namespace torch { namespace nn { -// Lets you create a container from a function, designed for use in -// Sequential. 
+/// Wraps a function in a `Module`. +/// +/// The `Functional` module allows wrapping an arbitrary function or function +/// object in an `nn::Module`. This is primarily handy for usage in +/// `Sequential`. +/// +/// \rst +/// .. code-block:: +/// Sequential sequential( +/// Linear(3, 4), +/// Functional(torch::relu), +/// BatchNorm(3), +/// Functional(torch::elu, /*alpha=*/1)); +/// \endrst +/// +/// While a `Functional` module only accepts a single `Tensor` as input, it is +/// possible for the the wrapped function to accept further arguments. However, +/// these have to be bound *at construction time*. For example, if +/// you want to wrap `torch::leaky_relu`, which accepts a `slope` scalar as its +/// second argument, with a particular value for its `slope` in a `Functional` +/// module, you could write `Functional(torch::leaky_relu, /*slope=*/0.5)`. The +/// value of `0.5` is then stored within the `Functional` object and supplied to +/// the function call at invocation time. Note that such bound values are +/// evaluated eagerly and stored a single time. See the documentation of +/// [`std::bind`](https://en.cppreference.com/w/cpp/utility/functional/bind) for +/// more information on the semantics of argument binding. +/// +/// \rst +/// .. attention:: +/// After passing any bound arguments, the function must accept a single +/// tensor and return a single tensor. +/// \endrst +/// +/// Note that `Functional` overloads the call operator (`operator()`) such that +/// you can invoke it with `my_func(...)`. class FunctionalImpl : public torch::nn::Cloneable { public: using Function = std::function; + /// Constructs a `Functional` from a function object. explicit FunctionalImpl(Function function); template < @@ -34,6 +69,8 @@ class FunctionalImpl : public torch::nn::Cloneable { } void reset() override; + + /// Forwards the `input` tensor to the underlying (bound) function object. Tensor forward(Tensor input); /// Calls forward(input). @@ -43,6 +80,10 @@ class FunctionalImpl : public torch::nn::Cloneable { Function function_; }; +/// A `ModuleHolder` subclass for `FunctionalImpl`. +/// See the documentation for `FunctionalImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Functional); } // namespace nn diff --git a/torch/csrc/api/src/nn/modules/functional.cpp b/torch/csrc/api/src/nn/modules/functional.cpp index 591634db23640e..1820e8f4df2384 100644 --- a/torch/csrc/api/src/nn/modules/functional.cpp +++ b/torch/csrc/api/src/nn/modules/functional.cpp @@ -13,11 +13,11 @@ FunctionalImpl::FunctionalImpl(Function function) void FunctionalImpl::reset() {} Tensor FunctionalImpl::forward(Tensor input) { - return function_(input); + return function_(std::move(input)); } Tensor FunctionalImpl::operator()(Tensor input) { - return forward(input); + return forward(std::move(input)); } } // namespace nn } // namespace torch From f9d12eeb270d74e76ac2599495b95e8c4f5cb69b Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 10 Sep 2018 20:36:49 -0700 Subject: [PATCH 022/237] Give copy an optional device argument. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11497 Differential Revision: D9762014 Pulled By: gchanan fbshipit-source-id: 996419cc5e86d000af953d030ff361adafb921ad --- aten/src/ATen/templates/TensorMethods.h | 5 ++--- aten/src/ATen/templates/Type.h | 2 +- aten/src/ATen/templates/TypeDefault.cpp | 7 +++++-- aten/src/ATen/templates/TypeDefault.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 07e5f2b634372c..5573f5802809e9 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -7,7 +7,6 @@ #include "ATen/core/SparseTensorRef.h" #include "ATen/Type.h" #include "ATen/core/TensorOptions.h" -#include "ATen/DeviceGuard.h" namespace at { @@ -56,8 +55,8 @@ inline Tensor to( AT_CHECK(tensor.is_variable() == options.is_variable(), "cannot change is_variable, from: ", tensor.is_variable(), " to: ", options.is_variable()); - DeviceGuard guard(options.device()); - return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()).copy(tensor, non_blocking); + return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()) + .copy(tensor, non_blocking, options.device()); } } // namespace detail diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 3a7080ea201e35..183b2d4d498026 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -99,7 +99,7 @@ struct AT_API Type { return backendToDeviceType(backend()); } - virtual Tensor copy(const Tensor & src, bool non_blocking=false) const = 0; + virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const = 0; virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0; virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 54c6c282dcb95d..4e9c454977d588 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -22,8 +22,11 @@ Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking return s_copy_(self, b_src, non_blocking); } -Tensor TypeDefault::copy(const Tensor & src, bool non_blocking) const { - // TODO(psag): have a DeviceGuard here +Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional to_device) const { + DeviceGuard device_guard; + if (to_device.has_value()) { + device_guard.set_index(to_device.value().index()); + } AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); if (is_sparse()) { auto indices = src._indices(); diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index 64ec158f82349e..0f315e590d700c 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -25,7 +25,7 @@ struct AT_API TypeDefault : public Type { Type & toBackend(Backend b) const override; Type & toScalarType(ScalarType s) const override; - Tensor copy(const Tensor & src, bool non_blocking=false) const override; + Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const override; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const override; void backward(Tensor & self, at::optional gradient, bool 
keep_graph, bool create_graph) const override; From b14a80553d550fa5e2fe588ca50d885d643bce5e Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 20:45:03 -0700 Subject: [PATCH 023/237] Ignore functional doc error Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11508 Differential Revision: D9764380 Pulled By: goldsborough fbshipit-source-id: 3abb9c04f46137be833ea26d67734741e14f8010 --- docs/cpp/check-doxygen.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cpp/check-doxygen.sh b/docs/cpp/check-doxygen.sh index 58e65b403e39bf..375347974ac749 100755 --- a/docs/cpp/check-doxygen.sh +++ b/docs/cpp/check-doxygen.sh @@ -20,6 +20,7 @@ ignore_warning "warning: no uniquely matching class member found for" ignore_warning "warning: source ../../build/aten/src/ is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Tensor.h is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Functions.h is not a readable file" +ignore_warning "warning: documented symbol \`torch::nn::FunctionalImpl::FunctionalImpl' was not declared or defined" # Count the number of remaining warnings. warnings=$(grep 'warning:' doxygen-log.txt | wc -l) From 0988bbad2de5e0ce403c5e6f781437b24a484fc2 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Mon, 10 Sep 2018 23:21:36 -0700 Subject: [PATCH 024/237] C10d release to torch.distributed for PT1 (#11405) Summary: The old `torch.distributed` will go to `torch.distributed.deprecated` The old DDP will go to `torch.nn.parallel.deprecated` Now `torch.nn.parallel.DDP` will use c10d DDP Now `torch.distributed` will use C10d frontend API Pull Request resolved: https://github.com/pytorch/pytorch/pull/11405 Reviewed By: pietern Differential Revision: D9733733 Pulled By: teng-li fbshipit-source-id: d6a3f3e73f8d3a7fcb1f4baef53c78063b8cbb08 --- test/run_test.py | 8 +- test/test_c10d.py | 8 +- test/test_distributed.py | 6 +- test/test_thd_distributed.py | 6 +- torch/csrc/distributed/Module.cpp | 4 +- torch/csrc/distributed/c10d/init.cpp | 2 +- torch/distributed/__init__.py | 563 +---------------- torch/distributed/c10d/__init__.py | 13 - torch/distributed/deprecated/__init__.py | 566 ++++++++++++++++++ .../{ => deprecated}/remote_types.py | 2 +- .../{c10d => }/distributed_c10d.py | 0 torch/distributed/{c10d => }/rendezvous.py | 0 torch/nn/parallel/__init__.py | 3 +- torch/nn/parallel/deprecated/__init__.py | 4 + torch/nn/parallel/deprecated/distributed.py | 482 +++++++++++++++ .../distributed_cpu.py} | 17 +- torch/nn/parallel/distributed.py | 443 ++++++-------- torch/nn/parallel/distributed_c10d.py | 366 ----------- torch/nn/parallel/distributed_cpu.py | 5 +- torch/utils/data/distributed.py | 10 +- 20 files changed, 1265 insertions(+), 1243 deletions(-) delete mode 100644 torch/distributed/c10d/__init__.py create mode 100644 torch/distributed/deprecated/__init__.py rename torch/distributed/{ => deprecated}/remote_types.py (96%) rename torch/distributed/{c10d => }/distributed_c10d.py (100%) rename torch/distributed/{c10d => }/rendezvous.py (100%) create mode 100644 torch/nn/parallel/deprecated/__init__.py create mode 100644 torch/nn/parallel/deprecated/distributed.py rename torch/nn/parallel/{distributed_c10d_cpu.py => deprecated/distributed_cpu.py} (88%) delete mode 100644 torch/nn/parallel/distributed_c10d.py diff --git a/test/run_test.py b/test/run_test.py index 40095c9736095c..d7af8e47ab8767 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -14,7 +14,7 @@ import torch from torch.utils 
import cpp_extension from common import TEST_WITH_ROCM -import torch.distributed.c10d as c10d +import torch.distributed as dist TESTS = [ 'autograd', @@ -60,12 +60,12 @@ } -if c10d.is_available(): - if c10d.is_mpi_available(): +if dist.is_available(): + if dist.is_mpi_available(): DISTRIBUTED_TESTS_CONFIG['mpi'] = { 'WORLD_SIZE': '3' } - if c10d.is_nccl_available(): + if dist.is_nccl_available(): DISTRIBUTED_TESTS_CONFIG['nccl'] = { 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' } diff --git a/test/test_c10d.py b/test/test_c10d.py index 4e804ee1635ac4..64bedb3183a212 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -14,8 +14,8 @@ import common from torch import nn import torch.nn.functional as F -from torch.distributed import c10d -from torch.nn.parallel import distributed_c10d +import torch.distributed as c10d +from torch.nn.parallel import DistributedDataParallel from common import TestCase @@ -570,7 +570,7 @@ def world_size(self): def _test_ddp_with_process_group(self, process_group): gpus = gpus_for_rank(self.world_size)[self.rank] model = Net() - ddp_model = distributed_c10d._DistributedDataParallelC10d( + ddp_model = DistributedDataParallel( copy.deepcopy(model).cuda(gpus[0]), device_ids=gpus, process_group=process_group) @@ -731,7 +731,7 @@ def test_fp16(self): gpus = gpus_for_rank(self.world_size)[self.rank] model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half() nn.init.constant_(model.weight, 1) - ddp_model = distributed_c10d._DistributedDataParallelC10d( + ddp_model = DistributedDataParallel( model, device_ids=[gpus[0]], process_group=process_group, diff --git a/test/test_distributed.py b/test/test_distributed.py index c37eac2689e4b9..41fa5cdac0c827 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -12,7 +12,7 @@ import torch import torch.cuda -import torch.distributed.c10d as dist +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -1149,7 +1149,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel._DistributedDataParallelC10d( + model_DDP = nn.parallel.DistributedDataParallel( model_DDP, device_ids=gpu_subset ) @@ -1183,7 +1183,7 @@ def test_DistributedDataParallelCPU(self): # DDP-CPU training setup model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel._DistributedDataParallelC10dCPU(model_DDP) + model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) # dummy data initialization local_bs = 2 diff --git a/test/test_thd_distributed.py b/test/test_thd_distributed.py index 47dbe9d056f154..a104baeb3fb451 100644 --- a/test/test_thd_distributed.py +++ b/test/test_thd_distributed.py @@ -11,7 +11,7 @@ import torch import torch.cuda -import torch.distributed as dist +import torch.distributed.deprecated as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -985,7 +985,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel.DistributedDataParallel( + model_DDP = nn.parallel.deprecated.DistributedDataParallel( model_DDP, device_ids=gpu_subset ) @@ -1019,7 +1019,7 @@ def test_DistributedDataParallelCPU(self): # DDP-CPU training setup model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + model_DDP = nn.parallel.deprecated.DistributedDataParallelCPU(model_DDP) # dummy data 
initialization local_bs = 2 diff --git a/torch/csrc/distributed/Module.cpp b/torch/csrc/distributed/Module.cpp index 9bf833e0b7a570..84d655288fe373 100644 --- a/torch/csrc/distributed/Module.cpp +++ b/torch/csrc/distributed/Module.cpp @@ -121,7 +121,7 @@ static THDReduceOp _getReduceOp(PyObject *obj) auto it = obj2reduceop.find(obj); if (it == obj2reduceop.end()) { throw std::runtime_error("op should be a constant from " - "torch.distributed.reduce_op"); + "torch.distributed.deprecated.reduce_op"); } return it->second; } @@ -132,7 +132,7 @@ static THDGroup _getGroup(PyObject *obj) if (it == obj2group.end()) { if (!THPUtils_checkLong(obj)) throw std::runtime_error("group should be an int or one of the values " - "from torch.distributed.group"); + "from torch.distributed.deprecated.group"); return THPUtils_unpackLong(obj); } return it->second; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 021e5e01247d15..bfe381baf62f1f 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -34,7 +34,7 @@ using shared_ptr_class_ = py::class_>; PyObject* c10d_init(PyObject* _unused) { auto c10d_module = - THPObjectPtr(PyImport_ImportModule("torch.distributed.c10d")); + THPObjectPtr(PyImport_ImportModule("torch.distributed")); if (!c10d_module) { throw python_error(); } diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index e717ae724c66a3..5356097743aa3c 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -1,566 +1,13 @@ -""" -torch.distributed provides an MPI-like interface for exchanging tensor -data across multi-machine networks. It supports a few different backends -and initialization methods. -""" import torch -import atexit -import warnings -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - - -class dist_backend: - UNDEFINED = -1 - TCP = 0 - MPI = 1 - GLOO = 2 - NCCL = 3 - - -_INITIALIZED_PG = 1 -_INITIALIZED_MW = 2 -_initialized = 0 -_backend = dist_backend.UNDEFINED -_scope = locals() - - -def _extend_scope(module): - _scope.update({k: getattr(module, k) for k in dir(module) if not k.startswith('_')}) def is_available(): - return torch._C._has_distributed() - - -def destroy_process_group(): - r"""Destroy the initialized distributed package - """ - global _backend - global _initialized - torch._C._dist_destroy_process_group() - _backend = dist_backend.UNDEFINED - _initialized = 0 - - -def is_initialized(): - r"""Checking if the process group has been initialized - """ - return _initialized == _INITIALIZED_PG - - -def init_process_group(backend, init_method='env://', **kwargs): - r"""Initializes the distributed package. - - Arguments: - backend (str): Name of the backend to use. Depending on build-time configuration - valid values include: ``tcp``, ``mpi``, ``gloo`` and ``nccl``. - init_method (str, optional): URL specifying how to initialize the package. - world_size (int, optional): Number of processes participating in the job. - rank (int, optional): Rank of the current process. - group_name (str, optional): Group name. See description of init methods. - - To enable ``backend == mpi``, PyTorch needs to built from source on a system that - supports MPI. If you want to use Open MPI with CUDA-aware support, please use - Open MPI major version 2 and above. - - .. note:: - This method initializes CUDA context. 
Therefore, if multiple processes - run on a single machine but use different GPUs, make sure to use - :func:`torch.cuda.set_device` before this method to avoid unnecessarily - creating context on the first visible device. - - """ - world_size = kwargs.pop('world_size', -1) - group_name = kwargs.pop('group_name', '') - rank = kwargs.pop('rank', -1) - assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) - - if not is_available(): - raise RuntimeError("PyTorch built without distributed support") - - global _initialized - if _initialized: - raise RuntimeError("trying to initialize torch.distributed twice!") - - # Checking and assigning the distributed backend - global _backend - - backend = backend.lower() - if backend == "tcp": - _backend = dist_backend.TCP - elif backend == "mpi": - _backend = dist_backend.MPI - elif backend == "gloo": - _backend = dist_backend.GLOO - elif backend == "nccl": - _backend = dist_backend.NCCL - else: - raise RuntimeError("Invalid distributed backend name: " + backend) - - torch._C._dist_init_process_group(backend, init_method, world_size, - group_name, rank) - _initialized = _INITIALIZED_PG - - if _backend == dist_backend.NCCL: - atexit.register(destroy_process_group) - - if not torch._C._dist_init_extension(False, reduce_op, group): - raise RuntimeError("distributed module initialization failed") - - -def init_master_worker(backend, init_method='env://', **kwargs): - warnings.warn(""" - ================================================================================ - WARNING - ================================================================================ - Master-worker mode is still experimental. The API will change without - notice and we do not guarantee full correctness and expected performance yet. - We'll announce it once it's ready. - """) - world_size = kwargs.pop('world_size', -1) - group_name = kwargs.pop('group_name', '') - rank = kwargs.pop('rank', -1) - assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) - - if not is_available(): - raise RuntimeError("PyTorch built without distributed support") - - global _initialized - if _initialized: - raise RuntimeError("trying to initialize torch.distributed twice!") - torch._C._dist_init_master_worker(backend, init_method, world_size, - group_name, rank) - _initialized = _INITIALIZED_MW - import torch.distributed.collectives as collectives - import torch.distributed.remote_types as remote_types - _extend_scope(collectives) - _extend_scope(remote_types) - if not torch._C._dist_init_extension(True, reduce_op, group): - raise RuntimeError("distributed module initialization failed") - - -class reduce_op(object): - SUM = object() - PRODUCT = object() - MAX = object() - MIN = object() - - -class group(object): - WORLD = object() - - -class _DistributedRequest(object): - def __init__(self, request): - self.request = request - - def is_completed(self): - return torch._C._dist_request_is_completed(self.request) - - def wait(self): - torch._C._dist_request_wait(self.request) - - -def get_rank(): - r"""Returns the rank of current process. - - Rank is a unique identifier assigned to each process within a distributed - group. They are always consecutive integers ranging from ``0`` to - ``world_size - 1`` (inclusive). 
- """ - assert torch.distributed._initialized - return torch._C._dist_get_rank() - - -def get_world_size(): - r"""Returns the number of processes in the distributed group.""" - assert torch.distributed._initialized - return torch._C._dist_get_num_processes() - - -def isend(tensor, dst): - r"""Sends a tensor asynchronously. - - Arguments: - tensor (Tensor): Tensor to send. - dst (int): Destination rank. - - Returns: - A distributed request object. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return _DistributedRequest(torch._C._dist_isend(tensor, dst)) - - -def irecv(tensor, src): - r"""Receives a tensor asynchronously. - - Arguments: - tensor (Tensor): Tensor to fill with received data. - src (int): Source rank. - - Returns: - A distributed request object. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return _DistributedRequest(torch._C._dist_irecv(tensor, src)) - - -def send(tensor, dst): - r"""Sends a tensor synchronously. - - Arguments: - tensor (Tensor): Tensor to send. - dst (int): Destination rank. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_send(tensor, dst) - - -def recv(tensor, src=None): - r"""Receives a tensor synchronously. - - Arguments: - tensor (Tensor): Tensor to fill with received data. - src (int, optional): Source rank. Will receive from any - process if unspecified. - - Returns: - Sender rank. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - if src is None: - return torch._C._dist_recv_any_source(tensor) - return torch._C._dist_recv(tensor, src) - - -def broadcast_multigpu(tensor_list, src, group=group.WORLD): - r"""Broadcasts the tensor to the whole group with multiple GPU tensors - per node. - - :attr:`tensor` must have the same number of elements in all the GPUs from - all processes participating in the collective. each tensor in the list must - be on a different GPU. - - .. note:: - Only NCCL backend is currently supported. :attr:`tensor_list` should only - contain GPU tensors. - - Arguments: - tensor_list (List[Tensor]): Tensors that participate in the collective - operation. if ``src`` is the rank, then the first element of - ``tensor_list`` (``tensor_list[0]``) will be broadcasted to all - other tensors (on different GPUs) in the src process and all tensors - in ``tensor_list`` of other non-src processes. You also need to make - sure that ``len(tensor_list)`` is the same for all the distributed - processes calling this function. - - src (int): Source rank. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - return torch._C._dist_broadcast_multigpu(tensor_list, src, group) - - -def broadcast(tensor, src, group=group.WORLD): - r"""Broadcasts the tensor to the whole group. - - :attr:`tensor` must have the same number of elements in all processes - participating in the collective. - - Arguments: - tensor (Tensor): Data to be sent if :attr:`src` is the rank of - current process, and tensor to be used to save received data - otherwise. - src (int): Source rank. - group (optional): Group of the collective. 
- """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_broadcast(tensor, src, group) - - -def all_reduce_multigpu(tensor_list, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data across all machines in such a way that all get - the final result. This function reduces a number of tensors on every node, - while each tensor resides on a different GPU. - Therefore, the input tensor in the tensor list needs to be GPU tensors. - Also, each tensor in the tensor list needs to reside on a different GPU. - - After the call, all tensors in :attr:`tensor_list` will be bitwise identical - in all processes. - - .. note:: - Only NCCL backend is currently supported. :attr:`tensor_list` should only - contain GPU tensors. - - Arguments: - tensor_list (List[Tensor]): List of input and output tensors of - the collective. The function operates in-place and requires that - each tensor to be a GPU tensor on different GPUs. - You also need to make sure that ``len(tensor_list)`` is the same for - all the distributed processes calling this function. - - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - return torch._C._dist_all_reduce_multigpu(tensor_list, op, group) - - -def all_reduce(tensor, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data across all machines in such a way that all get - the final result. - - After the call :attr:`tensor` will be bitwise identical in all processes. - - Arguments: - tensor (Tensor): Input and output of the collective. The function - operates in-place. - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_all_reduce(tensor, op, group) - - -def reduce_multigpu(tensor_list, dst, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data on multiple GPUs across all machines. Each tensor - in :attr`tensor_list` should reside on a separate GPU. - - Only the GPU of ``tensor_list[0]`` on the process with rank :attr:`dst` is - going to receive the final result. - - .. note:: - Only NCCL backend is currently supported. :attr:`tensor_list` should only - contain GPU tensors. - - Arguments: - tensor_list (List[Tensor]): Input and output GPU tensors of the - collective. The function operates in-place. - You also need to make sure that ``len(tensor_list)`` is the same for - all the distributed processes calling this function. - - dst (int): Destination rank - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - return torch._C._dist_reduce_multigpu(tensor_list, dst, op, group) - - -def reduce(tensor, dst, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data across all machines. - - Only the process with rank :attr:`dst` is going to receive the final result. 
- - Arguments: - tensor (Tensor): Input and output of the collective. The function - operates in-place. - dst (int): Destination rank - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_reduce(tensor, dst, op, group) - - -def all_gather_multigpu(output_tensor_lists, - input_tensor_list, - group=group.WORLD): - r"""Gathers tensors from the whole group in a list. - Each tensor in :attr:`input_tensor_list` should reside on a separate GPU. - - .. note:: - Only NCCL backend is currently supported. :attr:`output_tensor_lists` and - :attr:`input_tensor_list` should only contain GPU tensors. - - Arguments: - output_tensor_lists (List[List[Tensor]]): Output lists. It should - contain correctly-sized tensors on each GPU to be used for output of - the collective. - e.g. ``output_tensor_lists[i]`` contains the all_gather - result that resides on the GPU of ``input_tensor_list[i]``. - Note that each element of ``output_tensor_lists[i]`` has the size of - ``world_size * len(input_tensor_list)``, since the function all - gathers the result from every single GPU in the group. To interpret - each element of ``output_tensor_list[i]``, note that - ``input_tensor_list[j]`` of rank k will be appear in - ``output_tensor_list[i][rank * world_size + j]`` - Also note that ``len(output_tensor_lists)``, and the size of each - element in ``output_tensor_lists`` (each element is a list, - therefore ``len(output_tensor_lists[i])``) need to be the same - for all the distributed processes calling this function. - - input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to - be broadcast from current process. - Note that ``len(input_tensor_list)`` needs to be the same for - all the distributed processes calling this function. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - flatten_tensor_list = [] - for output_tensor_list in output_tensor_lists: - flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list)) - - ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list, - input_tensor_list, - group) - - for output_tensor_list, flatten_tensor in zip(output_tensor_lists, - flatten_tensor_list): - for tensor, value in zip(output_tensor_list, - _unflatten_dense_tensors(flatten_tensor, - output_tensor_list)): - tensor.copy_(value) - - return ret - - -def all_gather(tensor_list, tensor, group=group.WORLD): - r"""Gathers tensors from the whole group in a list. - - Arguments: - tensor_list (list[Tensor]): Output list. It should contain - correctly-sized tensors to be used for output of the collective. - tensor (Tensor): Tensor to be broadcast from current process. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - if _backend != dist_backend.NCCL: - return torch._C._dist_all_gather(tensor_list, tensor, group) - else: - return all_gather_multigpu([tensor_list], [tensor], group) - - -def gather(tensor, **kwargs): - r"""Gathers a list of tensors in a single process. - - Arguments: - tensor (Tensor): Input tensor. - dst (int): Destination rank. 
Required in all processes except the one that - is receiveing the data. - gather_list (list[Tensor]): List of appropriately-sized tensors to - use for received data. Required only in the receiving process. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - my_rank = get_rank() - dst = kwargs.pop('dst', my_rank) - gather_list = kwargs.pop('gather_list', None) - _group = kwargs.pop('group', group.WORLD) - if kwargs: - raise RuntimeError("got unexpected kwargs") - if dst == my_rank: - if gather_list is None: - raise RuntimeError("gather_list is a required argument in gather destination") - return torch._C._dist_gather_recv(gather_list, tensor, _group) - else: - if gather_list: - raise RuntimeError("non-empty gather_list can be given only to gather destination") - return torch._C._dist_gather_send(tensor, dst, _group) - - -def scatter(tensor, **kwargs): - r"""Scatters a list of tensors to all processes in a group. - - Each process will receive exactly one tensor and store its data in the - :attr:`tensor` argument. - - Arguments: - tensor (Tensor): Output tensor. - src (int): Source rank. Required in all processes except the one that - is sending the data. - scatter_list (list[Tensor]): List of tensors to scatter. Required only - in the process that is sending the data. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - my_rank = get_rank() - src = kwargs.pop('src', my_rank) - scatter_list = kwargs.pop('scatter_list', None) - _group = kwargs.pop('group', group.WORLD) - if kwargs: - raise RuntimeError("got unexpected kwargs: {}".format(", ".join(kwargs.keys()))) - if src == my_rank: - if scatter_list is None: - raise RuntimeError("scatter_list is a required argument in scatter source") - return torch._C._dist_scatter_send(scatter_list, tensor, _group) - else: - if scatter_list: - raise RuntimeError("non-empty can be given only to scatter source") - return torch._C._dist_scatter_recv(tensor, src, _group) - - -def barrier(group=group.WORLD): - r"""Synchronizes all processes. - - This collective blocks processes until the whole group enters this function. - - Arguments: - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_barrier(group) - - -def new_group(ranks=None): - r"""Creates a new distributed group. - - This function requires that all processes in the main group (i.e., all - processes that are part of the distributed job) enter this function, even - if they are not going to be members of the group. Additionally, groups - should be created in the same order in all processes. - - Arguments: - ranks (list[int]): List of ranks of group members. - - Returns: - A handle of distributed group that can be given to collective calls. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - if ranks is None: - ranks = list(range(get_world_size())) - return torch._C._dist_new_group(ranks) - - -def _clear_group_cache(group=group.WORLD): - r"""Clear the created distributed group's cached resource. - - Only NCCL backend is currently supported. + return hasattr(torch._C, "_c10d_init") - Cached resource includes NCCL communicators and CUDA events. 
- Arguments: - group (optional): Group of the collective. - """ - return torch._C._dist_clear_group_cache(group) +if is_available() and not torch._C._c10d_init(): + raise RuntimeError("Failed to initialize PyTorch distributed support") -def _register_stream(stream): - if not _initialized: - raise RuntimeError("torch.distributed needs to be initialized first") - return torch._C._dist_register_stream(stream) +if is_available(): + from .distributed_c10d import * diff --git a/torch/distributed/c10d/__init__.py b/torch/distributed/c10d/__init__.py deleted file mode 100644 index 5356097743aa3c..00000000000000 --- a/torch/distributed/c10d/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -import torch - - -def is_available(): - return hasattr(torch._C, "_c10d_init") - - -if is_available() and not torch._C._c10d_init(): - raise RuntimeError("Failed to initialize PyTorch distributed support") - - -if is_available(): - from .distributed_c10d import * diff --git a/torch/distributed/deprecated/__init__.py b/torch/distributed/deprecated/__init__.py new file mode 100644 index 00000000000000..151439aa332bc0 --- /dev/null +++ b/torch/distributed/deprecated/__init__.py @@ -0,0 +1,566 @@ +""" +torch.distributed.deprecated provides an MPI-like interface for exchanging tensor +data across multi-machine networks. It supports a few different backends +and initialization methods. +""" +import torch +import atexit +import warnings +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + + +class dist_backend: + UNDEFINED = -1 + TCP = 0 + MPI = 1 + GLOO = 2 + NCCL = 3 + + +_INITIALIZED_PG = 1 +_INITIALIZED_MW = 2 +_initialized = 0 +_backend = dist_backend.UNDEFINED +_scope = locals() + + +def _extend_scope(module): + _scope.update({k: getattr(module, k) for k in dir(module) if not k.startswith('_')}) + + +def is_available(): + return torch._C._has_distributed() + + +def destroy_process_group(): + r"""Destroy the initialized distributed package + """ + global _backend + global _initialized + torch._C._dist_destroy_process_group() + _backend = dist_backend.UNDEFINED + _initialized = 0 + + +def is_initialized(): + r"""Checking if the process group has been initialized + """ + return _initialized == _INITIALIZED_PG + + +def init_process_group(backend, init_method='env://', **kwargs): + r"""Initializes the distributed package. + + Arguments: + backend (str): Name of the backend to use. Depending on build-time configuration + valid values include: ``tcp``, ``mpi``, ``gloo`` and ``nccl``. + init_method (str, optional): URL specifying how to initialize the package. + world_size (int, optional): Number of processes participating in the job. + rank (int, optional): Rank of the current process. + group_name (str, optional): Group name. See description of init methods. + + To enable ``backend == mpi``, PyTorch needs to built from source on a system that + supports MPI. If you want to use Open MPI with CUDA-aware support, please use + Open MPI major version 2 and above. + + .. note:: + This method initializes CUDA context. Therefore, if multiple processes + run on a single machine but use different GPUs, make sure to use + :func:`torch.cuda.set_device` before this method to avoid unnecessarily + creating context on the first visible device. 
+ + """ + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if not is_available(): + raise RuntimeError("PyTorch built without distributed support") + + global _initialized + if _initialized: + raise RuntimeError("trying to initialize torch.distributed.deprecated twice!") + + # Checking and assigning the distributed backend + global _backend + + backend = backend.lower() + if backend == "tcp": + _backend = dist_backend.TCP + elif backend == "mpi": + _backend = dist_backend.MPI + elif backend == "gloo": + _backend = dist_backend.GLOO + elif backend == "nccl": + _backend = dist_backend.NCCL + else: + raise RuntimeError("Invalid distributed backend name: " + backend) + + torch._C._dist_init_process_group(backend, init_method, world_size, + group_name, rank) + _initialized = _INITIALIZED_PG + + if _backend == dist_backend.NCCL: + atexit.register(destroy_process_group) + + if not torch._C._dist_init_extension(False, reduce_op, group): + raise RuntimeError("distributed module initialization failed") + + +def init_master_worker(backend, init_method='env://', **kwargs): + warnings.warn(""" + ================================================================================ + WARNING + ================================================================================ + Master-worker mode is still experimental. The API will change without + notice and we do not guarantee full correctness and expected performance yet. + We'll announce it once it's ready. + """) + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if not is_available(): + raise RuntimeError("PyTorch built without distributed support") + + global _initialized + if _initialized: + raise RuntimeError("trying to initialize torch.distributed.deprecated twice!") + torch._C._dist_init_master_worker(backend, init_method, world_size, + group_name, rank) + _initialized = _INITIALIZED_MW + import torch.distributed.deprecated.collectives as collectives + import torch.distributed.deprecated.remote_types as remote_types + _extend_scope(collectives) + _extend_scope(remote_types) + if not torch._C._dist_init_extension(True, reduce_op, group): + raise RuntimeError("distributed module initialization failed") + + +class reduce_op(object): + SUM = object() + PRODUCT = object() + MAX = object() + MIN = object() + + +class group(object): + WORLD = object() + + +class _DistributedRequest(object): + def __init__(self, request): + self.request = request + + def is_completed(self): + return torch._C._dist_request_is_completed(self.request) + + def wait(self): + torch._C._dist_request_wait(self.request) + + +def get_rank(): + r"""Returns the rank of current process. + + Rank is a unique identifier assigned to each process within a distributed + group. They are always consecutive integers ranging from ``0`` to + ``world_size - 1`` (inclusive). + """ + assert torch.distributed.deprecated._initialized + return torch._C._dist_get_rank() + + +def get_world_size(): + r"""Returns the number of processes in the distributed group.""" + assert torch.distributed.deprecated._initialized + return torch._C._dist_get_num_processes() + + +def isend(tensor, dst): + r"""Sends a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to send. 
+ dst (int): Destination rank. + + Returns: + A distributed request object. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return _DistributedRequest(torch._C._dist_isend(tensor, dst)) + + +def irecv(tensor, src): + r"""Receives a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int): Source rank. + + Returns: + A distributed request object. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return _DistributedRequest(torch._C._dist_irecv(tensor, src)) + + +def send(tensor, dst): + r"""Sends a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_send(tensor, dst) + + +def recv(tensor, src=None): + r"""Receives a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int, optional): Source rank. Will receive from any + process if unspecified. + + Returns: + Sender rank. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + if src is None: + return torch._C._dist_recv_any_source(tensor) + return torch._C._dist_recv(tensor, src) + + +def broadcast_multigpu(tensor_list, src, group=group.WORLD): + r"""Broadcasts the tensor to the whole group with multiple GPU tensors + per node. + + :attr:`tensor` must have the same number of elements in all the GPUs from + all processes participating in the collective. each tensor in the list must + be on a different GPU. + + .. note:: + Only NCCL backend is currently supported. :attr:`tensor_list` should only + contain GPU tensors. + + Arguments: + tensor_list (List[Tensor]): Tensors that participate in the collective + operation. if ``src`` is the rank, then the first element of + ``tensor_list`` (``tensor_list[0]``) will be broadcasted to all + other tensors (on different GPUs) in the src process and all tensors + in ``tensor_list`` of other non-src processes. You also need to make + sure that ``len(tensor_list)`` is the same for all the distributed + processes calling this function. + + src (int): Source rank. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + + return torch._C._dist_broadcast_multigpu(tensor_list, src, group) + + +def broadcast(tensor, src, group=group.WORLD): + r"""Broadcasts the tensor to the whole group. + + :attr:`tensor` must have the same number of elements in all processes + participating in the collective. + + Arguments: + tensor (Tensor): Data to be sent if :attr:`src` is the rank of + current process, and tensor to be used to save received data + otherwise. + src (int): Source rank. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_broadcast(tensor, src, group) + + +def all_reduce_multigpu(tensor_list, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data across all machines in such a way that all get + the final result. 
This function reduces a number of tensors on every node, + while each tensor resides on a different GPU. + Therefore, the input tensor in the tensor list needs to be GPU tensors. + Also, each tensor in the tensor list needs to reside on a different GPU. + + After the call, all tensors in :attr:`tensor_list` will be bitwise identical + in all processes. + + .. note:: + Only NCCL backend is currently supported. :attr:`tensor_list` should only + contain GPU tensors. + + Arguments: + tensor_list (List[Tensor]): List of input and output tensors of + the collective. The function operates in-place and requires that + each tensor to be a GPU tensor on different GPUs. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + + return torch._C._dist_all_reduce_multigpu(tensor_list, op, group) + + +def all_reduce(tensor, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data across all machines in such a way that all get + the final result. + + After the call :attr:`tensor` will be bitwise identical in all processes. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_all_reduce(tensor, op, group) + + +def reduce_multigpu(tensor_list, dst, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data on multiple GPUs across all machines. Each tensor + in :attr`tensor_list` should reside on a separate GPU. + + Only the GPU of ``tensor_list[0]`` on the process with rank :attr:`dst` is + going to receive the final result. + + .. note:: + Only NCCL backend is currently supported. :attr:`tensor_list` should only + contain GPU tensors. + + Arguments: + tensor_list (List[Tensor]): Input and output GPU tensors of the + collective. The function operates in-place. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + + dst (int): Destination rank + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + + return torch._C._dist_reduce_multigpu(tensor_list, dst, op, group) + + +def reduce(tensor, dst, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data across all machines. + + Only the process with rank :attr:`dst` is going to receive the final result. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + dst (int): Destination rank + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. 
+        group (optional): Group of the collective.
+    """
+    assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \
+        "collective only supported in process-group mode"
+    return torch._C._dist_reduce(tensor, dst, op, group)
+
+
+def all_gather_multigpu(output_tensor_lists,
+                        input_tensor_list,
+                        group=group.WORLD):
+    r"""Gathers tensors from the whole group in a list.
+    Each tensor in :attr:`input_tensor_list` should reside on a separate GPU.
+
+    .. note::
+        Only NCCL backend is currently supported. :attr:`output_tensor_lists` and
+        :attr:`input_tensor_list` should only contain GPU tensors.
+
+    Arguments:
+        output_tensor_lists (List[List[Tensor]]): Output lists. It should
+            contain correctly-sized tensors on each GPU to be used for output of
+            the collective.
+            e.g. ``output_tensor_lists[i]`` contains the all_gather
+            result that resides on the GPU of ``input_tensor_list[i]``.
+            Note that each element of ``output_tensor_lists[i]`` has the size of
+            ``world_size * len(input_tensor_list)``, since the function all
+            gathers the result from every single GPU in the group. To interpret
+            each element of ``output_tensor_lists[i]``, note that
+            ``input_tensor_list[j]`` of rank k will appear in
+            ``output_tensor_lists[i][rank * world_size + j]``.
+            Also note that ``len(output_tensor_lists)``, and the size of each
+            element in ``output_tensor_lists`` (each element is a list,
+            therefore ``len(output_tensor_lists[i])``) need to be the same
+            for all the distributed processes calling this function.
+
+        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
+            be broadcast from current process.
+            Note that ``len(input_tensor_list)`` needs to be the same for
+            all the distributed processes calling this function.
+        group (optional): Group of the collective.
+    """
+    assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \
+        "collective only supported in process-group mode"
+
+    flatten_tensor_list = []
+    for output_tensor_list in output_tensor_lists:
+        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))
+
+    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
+                                             input_tensor_list,
+                                             group)
+
+    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
+                                                  flatten_tensor_list):
+        for tensor, value in zip(output_tensor_list,
+                                 _unflatten_dense_tensors(flatten_tensor,
+                                                          output_tensor_list)):
+            tensor.copy_(value)
+
+    return ret
+
+
+def all_gather(tensor_list, tensor, group=group.WORLD):
+    r"""Gathers tensors from the whole group in a list.
+
+    Arguments:
+        tensor_list (list[Tensor]): Output list. It should contain
+            correctly-sized tensors to be used for output of the collective.
+        tensor (Tensor): Tensor to be broadcast from current process.
+        group (optional): Group of the collective.
+    """
+    assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \
+        "collective only supported in process-group mode"
+    if _backend != dist_backend.NCCL:
+        return torch._C._dist_all_gather(tensor_list, tensor, group)
+    else:
+        return all_gather_multigpu([tensor_list], [tensor], group)
+
+
+def gather(tensor, **kwargs):
+    r"""Gathers a list of tensors in a single process.
+
+    Arguments:
+        tensor (Tensor): Input tensor.
+        dst (int): Destination rank. Required in all processes except the one that
+            is receiving the data.
+        gather_list (list[Tensor]): List of appropriately-sized tensors to
+            use for received data. Required only in the receiving process.
+        group (optional): Group of the collective.
+ """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + my_rank = get_rank() + dst = kwargs.pop('dst', my_rank) + gather_list = kwargs.pop('gather_list', None) + _group = kwargs.pop('group', group.WORLD) + if kwargs: + raise RuntimeError("got unexpected kwargs") + if dst == my_rank: + if gather_list is None: + raise RuntimeError("gather_list is a required argument in gather destination") + return torch._C._dist_gather_recv(gather_list, tensor, _group) + else: + if gather_list: + raise RuntimeError("non-empty gather_list can be given only to gather destination") + return torch._C._dist_gather_send(tensor, dst, _group) + + +def scatter(tensor, **kwargs): + r"""Scatters a list of tensors to all processes in a group. + + Each process will receive exactly one tensor and store its data in the + :attr:`tensor` argument. + + Arguments: + tensor (Tensor): Output tensor. + src (int): Source rank. Required in all processes except the one that + is sending the data. + scatter_list (list[Tensor]): List of tensors to scatter. Required only + in the process that is sending the data. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + my_rank = get_rank() + src = kwargs.pop('src', my_rank) + scatter_list = kwargs.pop('scatter_list', None) + _group = kwargs.pop('group', group.WORLD) + if kwargs: + raise RuntimeError("got unexpected kwargs: {}".format(", ".join(kwargs.keys()))) + if src == my_rank: + if scatter_list is None: + raise RuntimeError("scatter_list is a required argument in scatter source") + return torch._C._dist_scatter_send(scatter_list, tensor, _group) + else: + if scatter_list: + raise RuntimeError("non-empty can be given only to scatter source") + return torch._C._dist_scatter_recv(tensor, src, _group) + + +def barrier(group=group.WORLD): + r"""Synchronizes all processes. + + This collective blocks processes until the whole group enters this function. + + Arguments: + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_barrier(group) + + +def new_group(ranks=None): + r"""Creates a new distributed group. + + This function requires that all processes in the main group (i.e., all + processes that are part of the distributed job) enter this function, even + if they are not going to be members of the group. Additionally, groups + should be created in the same order in all processes. + + Arguments: + ranks (list[int]): List of ranks of group members. + + Returns: + A handle of distributed group that can be given to collective calls. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + if ranks is None: + ranks = list(range(get_world_size())) + return torch._C._dist_new_group(ranks) + + +def _clear_group_cache(group=group.WORLD): + r"""Clear the created distributed group's cached resource. + + Only NCCL backend is currently supported. + + Cached resource includes NCCL communicators and CUDA events. + + Arguments: + group (optional): Group of the collective. 
+ """ + return torch._C._dist_clear_group_cache(group) + + +def _register_stream(stream): + if not _initialized: + raise RuntimeError("torch.distributed.deprecated needs to be initialized first") + return torch._C._dist_register_stream(stream) diff --git a/torch/distributed/remote_types.py b/torch/distributed/deprecated/remote_types.py similarity index 96% rename from torch/distributed/remote_types.py rename to torch/distributed/deprecated/remote_types.py index a8d10cd93b341f..549a131dabd8c5 100644 --- a/torch/distributed/remote_types.py +++ b/torch/distributed/deprecated/remote_types.py @@ -55,6 +55,6 @@ class HalfStorage(_DistributedBase, torch._C.DistributedHalfStorageBase, _Storag _tensors = [_locals[t + 'Tensor'] for t in _type_names] _storages = [_locals[t + 'Storage'] for t in _type_names] for cls in _tensors + _storages: - cls.__module__ = 'torch.distributed' + cls.__module__ = 'torch.distributed.deprecated' torch._C._init_names(_tensors + _storages) del _locals, _type_names, _tensors, _storages diff --git a/torch/distributed/c10d/distributed_c10d.py b/torch/distributed/distributed_c10d.py similarity index 100% rename from torch/distributed/c10d/distributed_c10d.py rename to torch/distributed/distributed_c10d.py diff --git a/torch/distributed/c10d/rendezvous.py b/torch/distributed/rendezvous.py similarity index 100% rename from torch/distributed/c10d/rendezvous.py rename to torch/distributed/rendezvous.py diff --git a/torch/nn/parallel/__init__.py b/torch/nn/parallel/__init__.py index 00d942abf35e0e..066f415a73409a 100644 --- a/torch/nn/parallel/__init__.py +++ b/torch/nn/parallel/__init__.py @@ -4,8 +4,7 @@ from .scatter_gather import scatter, gather from .distributed import DistributedDataParallel from .distributed_cpu import DistributedDataParallelCPU -from .distributed_c10d import _DistributedDataParallelC10d -from .distributed_c10d_cpu import _DistributedDataParallelC10dCPU +import torch.nn.parallel.deprecated __all__ = ['replicate', 'scatter', 'parallel_apply', 'gather', 'data_parallel', 'DataParallel', 'DistributedDataParallel', 'DistributedDataParallelCPU'] diff --git a/torch/nn/parallel/deprecated/__init__.py b/torch/nn/parallel/deprecated/__init__.py new file mode 100644 index 00000000000000..262827cf5c919d --- /dev/null +++ b/torch/nn/parallel/deprecated/__init__.py @@ -0,0 +1,4 @@ +from .distributed import DistributedDataParallel +from .distributed_cpu import DistributedDataParallelCPU + +__all__ = ['DistributedDataParallel', 'DistributedDataParallelCPU'] diff --git a/torch/nn/parallel/deprecated/distributed.py b/torch/nn/parallel/deprecated/distributed.py new file mode 100644 index 00000000000000..655cdce0102bb5 --- /dev/null +++ b/torch/nn/parallel/deprecated/distributed.py @@ -0,0 +1,482 @@ +import sys +import math +import threading +import copy + +import torch +from torch.autograd import Variable +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors, \ + _take_tensors + +from torch.cuda.comm import broadcast_coalesced +from torch.cuda import nccl +import torch.distributed.deprecated as dist + +from ...modules import Module +from ..replicate import replicate +from ..scatter_gather import scatter_kwargs, gather +from ..parallel_apply import parallel_apply + +if sys.version_info[0] == 3: + import queue +else: + import Queue as queue + + +class DistributedDataParallel(Module): + r"""Implements distributed data parallelism at the module level. 
+ + This container parallelizes the application of the given module by + splitting the input across the specified devices by chunking in the batch + dimension. The module is replicated on each machine and each device, and + each such replica handles a portion of the input. During the backwards + pass, gradients from each node are averaged. + + The batch size should be larger than the number of GPUs used locally. It + should also be an integer multiple of the number of GPUs so that each chunk + is the same size (so that each GPU processes the same number of samples). + + See also: :ref:`distributed-basics` and :ref:`cuda-nn-dataparallel-instead`. + The same constraints on input as in :class:`torch.nn.DataParallel` apply. + + Creation of this class requires the distributed package to be already + initialized in the process group mode + (see :func:`torch.distributed.deprecated.init_process_group`). + + .. warning:: + This module works only with the ``nccl`` and ``gloo`` backends. + + .. warning:: + Constructor, forward method, and differentiation of the output (or a + function of the output of this module) is a distributed synchronization + point. Take that into account in case different processes might be + executing different code. + + .. warning:: + This module assumes all parameters are registered in the model by the + time it is created. No parameters should be added nor removed later. + Same applies to buffers. + + .. warning:: + This module assumes all buffers and gradients are dense. + + .. warning:: + This module doesn't work with :func:`torch.autograd.grad` (i.e. it will + only work if gradients are to be accumulated in ``.grad`` attributes of + parameters). + + .. warning:: + If you plan on using this module with a ``nccl`` backend or a ``gloo`` + backend (that uses Infiniband), together with a DataLoader that uses + multiple workers, please change the multiprocessing start method to + ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately + Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will + likely experience deadlocks if you don't change this setting. + + .. note:: + Parameters are never broadcast between processes. The module performs + an all-reduce step on gradients and assumes that they will be modified + by the optimizer in all processes in the same way. Buffers + (e.g. BatchNorm stats) are broadcast from the module in process of rank + 0, to all other replicas in the system in every iteration. + + .. warning:: + Forward and backward hooks defined on :attr:`module` and its submodules + won't be invoked anymore, unless the hooks are initialized in the + :meth:`forward` method. + + Args: + module: module to be parallelized + device_ids: CUDA devices (default: all devices) + output_device: device location of output (default: device_ids[0]) + broadcast_buffers: flag that enables syncing (broadcasting) buffers of + the module at beginning of the forward function. 
+ (default: True) + + Attributes: + module (Module): the module to be parallelized + + Example:: + + >>> torch.distributed.deprecated.init_process_group(world_size=4, init_method='...') + >>> net = torch.nn.DistributedDataParallel(model) + """ + + def __init__(self, module, device_ids=None, output_device=None, dim=0, + broadcast_buffers=True): + super(DistributedDataParallel, self).__init__() + if dist._backend not in (dist.dist_backend.NCCL, dist.dist_backend.GLOO): + raise ValueError('Invalid backend, only NCCL and GLOO backends are supported by DistributedDataParallel') + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + if output_device is None: + output_device = device_ids[0] + self.dim = dim + self.module = module + self.device_ids = device_ids + self.output_device = output_device + self.broadcast_buffers = broadcast_buffers + + # Flag used by the NCCL backend to make sure we only reduce gradients + # one time in the execution engine + self.need_reduction = False + + MB = 1024 * 1024 + # used for intra-node param sync and inter-node sync as well + self.broadcast_bucket_size = 10 * MB + self.nccl_reduce_bucket_size = 256 * MB + + # Sync params and buffers + module_states = list(self.module.state_dict().values()) + if len(module_states) > 0: + self._dist_broadcast_coalesced(module_states, + self.broadcast_bucket_size) + + if len(device_ids) > 1: + # TODO: we don't need to replicate params in here. they're always going to + # be broadcasted using larger blocks in broadcast_coalesced, so it might be + # better to not pollute the caches with these small blocks + self._module_copies = replicate(self.module, self.device_ids, detach=True) + self._module_copies[0] = self.module + + for module_copy in self._module_copies[1:]: + for param, copy_param in zip(self.module.parameters(), module_copy.parameters()): + copy_param.requires_grad = param.requires_grad + + else: + self._module_copies = [self.module] + + # For NCCL backend, since every single NCCL call is asynchoronous, we + # therefore directly enqueue all the NCCL reduction calls to the + # default CUDA stream without spawning up other reduction threads. + # This achieves the best performance. + if dist._backend == dist.dist_backend.NCCL: + self._register_nccl_grad_hook() + return + + bucket_bytes_cap = 1 * MB + + # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems + param_buckets = [] + # Split the parameters into buckets and by types as well + for dev_idx, module in enumerate(self._module_copies): + param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap))) + + self.bucket_sizes = [] + self.bucket_map = {} + + # We transpose param_buckets, so the loop is over buckets. + # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems + for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)): + self.bucket_sizes.append(0) + # Now, we transpose again, so we iterate over bucket_elems, but getting tuples + # of params from each device. 
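+            # For example, with 2 devices and a bucket holding params (w, b):
+            #   param_buckets_tuple == ([w_dev0, b_dev0], [w_dev1, b_dev1])
+            #   zip(*param_buckets_tuple) -> (w_dev0, w_dev1), (b_dev0, b_dev1)
+            # (the names here are purely illustrative)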
+ for idx, param_tuple in enumerate(zip(*param_buckets_tuple)): + if idx == 0: + # Bucket parameter type tracking + bucket_param_type = param_tuple[0].type() + # Only gloo and nccl support half-precision + if bucket_param_type == torch.cuda.HalfTensor and \ + dist._backend != dist.dist_backend.GLOO: + raise RuntimeError("DistributedDataParallel currently only " + "supports half precision parameters " + "with Nccl and Gloo backend") + if not param_tuple[0].requires_grad: + continue + for p in param_tuple: + self.bucket_map[p] = bucket_idx + self.bucket_sizes[bucket_idx] += 1 + + self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))] + self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))] + self.reduced = [False] * len(self.bucket_sizes) + + self._register_grad_hooks() + + self.dispatch_lock = threading.Lock() + self._start_reduction_threads() + + def __getstate__(self): + attrs = copy.copy(self.__dict__) + if dist._backend != dist.dist_backend.NCCL: + del attrs['_grad_accs'], attrs['_reduction_queues'], \ + attrs['_reduction_streams'], attrs['_reduction_threads'], \ + attrs['_nccl_streams'], attrs['_default_streams'] + return attrs + + def __setstate__(self, state): + super(DistributedDataParallel, self).__setstate__(state) + if dist._backend == dist.dist_backend.NCCL: + self._register_nccl_grad_hook() + else: + self._register_grad_hooks() + self._start_reduction_threads() + + def forward(self, *inputs, **kwargs): + self.need_reduction = True + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + self._sync_params() + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs) + return self.gather(outputs, self.output_device) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def parallel_apply(self, replicas, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) + + def gather(self, outputs, output_device): + return gather(outputs, output_device, dim=self.dim) + + def train(self, mode=True): + super(DistributedDataParallel, self).train(mode) + for module in self._module_copies[1:]: + module.train(mode) + + def _dist_broadcast_coalesced(self, tensors, buffer_size): + """ + Broadcast a sequence of tensors to the default group from rank 0. + Small tensors are first coalesced into a buffer to reduce the number of + broadcasts. + + tensors (sequence): tensors to broadcast. Each tensor needs to be on the + same GPU. 
+ buffer_size (int): maximum size of the buffer for coalescing + """ + for tensors in _take_tensors(tensors, buffer_size): + flat_tensors = _flatten_dense_tensors(tensors) + dist.broadcast(flat_tensors, 0) + for tensor, synced in zip(tensors, + _unflatten_dense_tensors(flat_tensors, tensors)): + tensor.copy_(synced) + + def _sync_params(self): + if len(self.device_ids) > 1: + # intra-node parameter sync + params = [p.data for p in self.module.parameters()] + result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size) + for tensors, module in zip(result[1:], self._module_copies[1:]): + for tensor, param in zip(tensors, module.parameters()): + param.data.set_(tensor) + + # module buffer sync + if self.broadcast_buffers: + buffers = [b.data for b in self.module.buffers()] + if len(buffers) > 0: + # cross-node buffer sync + self._dist_broadcast_coalesced(buffers, self.broadcast_bucket_size) + + if len(self.device_ids) > 1: + # intra-node buffer sync + result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size) + for tensors, module in zip(result[1:], self._module_copies[1:]): + for tensor, buf in zip(tensors, module.buffers()): + buf.data.set_(tensor) + + def _register_grad_hooks(self): + self._grad_accs = [] # need to keep them in scope + for device_idx, module in enumerate(self._module_copies): + for p in module.parameters(): + if p.requires_grad: + p_tmp = p.expand_as(p) + grad_acc = p_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_param_hook(p, device_idx)) + self._grad_accs.append(grad_acc) + + def _register_nccl_grad_hook(self): + """ + This function registers the callback all-reduction function for the + NCCL backend. All gradients will be all reduced in one single step. + The NCCL reduction will directly be enqueued into the + default CUDA stream. Therefore, no synchronization is needed. 
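The mechanism this docstring describes, where every parameter hook enqueues one shared reduction callback on the autograd engine and a flag keeps it from running more than once per backward pass, can be sketched in isolation. The sketch below is illustrative only: the real NCCL all-reduce is replaced by a print, and the ``state`` dict plays the role of ``self.need_reduction``.

    # Standalone sketch of the one-shot callback pattern used above.
    import torch
    from torch.autograd import Variable

    model = torch.nn.Linear(4, 2)
    state = {"need_reduction": False}

    def reduction_fn():
        # Queued once per parameter hook, but the flag makes it run only once.
        if not state["need_reduction"]:
            return
        state["need_reduction"] = False
        total = sum(p.grad.sum().item() for p in model.parameters())
        print("would all-reduce gradients here, grad sum =", total)

    def allreduce_hook(*unused):
        Variable._execution_engine.queue_callback(reduction_fn)

    for p in model.parameters():
        if p.requires_grad:
            p.register_hook(allreduce_hook)

    state["need_reduction"] = True   # set by forward() in the real module
    model(torch.randn(8, 4)).sum().backward()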
+ """ + # Creating a new group + self.nccl_reduction_group_id = dist.new_group() + + def reduction_fn_nccl(): + # This function only needs to be called once + if not self.need_reduction: + return + + self.need_reduction = False + all_grads = [[] for _ in range(len(self._module_copies))] + all_grads_buckets_iters = [] + + # Bucketing all the gradients + for dev_idx, module in enumerate(self._module_copies): + for param in module.parameters(): + if not param.requires_grad or param.grad is None: + continue + if param.grad.requires_grad: + raise RuntimeError("DistributedDataParallel only works " + "with gradients that don't require " + "grad") + # Adding the gradients for reduction + all_grads[dev_idx].append(param.grad.data) + + # Now bucketing the parameters + dev_grads_buckets = _take_tensors(all_grads[dev_idx], + self.nccl_reduce_bucket_size) + + all_grads_buckets_iters.append(dev_grads_buckets) + + # Now reduce each bucket one after another + for grads_batch in zip(*all_grads_buckets_iters): + grads_batch_coalesced = [] + # Coalesce each bucket + for dev_idx, dev_grads_batch in enumerate(grads_batch): + dev_id = self.device_ids[dev_idx] + with torch.cuda.device(dev_id): + dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) + grads_batch_coalesced.append(dev_grads_batch_coalesced) + + # We will only use device 0's results, but this single op should be + # faster than doing the following two operation sequentially: + # (1) intra-node reduce to lead GPU, followed by + # (2) inter-node allreduce for all the first lead GPUs in all nodes + dist.all_reduce_multigpu(grads_batch_coalesced, + group=self.nccl_reduction_group_id) + + # Now only work on the first device of self.device_ids, uncoalesce + # the gradients for each bucket + grads_batch_coalesced[0] /= dist.get_world_size() + grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0]) + for grad, reduced in zip(grads_batch[0], grads_batch_reduced): + grad.copy_(reduced) + + # clear the gradients and save memory for replicas + for module in self._module_copies[1:]: + for param in module.parameters(): + if param.requires_grad: + param.grad = None + param.data.set_() + + # Now register the reduction hook on the parameters + for p in self.module.parameters(): + if not p.requires_grad: + continue + + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(reduction_fn_nccl) + + p.register_hook(allreduce_hook) + + def _make_param_hook(self, param, device_idx): + + bucket_idx = self.bucket_map[param] + + def distributed_data_parallel_hook(*unused): + if param.grad.requires_grad: + raise RuntimeError("DistributedDataParallel only works with " + "gradients that don't require grad") + bucket = self.buckets[bucket_idx][device_idx] + bucket.append(param.grad.data) + + # We can flush these and save memory for replicas + if device_idx > 0: + param.grad = None + param.data.set_() + + # Current device's bucket is full + if len(bucket) == self.bucket_sizes[bucket_idx]: + with torch.cuda.device(self.device_ids[device_idx]): + event = torch.cuda.Event() + event.record() + with self.dispatch_lock: + self.bucket_events[bucket_idx][device_idx] = event + self._queue_reduction(bucket_idx) + + return distributed_data_parallel_hook + + def _queue_reduction(self, bucket_idx): + dev_buckets = self.buckets[bucket_idx] + dev_events = self.bucket_events[bucket_idx] + + # Check if it's ready + if any(evt is None for evt in dev_events): + return + + # Queue the reduction and make sure backward waits for it + 
event = threading.Event() + self._reduction_queues[bucket_idx].put((dev_buckets, dev_events, event)) + Variable._execution_engine.queue_callback(lambda: event.wait()) + + # Reset bucket state + self.buckets[bucket_idx] = [[] for _ in range(len(self.device_ids))] + self.bucket_events[bucket_idx] = [None] * len(self.device_ids) + self.reduced[bucket_idx] = True + if all(self.reduced): + self.reduced = [False] * len(self.bucket_sizes) + + def sync_reduction_streams(): + # We only have to sync with the first one, but it's safer to do it this way + # in case we change the way in which we paralellize work + r_streams = zip(*self._reduction_streams) + for dev_id, default_stream, dev_r_streams in zip(self.device_ids, self._default_streams, r_streams): + with torch.cuda.device(dev_id): + for reduction_stream in dev_r_streams: + default_stream.wait_stream(reduction_stream) + Variable._execution_engine.queue_callback(sync_reduction_streams) + + def _start_reduction_threads(self): + num_buckets = len(self.bucket_sizes) + self._reduction_queues = [queue.Queue() for _ in range(num_buckets)] + self._reduction_threads = [] + self._reduction_streams = [[] for _ in range(num_buckets)] + self._nccl_streams = [] + self._default_streams = [] + for dev_id in self.device_ids: + with torch.cuda.device(dev_id): + # TODO: don't assume we're on a default stream + self._default_streams.append(torch.cuda.current_stream()) + self._nccl_streams.append(torch.cuda.Stream()) + for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams): + for dev_id in self.device_ids: + with torch.cuda.device(dev_id): + reduction_streams.append(torch.cuda.Stream()) + # We only use the first device for distributed reductions + dist._register_stream(reduction_streams[0]) + + group_id = dist.new_group() + + self._reduction_threads.append(threading.Thread( + target=self._reduction_thread_fn, + args=(reduction_queue, group_id, self.device_ids, reduction_streams, self._nccl_streams))) + self._reduction_threads[-1].daemon = True + self._reduction_threads[-1].start() + + @staticmethod + def _reduction_thread_fn(queue, group_id, device_ids, reduction_streams, nccl_streams): + + def _process_batch(): + dev_grad_batch, dev_events, job_event = queue.get() + dev_coalesced = [] + # Coalesce the tensors on all devices and start a local reduction + for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams): + with torch.cuda.device(dev_id), torch.cuda.stream(stream): + stream.wait_event(event) + coalesced = _flatten_dense_tensors(grad_batch) + dev_coalesced.append(coalesced) + # Wait for all copies to complete before starting the NCCL kernel + for stream in reduction_streams: + stream.synchronize() + nccl.reduce(dev_coalesced, root=0, streams=nccl_streams) + + # From now on we're only going to work on the first device (from device_ids) + grad_batch = dev_grad_batch[0] + coalesced = dev_coalesced[0] + reduce_stream = reduction_streams[0] + with torch.cuda.stream(reduce_stream): + reduce_stream.wait_stream(nccl_streams[0]) + coalesced /= dist.get_world_size() + dist.all_reduce(coalesced, group=group_id) + for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)): + grad.copy_(reduced) + job_event.set() + + with torch.cuda.device(device_ids[0]): + while True: + _process_batch() # just to have a clear scope diff --git a/torch/nn/parallel/distributed_c10d_cpu.py b/torch/nn/parallel/deprecated/distributed_cpu.py similarity index 88% rename from 
torch/nn/parallel/distributed_c10d_cpu.py rename to torch/nn/parallel/deprecated/distributed_cpu.py index d7cdeb45cae7f0..e93ed2a9b816f1 100644 --- a/torch/nn/parallel/distributed_c10d_cpu.py +++ b/torch/nn/parallel/deprecated/distributed_cpu.py @@ -1,16 +1,15 @@ import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -import torch.distributed.c10d as dist +import torch.distributed.deprecated as dist from torch.nn.modules import Module from collections import defaultdict from torch.autograd import Variable -class _DistributedDataParallelC10dCPU(Module): - r"""Implements distributed data parallelism for CPU at the module level - and it will use PyTorch's new distributed package: c10d. +class DistributedDataParallelCPU(Module): + r"""Implements distributed data parallelism for CPU at the module level. - This module support the ``mpi``, ``gloo``, backends. + This module support the ``mpi``, ``gloo``, ``tcp`` backends. This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch @@ -29,7 +28,7 @@ class _DistributedDataParallelC10dCPU(Module): Creation of this class requires the distributed package to be already initialized in the process group mode - (see :func:`torch.distributed.init_process_group`). + (see :func:`torch.distributed.deprecated.init_process_group`). .. warning:: Constructor, forward method, and differentiation of the output (or a @@ -64,12 +63,12 @@ class _DistributedDataParallelC10dCPU(Module): Example:: - >>> torch.distributed.init_process_group(world_size=4, init_method='...') - >>> net = torch.nn._DistributedDataParallelC10dCPU(model) + >>> torch.distributed.deprecated.init_process_group(world_size=4, init_method='...') + >>> net = torch.nn.DistributedDataParallelCPU(model) """ def __init__(self, module): - super(_DistributedDataParallelC10dCPU, self).__init__() + super(DistributedDataParallelCPU, self).__init__() self.module = module self.sync_parameters() diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index caf1f533acbec9..175aac70be9f3d 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -1,10 +1,6 @@ -import sys -import math -import threading import copy import torch -from torch.autograd import Variable from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors, \ _take_tensors @@ -17,14 +13,13 @@ from .scatter_gather import scatter_kwargs, gather from .parallel_apply import parallel_apply -if sys.version_info[0] == 3: - import queue -else: - import Queue as queue - class DistributedDataParallel(Module): - r"""Implements distributed data parallelism at the module level. + r"""Implements distributed data parallelism that is based on c10d at the + module level. + + Currently this module is EXPERIMENTAL ONLY and should not be + used by normal users. Instead, please use DistributedDataParallel. This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch @@ -39,12 +34,12 @@ class DistributedDataParallel(Module): See also: :ref:`distributed-basics` and :ref:`cuda-nn-dataparallel-instead`. The same constraints on input as in :class:`torch.nn.DataParallel` apply. - Creation of this class requires the distributed package to be already - initialized in the process group mode - (see :func:`torch.distributed.init_process_group`). 
+    Creation of this class requires the c10d process group to be already
+    initialized. This class will operate on the provided c10d process
+    group.
 
     .. warning::
-        This module works only with the ``nccl`` and ``gloo`` backends.
+        This module works only with the ``gloo`` and ``nccl`` process groups.
 
     .. warning::
         Constructor, forward method, and differentiation of the output (or a
@@ -57,6 +52,14 @@ class DistributedDataParallel(Module):
         time it is created. No parameters should be added nor removed later.
         Same applies to buffers.
 
+    .. warning::
+        This module assumes that the parameters of the model are registered in
+        the same order in every distributed process. The module itself will
+        conduct gradient all-reduction following the reverse order of the
+        registered parameters of the model. In other words, it is the user's
+        responsibility to ensure that each distributed process has the exact
+        same model and thus the exact same parameter registration order.
+
     .. warning::
         This module assumes all buffers and gradients are dense.
 
@@ -66,12 +69,13 @@ class DistributedDataParallel(Module):
         parameters).
 
     .. warning::
-        If you plan on using this module with a ``nccl`` backend or a ``gloo``
-        backend (that uses Infiniband), together with a DataLoader that uses
-        multiple workers, please change the multiprocessing start method to
-        ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately
-        Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will
-        likely experience deadlocks if you don't change this setting.
+        If you plan on using this module with a ``nccl`` process group or
+        a ``gloo`` process group (that uses Infiniband), together with a
+        DataLoader that uses multiple workers, please change the multiprocessing
+        start method to ``forkserver`` (Python 3 only) or ``spawn``.
+        Unfortunately Gloo (that uses Infiniband) and NCCL2 are not fork safe,
+        and you will likely experience deadlocks if you don't change this
+        setting.
 
     .. note::
         Parameters are never broadcast between processes. The module performs
@@ -92,40 +96,53 @@ class DistributedDataParallel(Module):
         broadcast_buffers: flag that enables syncing (broadcasting) buffers of
                            the module at beginning of the forward function.
                            (default: True)
+        process_group: the c10d process group to be used for distributed data
+                       all-reduction. If None, the default process group will
+                       be used. (default: None)
+        bucket_cap_mb: DistributedDataParallel will bucket parameters into
+                       multiple buckets so that gradient reduction of each
+                       bucket can potentially overlap with backward computation.
+ bucket_cap_mb controls the bucket size in MegaBytes (MB) + (default: 25) Attributes: module (Module): the module to be parallelized Example:: - - >>> torch.distributed.init_process_group(world_size=4, init_method='...') - >>> net = torch.nn.DistributedDataParallel(model) + >>> store = torch.distributed.FileStore("/tmp/tempfile.txt") + >>> pg = torch.distributed.ProcessGroupGloo(store, rank, world_size) + >>> net = torch.nn.DistributedDataParallel(model, pg) """ + def __init__(self, module, device_ids=None, + output_device=None, dim=0, broadcast_buffers=True, + process_group=None, bucket_cap_mb=25): - def __init__(self, module, device_ids=None, output_device=None, dim=0, - broadcast_buffers=True): super(DistributedDataParallel, self).__init__() - if dist._backend not in (dist.dist_backend.NCCL, dist.dist_backend.GLOO): - raise ValueError('Invalid backend, only NCCL and GLOO backends are supported by DistributedDataParallel') + # Use all devices by default if device_ids is None: device_ids = list(range(torch.cuda.device_count())) + if output_device is None: output_device = device_ids[0] + + if process_group is None: + self.process_group = dist.get_default_group() + else: + self.process_group = process_group + self.dim = dim self.module = module self.device_ids = device_ids self.output_device = output_device self.broadcast_buffers = broadcast_buffers - # Flag used by the NCCL backend to make sure we only reduce gradients - # one time in the execution engine - self.need_reduction = False + self.allreduce_opts = dist.AllreduceOptions() MB = 1024 * 1024 + # used for intra-node param sync and inter-node sync as well - self.broadcast_bucket_size = 10 * MB - self.nccl_reduce_bucket_size = 256 * MB + self.broadcast_bucket_size = 25 * MB # Sync params and buffers module_states = list(self.module.state_dict().values()) @@ -147,21 +164,19 @@ def __init__(self, module, device_ids=None, output_device=None, dim=0, else: self._module_copies = [self.module] - # For NCCL backend, since every single NCCL call is asynchoronous, we - # therefore directly enqueue all the NCCL reduction calls to the - # default CUDA stream without spawning up other reduction threads. - # This achieves the best performance. - if dist._backend == dist.dist_backend.NCCL: - self._register_nccl_grad_hook() - return + self.modules_params_data = [[] for _ in range(len(self.device_ids))] + self.modules_buffers_data = [[] for _ in range(len(self.device_ids))] + + for dev_idx, module in enumerate(self._module_copies): + self.modules_params_data[dev_idx] = [p.data for p in module.parameters()] + self.modules_buffers_data[dev_idx] = [b.data for b in module.buffers()] - bucket_bytes_cap = 1 * MB + bucket_bytes_cap = bucket_cap_mb * MB # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems param_buckets = [] # Split the parameters into buckets and by types as well - for dev_idx, module in enumerate(self._module_copies): - param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap))) + param_buckets = [list(_take_tensors(m.parameters(), bucket_bytes_cap)) for m in self._module_copies] self.bucket_sizes = [] self.bucket_map = {} @@ -173,48 +188,45 @@ def __init__(self, module, device_ids=None, output_device=None, dim=0, # Now, we transpose again, so we iterate over bucket_elems, but getting tuples # of params from each device. 
for idx, param_tuple in enumerate(zip(*param_buckets_tuple)): - if idx == 0: - # Bucket parameter type tracking - bucket_param_type = param_tuple[0].type() - # Only gloo and nccl support half-precision - if bucket_param_type == torch.cuda.HalfTensor and \ - dist._backend != dist.dist_backend.GLOO: - raise RuntimeError("DistributedDataParallel currently only " - "supports half precision parameters " - "with Nccl and Gloo backend") if not param_tuple[0].requires_grad: continue for p in param_tuple: - self.bucket_map[p] = bucket_idx + self.bucket_map[p] = (bucket_idx, idx) self.bucket_sizes[bucket_idx] += 1 - self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))] - self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))] - self.reduced = [False] * len(self.bucket_sizes) + self.buckets = [[[None for _ in range(self.bucket_sizes[i])] + for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] + # The number of params ready in each bucket + self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - self._register_grad_hooks() + # coalesced bucket for only device 0 + self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] + # We will always reduce the bucket following the reverse order + # that is, alway reduces following the order of: n - 1, n - 2, ..., 0 + self.next_bucket = len(self.bucket_sizes) - 1 + self.ready_buckets_not_reduced = set() + self.reduction_works = [None for _ in range(len(self.bucket_sizes))] + + self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] - self.dispatch_lock = threading.Lock() - self._start_reduction_threads() + # default stream tracking to launch nccl reduce kernels + self.default_streams = [] + for dev_id in self.device_ids: + with torch.cuda.device(dev_id): + self.default_streams.append(torch.cuda.current_stream()) + + self._register_grad_hooks() def __getstate__(self): attrs = copy.copy(self.__dict__) - if dist._backend != dist.dist_backend.NCCL: - del attrs['_grad_accs'], attrs['_reduction_queues'], \ - attrs['_reduction_streams'], attrs['_reduction_threads'], \ - attrs['_nccl_streams'], attrs['_default_streams'] + del attrs['_grad_accs'] return attrs def __setstate__(self, state): super(DistributedDataParallel, self).__setstate__(state) - if dist._backend == dist.dist_backend.NCCL: - self._register_nccl_grad_hook() - else: - self._register_grad_hooks() - self._start_reduction_threads() + self._register_grad_hooks() def forward(self, *inputs, **kwargs): - self.need_reduction = True inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) self._sync_params() if len(self.device_ids) == 1: @@ -237,44 +249,32 @@ def train(self, mode=True): module.train(mode) def _dist_broadcast_coalesced(self, tensors, buffer_size): - """ - Broadcast a sequence of tensors to the default group from rank 0. - Small tensors are first coalesced into a buffer to reduce the number of - broadcasts. - - tensors (sequence): tensors to broadcast. Each tensor needs to be on the - same GPU. 
- buffer_size (int): maximum size of the buffer for coalescing - """ - for tensors in _take_tensors(tensors, buffer_size): - flat_tensors = _flatten_dense_tensors(tensors) - dist.broadcast(flat_tensors, 0) - for tensor, synced in zip(tensors, - _unflatten_dense_tensors(flat_tensors, tensors)): - tensor.copy_(synced) + dist._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) def _sync_params(self): if len(self.device_ids) > 1: # intra-node parameter sync - params = [p.data for p in self.module.parameters()] - result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size) - for tensors, module in zip(result[1:], self._module_copies[1:]): - for tensor, param in zip(tensors, module.parameters()): - param.data.set_(tensor) + result = broadcast_coalesced(self.modules_params_data[0], + self.device_ids, + self.broadcast_bucket_size) + for tensors, module_params_data in zip(result[1:], self.modules_params_data[1:]): + for tensor, param_data in zip(tensors, module_params_data): + param_data.set_(tensor) # module buffer sync if self.broadcast_buffers: - buffers = [b.data for b in self.module.buffers()] - if len(buffers) > 0: + if len(self.modules_buffers_data[0]) > 0: # cross-node buffer sync - self._dist_broadcast_coalesced(buffers, self.broadcast_bucket_size) - + self._dist_broadcast_coalesced(self.modules_buffers_data[0], + self.broadcast_bucket_size) if len(self.device_ids) > 1: # intra-node buffer sync - result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size) - for tensors, module in zip(result[1:], self._module_copies[1:]): - for tensor, buf in zip(tensors, module.buffers()): - buf.data.set_(tensor) + result = broadcast_coalesced(self.modules_buffers_data[0], + self.device_ids, + self.broadcast_bucket_size) + for tensors, module_buffers_data in zip(result[1:], self.modules_buffers_data[1:]): + for tensor, buffer_data in zip(tensors, module_buffers_data): + buffer_data.set_(tensor) def _register_grad_hooks(self): self._grad_accs = [] # need to keep them in scope @@ -286,94 +286,16 @@ def _register_grad_hooks(self): grad_acc.register_hook(self._make_param_hook(p, device_idx)) self._grad_accs.append(grad_acc) - def _register_nccl_grad_hook(self): - """ - This function registers the callback all-reduction function for the - NCCL backend. All gradients will be all reduced in one single step. - The NCCL reduction will directly be enqueued into the - default CUDA stream. Therefore, no synchronization is needed. 
- """ - # Creating a new group - self.nccl_reduction_group_id = dist.new_group() - - def reduction_fn_nccl(): - # This function only needs to be called once - if not self.need_reduction: - return - - self.need_reduction = False - all_grads = [[] for _ in range(len(self._module_copies))] - all_grads_buckets_iters = [] - - # Bucketing all the gradients - for dev_idx, module in enumerate(self._module_copies): - for param in module.parameters(): - if not param.requires_grad or param.grad is None: - continue - if param.grad.requires_grad: - raise RuntimeError("DistributedDataParallel only works " - "with gradients that don't require " - "grad") - # Adding the gradients for reduction - all_grads[dev_idx].append(param.grad.data) - - # Now bucketing the parameters - dev_grads_buckets = _take_tensors(all_grads[dev_idx], - self.nccl_reduce_bucket_size) - - all_grads_buckets_iters.append(dev_grads_buckets) - - # Now reduce each bucket one after another - for grads_batch in zip(*all_grads_buckets_iters): - grads_batch_coalesced = [] - # Coalesce each bucket - for dev_idx, dev_grads_batch in enumerate(grads_batch): - dev_id = self.device_ids[dev_idx] - with torch.cuda.device(dev_id): - dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) - grads_batch_coalesced.append(dev_grads_batch_coalesced) - - # We will only use device 0's results, but this single op should be - # faster than doing the following two operation sequentially: - # (1) intra-node reduce to lead GPU, followed by - # (2) inter-node allreduce for all the first lead GPUs in all nodes - dist.all_reduce_multigpu(grads_batch_coalesced, - group=self.nccl_reduction_group_id) - - # Now only work on the first device of self.device_ids, uncoalesce - # the gradients for each bucket - grads_batch_coalesced[0] /= dist.get_world_size() - grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0]) - for grad, reduced in zip(grads_batch[0], grads_batch_reduced): - grad.copy_(reduced) - - # clear the gradients and save memory for replicas - for module in self._module_copies[1:]: - for param in module.parameters(): - if param.requires_grad: - param.grad = None - param.data.set_() - - # Now register the reduction hook on the parameters - for p in self.module.parameters(): - if not p.requires_grad: - continue - - def allreduce_hook(*unused): - Variable._execution_engine.queue_callback(reduction_fn_nccl) - - p.register_hook(allreduce_hook) - def _make_param_hook(self, param, device_idx): - - bucket_idx = self.bucket_map[param] + bucket_idx, bucket_offset = self.bucket_map[param] def distributed_data_parallel_hook(*unused): if param.grad.requires_grad: - raise RuntimeError("DistributedDataParallel only works with " - "gradients that don't require grad") + raise RuntimeError("DistributedDataParallel only works " + "with gradients that don't require grad") bucket = self.buckets[bucket_idx][device_idx] - bucket.append(param.grad.data) + bucket[bucket_offset] = param.grad.data + self.buckets_ready_size[bucket_idx][device_idx] += 1 # We can flush these and save memory for replicas if device_idx > 0: @@ -381,102 +303,79 @@ def distributed_data_parallel_hook(*unused): param.data.set_() # Current device's bucket is full - if len(bucket) == self.bucket_sizes[bucket_idx]: - with torch.cuda.device(self.device_ids[device_idx]): - event = torch.cuda.Event() - event.record() - with self.dispatch_lock: - self.bucket_events[bucket_idx][device_idx] = event + if self.buckets_ready_size[bucket_idx][device_idx] == 
self.bucket_sizes[bucket_idx]: + self.devs_ready[bucket_idx] += 1 + if self.devs_ready[bucket_idx] < len(self.device_ids): + return + + # Now all devices's buckets with index: bucket_idx are ready + if bucket_idx == self.next_bucket: self._queue_reduction(bucket_idx) + self.next_bucket -= 1 + # Now reduce anything that is ready but not yet reduced + if len(self.ready_buckets_not_reduced) > 0: + sorted_todo = sorted(self.ready_buckets_not_reduced, reverse=True) + for i in sorted_todo: + # Nothing can be reduced now + if i < self.next_bucket: + break + self._queue_reduction(i) + self.ready_buckets_not_reduced.remove(i) + if i == self.next_bucket: + self.next_bucket -= 1 + else: + self.ready_buckets_not_reduced.add(bucket_idx) + + # When all devices' buckets + if self.next_bucket == -1: + # A final sync for all the reduction works + self._sync_reduction_works() return distributed_data_parallel_hook def _queue_reduction(self, bucket_idx): - dev_buckets = self.buckets[bucket_idx] - dev_events = self.bucket_events[bucket_idx] - - # Check if it's ready - if any(evt is None for evt in dev_events): - return - - # Queue the reduction and make sure backward waits for it - event = threading.Event() - self._reduction_queues[bucket_idx].put((dev_buckets, dev_events, event)) - Variable._execution_engine.queue_callback(lambda: event.wait()) - - # Reset bucket state - self.buckets[bucket_idx] = [[] for _ in range(len(self.device_ids))] - self.bucket_events[bucket_idx] = [None] * len(self.device_ids) - self.reduced[bucket_idx] = True - if all(self.reduced): - self.reduced = [False] * len(self.bucket_sizes) - - def sync_reduction_streams(): - # We only have to sync with the first one, but it's safer to do it this way - # in case we change the way in which we paralellize work - r_streams = zip(*self._reduction_streams) - for dev_id, default_stream, dev_r_streams in zip(self.device_ids, self._default_streams, r_streams): - with torch.cuda.device(dev_id): - for reduction_stream in dev_r_streams: - default_stream.wait_stream(reduction_stream) - Variable._execution_engine.queue_callback(sync_reduction_streams) - - def _start_reduction_threads(self): - num_buckets = len(self.bucket_sizes) - self._reduction_queues = [queue.Queue() for _ in range(num_buckets)] - self._reduction_threads = [] - self._reduction_streams = [[] for _ in range(num_buckets)] - self._nccl_streams = [] - self._default_streams = [] - for dev_id in self.device_ids: + grads_batch = self.buckets[bucket_idx] + grads_batch_coalesced = [] + + # coalesce the bucket + for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch): with torch.cuda.device(dev_id): - # TODO: don't assume we're on a default stream - self._default_streams.append(torch.cuda.current_stream()) - self._nccl_streams.append(torch.cuda.Stream()) - for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams): - for dev_id in self.device_ids: - with torch.cuda.device(dev_id): - reduction_streams.append(torch.cuda.Stream()) - # We only use the first device for distributed reductions - dist._register_stream(reduction_streams[0]) - - group_id = dist.new_group() - - self._reduction_threads.append(threading.Thread( - target=self._reduction_thread_fn, - args=(reduction_queue, group_id, self.device_ids, reduction_streams, self._nccl_streams))) - self._reduction_threads[-1].daemon = True - self._reduction_threads[-1].start() - - @staticmethod - def _reduction_thread_fn(queue, group_id, device_ids, reduction_streams, nccl_streams): - - def 
_process_batch(): - dev_grad_batch, dev_events, job_event = queue.get() - dev_coalesced = [] - # Coalesce the tensors on all devices and start a local reduction - for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams): - with torch.cuda.device(dev_id), torch.cuda.stream(stream): - stream.wait_event(event) - coalesced = _flatten_dense_tensors(grad_batch) - dev_coalesced.append(coalesced) - # Wait for all copies to complete before starting the NCCL kernel - for stream in reduction_streams: - stream.synchronize() - nccl.reduce(dev_coalesced, root=0, streams=nccl_streams) - - # From now on we're only going to work on the first device (from device_ids) - grad_batch = dev_grad_batch[0] - coalesced = dev_coalesced[0] - reduce_stream = reduction_streams[0] - with torch.cuda.stream(reduce_stream): - reduce_stream.wait_stream(nccl_streams[0]) - coalesced /= dist.get_world_size() - dist.all_reduce(coalesced, group=group_id) - for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)): - grad.copy_(reduced) - job_event.set() - - with torch.cuda.device(device_ids[0]): - while True: - _process_batch() # just to have a clear scope + dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) + grads_batch_coalesced.append(dev_grads_batch_coalesced) + + # reduce to the first GPU in self.device_ids + if len(self.device_ids) > 1: + nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) + + # divide by the number of processes here to reduce chances of overflow + grads_batch_coalesced[0] /= self.process_group.size() + + # now work on the first gpu + reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], + self.allreduce_opts) + self.reduction_works[bucket_idx] = reduction_work + self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] + + def _sync_reduction_works(self): + # Now only work on the first GPU of self.device_ids, uncoalesce + # the gradients for each bucket + for bucket_idx, grads_batch in enumerate(self.buckets): + # wait will let current stream wait on the c10d reduction stream + self.reduction_works[bucket_idx].wait() + + grads_batch_reduced = _unflatten_dense_tensors( + self.buckets_coalesced[bucket_idx], grads_batch[0]) + + for grad, reduced in zip(grads_batch[0], grads_batch_reduced): + grad.copy_(reduced) + + # Reset the module states + self.next_bucket = len(self.bucket_sizes) - 1 + self.ready_buckets_not_reduced = set() + self.reduction_works = [None for _ in range(len(self.bucket_sizes))] + self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] + + self.buckets = [[[None for _ in range(self.bucket_sizes[i])] + for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] + self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] + self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py deleted file mode 100644 index 61acf2c8784f04..00000000000000 --- a/torch/nn/parallel/distributed_c10d.py +++ /dev/null @@ -1,366 +0,0 @@ -import copy - -import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors, \ - _take_tensors - -from torch.cuda.comm import broadcast_coalesced -from torch.cuda import nccl -import torch.distributed.c10d as c10d - -from ..modules import Module -from .replicate import replicate -from .scatter_gather import scatter_kwargs, gather -from 
.parallel_apply import parallel_apply - - -class _DistributedDataParallelC10d(Module): - r"""Implements distributed data parallelism that is based on c10d at the - module level. - - Currently this module is EXPERIMENTAL ONLY and should not be - used by normal users. Instead, please use DistributedDataParallel. - - This container parallelizes the application of the given module by - splitting the input across the specified devices by chunking in the batch - dimension. The module is replicated on each machine and each device, and - each such replica handles a portion of the input. During the backwards - pass, gradients from each node are averaged. - - The batch size should be larger than the number of GPUs used locally. It - should also be an integer multiple of the number of GPUs so that each chunk - is the same size (so that each GPU processes the same number of samples). - - See also: :ref:`distributed-basics` and :ref:`cuda-nn-dataparallel-instead`. - The same constraints on input as in :class:`torch.nn.DataParallel` apply. - - Creation of this class requires the c10d process group to be already - initialized. This class will basically operate on the provided c10d - process group. - - .. warning:: - This module works only with the ``gloo`` and ``nccl`` process groups. - - .. warning:: - Constructor, forward method, and differentiation of the output (or a - function of the output of this module) is a distributed synchronization - point. Take that into account in case different processes might be - executing different code. - - .. warning:: - This module assumes all parameters are registered in the model by the - time it is created. No parameters should be added nor removed later. - Same applies to buffers. - - -- warning:: - This module assumes all parameters are registered in the model of each - distributed processes are in the same order. The module itself will - conduct gradient all-reduction following the reverse order of the - registered parameters of the model. In other wise, it is users' - responsibility to ensure that each distributed process has the exact - same model and thus the exact parameter registeration order. - - .. warning:: - This module assumes all buffers and gradients are dense. - - .. warning:: - This module doesn't work with :func:`torch.autograd.grad` (i.e. it will - only work if gradients are to be accumulated in ``.grad`` attributes of - parameters). - - .. warning:: - If you plan on using this module with a ``nccl`` process group or - a ``gloo`` process group (that uses Infiniband), together with a - DataLoader that uses multiple workers, please change the multiprocessing - start method to ``forkserver`` (Python 3 only) or ``spawn``. - Unfortunately Gloo (that uses Infiniband) and NCCL2 are not fork safe, - and you will likely experience deadlocks if you don't change this - setting. - - .. note:: - Parameters are never broadcast between processes. The module performs - an all-reduce step on gradients and assumes that they will be modified - by the optimizer in all processes in the same way. Buffers - (e.g. BatchNorm stats) are broadcast from the module in process of rank - 0, to all other replicas in the system in every iteration. - - .. warning:: - Forward and backward hooks defined on :attr:`module` and its submodules - won't be invoked anymore, unless the hooks are initialized in the - :meth:`forward` method. 
- - Args: - module: module to be parallelized - device_ids: CUDA devices (default: all devices) - output_device: device location of output (default: device_ids[0]) - broadcast_buffers: flag that enables syncing (broadcasting) buffers of - the module at beginning of the forward function. - (default: True) - process_group: the c10d process group to be used for distributed data - all-reduction. If None, the default process group will - be used - bucket_cap_mb: DistributedDataParallelC10d will bucket parameters into - multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward computation. - bucket_cap_mb controls the bucket size in MegaBytes (MB) - (default: 25) - - Attributes: - module (Module): the module to be parallelized - - Example:: - >>> store = torch.distributed.c10d.FileStore("/tmp/tempfile.txt") - >>> pg = torch.distributed.c10d.ProcessGroupGloo(store, rank, world_size) - >>> net = torch.nn._DistributedDataParallelC10d(model, pg) - """ - def __init__(self, module, device_ids=None, - output_device=None, dim=0, broadcast_buffers=True, - process_group=None, bucket_cap_mb=25): - - super(_DistributedDataParallelC10d, self).__init__() - - # Use all devices by default - if device_ids is None: - device_ids = list(range(torch.cuda.device_count())) - - if output_device is None: - output_device = device_ids[0] - - if process_group is None: - self.process_group = c10d.get_default_group() - else: - self.process_group = process_group - - self.dim = dim - self.module = module - self.device_ids = device_ids - self.output_device = output_device - self.broadcast_buffers = broadcast_buffers - - self.allreduce_opts = c10d.AllreduceOptions() - - MB = 1024 * 1024 - - # used for intra-node param sync and inter-node sync as well - self.broadcast_bucket_size = 25 * MB - - # Sync params and buffers - module_states = list(self.module.state_dict().values()) - if len(module_states) > 0: - self._dist_broadcast_coalesced(module_states, - self.broadcast_bucket_size) - - if len(device_ids) > 1: - # TODO: we don't need to replicate params in here. they're always going to - # be broadcasted using larger blocks in broadcast_coalesced, so it might be - # better to not pollute the caches with these small blocks - self._module_copies = replicate(self.module, self.device_ids, detach=True) - self._module_copies[0] = self.module - - for module_copy in self._module_copies[1:]: - for param, copy_param in zip(self.module.parameters(), module_copy.parameters()): - copy_param.requires_grad = param.requires_grad - - else: - self._module_copies = [self.module] - - # .data() of each parameter for each model replica - self.modules_params_data = [[] for _ in range(len(self.device_ids))] - # .data() of each buffer for each model replica - self.modules_buffers_data = [[] for _ in range(len(self.device_ids))] - - for dev_idx, module in enumerate(self._module_copies): - self.modules_params_data[dev_idx] = [p.data for p in module.parameters()] - self.modules_buffers_data[dev_idx] = [b.data for b in module.buffers()] - - bucket_bytes_cap = bucket_cap_mb * MB - - # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems - param_buckets = [] - # Split the parameters into buckets and by types as well - param_buckets = [list(_take_tensors(m.parameters(), bucket_bytes_cap)) for m in self._module_copies] - - self.bucket_sizes = [] - self.bucket_map = {} - - # We transpose param_buckets, so the loop is over buckets. 
- # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems - for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)): - self.bucket_sizes.append(0) - # Now, we transpose again, so we iterate over bucket_elems, but getting tuples - # of params from each device. - for idx, param_tuple in enumerate(zip(*param_buckets_tuple)): - if not param_tuple[0].requires_grad: - continue - for p in param_tuple: - self.bucket_map[p] = (bucket_idx, idx) - self.bucket_sizes[bucket_idx] += 1 - - self.buckets = [[[None for _ in range(self.bucket_sizes[i])] - for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - # The number of params ready in each bucket - self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - - # coalesced bucket for only device 0 - self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] - # We will always reduce the bucket following the reverse order - # that is, alway reduces following the order of: n - 1, n - 2, ..., 0 - self.next_bucket = len(self.bucket_sizes) - 1 - self.ready_buckets_not_reduced = set() - self.reduction_works = [None for _ in range(len(self.bucket_sizes))] - - self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] - - # default stream tracking to launch nccl reduce kernels - self.default_streams = [] - for dev_id in self.device_ids: - with torch.cuda.device(dev_id): - self.default_streams.append(torch.cuda.current_stream()) - - self._register_grad_hooks() - - def __getstate__(self): - attrs = copy.copy(self.__dict__) - del attrs['_grad_accs'] - return attrs - - def __setstate__(self, state): - super(_DistributedDataParallelC10d, self).__setstate__(state) - self._register_grad_hooks() - - def forward(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - self._sync_params() - if len(self.device_ids) == 1: - return self.module(*inputs[0], **kwargs[0]) - outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs) - return self.gather(outputs, self.output_device) - - def scatter(self, inputs, kwargs, device_ids): - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def parallel_apply(self, replicas, inputs, kwargs): - return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) - - def gather(self, outputs, output_device): - return gather(outputs, output_device, dim=self.dim) - - def train(self, mode=True): - super(_DistributedDataParallelC10d, self).train(mode) - for module in self._module_copies[1:]: - module.train(mode) - - def _dist_broadcast_coalesced(self, tensors, buffer_size): - c10d._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) - - def _sync_params(self): - c10d._sync_params(self.process_group, - self.modules_params_data, - self.modules_buffers_data, - self.device_ids, - self.broadcast_bucket_size, - self.broadcast_buffers) - - def _register_grad_hooks(self): - self._grad_accs = [] # need to keep them in scope - for device_idx, module in enumerate(self._module_copies): - for p in module.parameters(): - if p.requires_grad: - p_tmp = p.expand_as(p) - grad_acc = p_tmp.grad_fn.next_functions[0][0] - grad_acc.register_hook(self._make_param_hook(p, device_idx)) - self._grad_accs.append(grad_acc) - - def _make_param_hook(self, param, device_idx): - bucket_idx, bucket_offset = self.bucket_map[param] - - def distributed_data_parallel_hook(*unused): - if param.grad.requires_grad: - raise 
RuntimeError("DistributedDataParallelC10d only works " - "with gradients that don't require grad") - bucket = self.buckets[bucket_idx][device_idx] - bucket[bucket_offset] = param.grad.data - self.buckets_ready_size[bucket_idx][device_idx] += 1 - - # We can flush these and save memory for replicas - if device_idx > 0: - param.grad = None - param.data.set_() - - # Current device's bucket is full - if self.buckets_ready_size[bucket_idx][device_idx] == self.bucket_sizes[bucket_idx]: - self.devs_ready[bucket_idx] += 1 - if self.devs_ready[bucket_idx] < len(self.device_ids): - return - - # Now all devices's buckets with index: bucket_idx are ready - if bucket_idx == self.next_bucket: - self._queue_reduction(bucket_idx) - self.next_bucket -= 1 - # Now reduce anything that is ready but not yet reduced - if len(self.ready_buckets_not_reduced) > 0: - sorted_todo = sorted(self.ready_buckets_not_reduced, reverse=True) - for i in sorted_todo: - # Nothing can be reduced now - if i < self.next_bucket: - break - self._queue_reduction(i) - self.ready_buckets_not_reduced.remove(i) - if i == self.next_bucket: - self.next_bucket -= 1 - else: - self.ready_buckets_not_reduced.add(bucket_idx) - - # When all devices' buckets - if self.next_bucket == -1: - # A final sync for all the reduction works - self._sync_reduction_works() - - return distributed_data_parallel_hook - - def _queue_reduction(self, bucket_idx): - grads_batch = self.buckets[bucket_idx] - grads_batch_coalesced = [] - - # coalesce the bucket - for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch): - with torch.cuda.device(dev_id): - dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) - grads_batch_coalesced.append(dev_grads_batch_coalesced) - - # reduce to the first GPU in self.device_ids - if len(self.device_ids) > 1: - nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) - - # divide by the number of processes here to reduce chances of overflow - grads_batch_coalesced[0] /= self.process_group.size() - - # now work on the first gpu - reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], - self.allreduce_opts) - self.reduction_works[bucket_idx] = reduction_work - self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] - - def _sync_reduction_works(self): - # Now only work on the first GPU of self.device_ids, uncoalesce - # the gradients for each bucket - for bucket_idx, grads_batch in enumerate(self.buckets): - # wait will let current stream wait on the c10d reduction stream - self.reduction_works[bucket_idx].wait() - - grads_batch_reduced = _unflatten_dense_tensors( - self.buckets_coalesced[bucket_idx], grads_batch[0]) - - for grad, reduced in zip(grads_batch[0], grads_batch_reduced): - grad.copy_(reduced) - - # Reset the module states - self.next_bucket = len(self.bucket_sizes) - 1 - self.ready_buckets_not_reduced = set() - self.reduction_works = [None for _ in range(len(self.bucket_sizes))] - self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] - - self.buckets = [[[None for _ in range(self.bucket_sizes[i])] - for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] - self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] diff --git a/torch/nn/parallel/distributed_cpu.py b/torch/nn/parallel/distributed_cpu.py index 07b59c528d25b4..d5a3b1235f9678 100644 --- a/torch/nn/parallel/distributed_cpu.py +++ 
b/torch/nn/parallel/distributed_cpu.py @@ -7,9 +7,10 @@ class DistributedDataParallelCPU(Module): - r"""Implements distributed data parallelism for CPU at the module level. + r"""Implements distributed data parallelism for CPU at the module level + and it will use PyTorch's new distributed package: c10d. - This module support the ``mpi``, ``gloo``, ``tcp`` backends. + This module support the ``mpi``, ``gloo``, backends. This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch diff --git a/torch/utils/data/distributed.py b/torch/utils/data/distributed.py index d3e871353ece96..ee58c1afc5a74d 100644 --- a/torch/utils/data/distributed.py +++ b/torch/utils/data/distributed.py @@ -1,7 +1,7 @@ import math import torch from . import Sampler -from torch.distributed import get_world_size, get_rank +import torch.distributed as dist class DistributedSampler(Sampler): @@ -24,9 +24,13 @@ class DistributedSampler(Sampler): def __init__(self, dataset, num_replicas=None, rank=None): if num_replicas is None: - num_replicas = get_world_size() + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() if rank is None: - rank = get_rank() + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() self.dataset = dataset self.num_replicas = num_replicas self.rank = rank From 3d5fd12488103c9259a9d20f804102fe65597126 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Tue, 11 Sep 2018 02:06:05 -0700 Subject: [PATCH 025/237] Documentation for c10d: torch.distributed and deprecate the old distributed doc (#11450) Summary: This is the new documentation for c10d release, and it also deprecates the old torch.distributed document. This PR depends on https://github.com/pytorch/pytorch/pull/11405 and should only be landed after https://github.com/pytorch/pytorch/pull/11405 is landed Pull Request resolved: https://github.com/pytorch/pytorch/pull/11450 Differential Revision: D9765504 Pulled By: teng-li fbshipit-source-id: 48f38b27b8c270baf389f8e478ea226b9ecc63db --- docs/source/distributed.rst | 114 ++++++---- docs/source/distributed_deprecated.rst | 280 +++++++++++++++++++++++++ docs/source/index.rst | 1 + 3 files changed, 353 insertions(+), 42 deletions(-) create mode 100644 docs/source/distributed_deprecated.rst diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index 9dbb9be74709bb..49d9d96ee62817 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -7,35 +7,35 @@ Distributed communication package - torch.distributed .. automodule:: torch.distributed .. currentmodule:: torch.distributed -Currently torch.distributed supports four backends, each with +Currently torch.distributed supports three backends, each with different capabilities. The table below shows which functions are available for use with CPU / CUDA tensors. MPI supports cuda only if the implementation used to build PyTorch supports it. -+------------+-----------+-----------+-----------+-----------+ -| Backend | ``tcp`` | ``gloo`` | ``mpi`` | ``nccl`` | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| Device | CPU | GPU | CPU | GPU | CPU | GPU | CPU | GPU | -+============+=====+=====+=====+=====+=====+=====+=====+=====+ -| send | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| recv | ✓ | ✘ | ✘ | ✘ | ✓ | ? 
| ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| broadcast | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| all_reduce | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| reduce | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| all_gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| scatter | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| barrier | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ ++------------+-----------+-----------+-----------+ +| Backend | ``gloo`` | ``mpi`` | ``nccl`` | ++------------+-----+-----+-----+-----+-----+-----+ +| Device | CPU | GPU | CPU | GPU | CPU | GPU | ++============+=====+=====+=====+=====+=====+=====+ +| send | ✓ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| recv | ✓ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| broadcast | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| all_reduce | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| reduce | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| all_gather | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| gather | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| scatter | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| barrier | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ .. _distributed-basics: @@ -79,6 +79,14 @@ joined. .. autofunction:: get_world_size +.. autofunction:: is_initialized + +.. autofunction:: get_default_group + +.. autofunction:: is_mpi_available + +.. autofunction:: is_nccl_available + -------------------------------------------------------------------------------- Currently three initialization methods are supported: @@ -91,10 +99,8 @@ reachable from all processes and a desired ``world_size``. The first way requires specifying an address that belongs to the rank 0 process. This initialization method requires that all processes have manually specified ranks. -Alternatively, the address has to be a valid IP multicast address, in which case -ranks can be assigned automatically. Multicast initialization also supports -a ``group_name`` argument, which allows you to use the same address for multiple -jobs, as long as they use different group names. +Note that multicast address is not supported anymore in the latest distributed +package. ``group_name`` is deprecated as well. :: @@ -103,31 +109,40 @@ jobs, as long as they use different group names. 
# Use address of one of the machines dist.init_process_group(backend, init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4) - # or a multicast address - rank will be assigned automatically if unspecified - dist.init_process_group(backend, init_method='tcp://[ff15:1e18:5d4c:4cf0:d02d:b659:53ba:b0a7]:23456', - world_size=4) - Shared file-system initialization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Another initialization method makes use of a file system that is shared and visible from all machines in a group, along with a desired ``world_size``. The URL should start with ``file://`` and contain a path to a non-existent file (in an existing -directory) on a shared file system. This initialization method also supports a -``group_name`` argument, which allows you to use the same shared file path for -multiple jobs, as long as they use different group names. +directory) on a shared file system. File-system initialization will automatically +create that file if it doesn't exist, but will not delete the file. Therefore, it +is your responsibility to make sure that the file is cleaned up before the next +init_process_group call on the same file path/name. + +Note that automatic rank assignment is not supported anymore in the latest +distributed package and ``group_name`` is deprecated as well. .. warning:: This method assumes that the file system supports locking using ``fcntl`` - most local systems and NFS support it. +.. warning:: + This method does not clean up and remove the file and it is your responsibility + to remove the file at the end of the training. This is especially important + if you plan to call init_process_group multiple times on the same file name. + In other words, if the file is not removed/cleaned up and you call + init_process_group again on that file, it is unexpected behavior and will cause + failures. The rule of thumb here is to make sure that the file is non-existent or + empty every time init_process_group is called. + :: import torch.distributed as dist - # Rank will be assigned automatically if unspecified + # rank should always be specified dist.init_process_group(backend, init_method='file:///mnt/nfs/sharedfile', - world_size=4, group_name=args.group) + world_size=4, rank=args.rank) Environment variable initialization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -174,14 +189,29 @@ as they should never be created manually, but they are guaranteed to support two * ``wait()`` - will block the process until the operation is finished. ``is_completed()`` is guaranteed to return True once it returns. -When using the MPI backend, :func:`~torch.distributed.isend` and :func:`~torch.distributed.irecv` -support non-overtaking, which has some guarantees on supporting message order. For more detail, see -http://mpi-forum.org/docs/mpi-2.2/mpi22-report/node54.htm#Node54 - .. autofunction:: isend .. autofunction:: irecv +Synchronous and asynchronous collective operations +-------------------------------------------------- +Every collective operation function supports the following two kinds of operations: + +synchronous operation - the default mode, when ``async_op`` is set to False. +When the function returns, it is guaranteed that +the collective operation is performed (not necessarily completed if it's a CUDA op since all +CUDA ops are asynchronous), and any further function calls depending on the data of the +collective operation can be called.
In the synchronous mode, the collective function does not +return anything + +asynchornous operation - when ``async_op`` is set to True. The collective operation function +returns a distributed request object. In general, you don't need to create it manually and it +is guaranteed to support two methods: + +* ``is_completed()`` - returns True if the operation has finished +* ``wait()`` - will block the process until the operation is finished. + + Collective functions -------------------- @@ -202,7 +232,7 @@ Collective functions Multi-GPU collective functions ------------------------------ -If you have more than one GPU on each node, when using the NCCL backend, +If you have more than one GPU on each node, when using the NCCL and Gloo backend, :func:`~torch.distributed.broadcast_multigpu` :func:`~torch.distributed.all_reduce_multigpu` :func:`~torch.distributed.reduce_multigpu` and diff --git a/docs/source/distributed_deprecated.rst b/docs/source/distributed_deprecated.rst new file mode 100644 index 00000000000000..303ce7b8a54922 --- /dev/null +++ b/docs/source/distributed_deprecated.rst @@ -0,0 +1,280 @@ +.. role:: hidden + :class: hidden-section + +Distributed communication package (deprecated) - torch.distributed.deprecated +============================================================================= + +.. warning:: + torch.distributed.deprecated is the older version of torch.distributed and + currently deprecated. It will be removed soon. Please use and refer the doc + for torch.distributed, which is the latest distributed communication + package for PyTorch + +.. automodule:: torch.distributed.deprecated +.. currentmodule:: torch.distributed.deprecated + +Currently torch.distributed.deprecated supports four backends, each with +different capabilities. The table below shows which functions are available +for use with CPU / CUDA tensors. +MPI supports cuda only if the implementation used to build PyTorch supports it. + + ++------------+-----------+-----------+-----------+-----------+ +| Backend | ``tcp`` | ``gloo`` | ``mpi`` | ``nccl`` | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| Device | CPU | GPU | CPU | GPU | CPU | GPU | CPU | GPU | ++============+=====+=====+=====+=====+=====+=====+=====+=====+ +| send | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| recv | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| broadcast | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| all_reduce | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| reduce | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| all_gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| scatter | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| barrier | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ + +.. _distributed-deprecated-basics: + +Basics +------ + +The `torch.distributed.deprecated` package provides PyTorch support and communication primitives +for multiprocess parallelism across several computation nodes running on one or more +machines. 
The class :func:`torch.nn.parallel.deprecated.DistributedDataParallel` builds on this +functionality to provide synchronous distributed training as a wrapper around any +PyTorch model. This differs from the kinds of parallelism provided by +:doc:`multiprocessing` and :func:`torch.nn.DataParallel` in that it supports +multiple network-connected machines and in that the user must explicitly launch a separate +copy of the main training script for each process. + +In the single-machine synchronous case, `torch.distributed.deprecated` or the +:func:`torch.nn.parallel.deprecated.DistributedDataParallel` wrapper may still have advantages over other +approaches to data-parallelism, including :func:`torch.nn.DataParallel`: + +* Each process maintains its own optimizer and performs a complete optimization step with each + iteration. While this may appear redundant, since the gradients have already been gathered + together and averaged across processes and are thus the same for every process, this means + that no parameter broadcast step is needed, reducing time spent transferring tensors between + nodes. +* Each process contains an independent Python interpreter, eliminating the extra interpreter + overhead and "GIL-thrashing" that comes from driving several execution threads, model + replicas, or GPUs from a single Python process. This is especially important for models that + make heavy use of the Python runtime, including models with recurrent layers or many small + components. + +Initialization +-------------- + +The package needs to be initialized using the :func:`torch.distributed.deprecated.init_process_group` +function before calling any other methods. This blocks until all processes have +joined. + +.. autofunction:: init_process_group + +.. autofunction:: get_rank + +.. autofunction:: get_world_size + +-------------------------------------------------------------------------------- + +Currently three initialization methods are supported: + +TCP initialization +^^^^^^^^^^^^^^^^^^ + +There are two ways to initialize using TCP, both requiring a network address +reachable from all processes and a desired ``world_size``. The first way +requires specifying an address that belongs to the rank 0 process. This +initialization method requires that all processes have manually specified ranks. + +Alternatively, the address has to be a valid IP multicast address, in which case +ranks can be assigned automatically. Multicast initialization also supports +a ``group_name`` argument, which allows you to use the same address for multiple +jobs, as long as they use different group names. + +:: + + import torch.distributed.deprecated as dist + + # Use address of one of the machines + dist.init_process_group(backend, init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4) + + # or a multicast address - rank will be assigned automatically if unspecified + dist.init_process_group(backend, init_method='tcp://[ff15:1e18:5d4c:4cf0:d02d:b659:53ba:b0a7]:23456', + world_size=4) + +Shared file-system initialization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Another initialization method makes use of a file system that is shared and +visible from all machines in a group, along with a desired ``world_size``. The URL should start +with ``file://`` and contain a path to a non-existent file (in an existing +directory) on a shared file system. 
This initialization method also supports a +``group_name`` argument, which allows you to use the same shared file path for +multiple jobs, as long as they use different group names. + +.. warning:: + This method assumes that the file system supports locking using ``fcntl`` - most + local systems and NFS support it. + +:: + + import torch.distributed.deprecated as dist + + # Rank will be assigned automatically if unspecified + dist.init_process_group(backend, init_method='file:///mnt/nfs/sharedfile', + world_size=4, group_name=args.group) + +Environment variable initialization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This method will read the configuration from environment variables, allowing +one to fully customize how the information is obtained. The variables to be set +are: + +* ``MASTER_PORT`` - required; has to be a free port on machine with rank 0 +* ``MASTER_ADDR`` - required (except for rank 0); address of rank 0 node +* ``WORLD_SIZE`` - required; can be set either here, or in a call to init function +* ``RANK`` - required; can be set either here, or in a call to init function + +The machine with rank 0 will be used to set up all connections. + +This is the default method, meaning that ``init_method`` does not have to be specified (or +can be ``env://``). + +Groups +------ + +By default collectives operate on the default group (also called the world) and +require all processes to enter the distributed function call. However, some workloads can benefit +from more fine-grained communication. This is where distributed groups come +into play. :func:`~torch.distributed.deprecated.new_group` function can be +used to create new groups, with arbitrary subsets of all processes. It returns +an opaque group handle that can be given as a ``group`` argument to all collectives +(collectives are distributed functions to exchange information in certain well-known programming patterns). + +.. autofunction:: new_group + +Point-to-point communication +---------------------------- + +.. autofunction:: send + +.. autofunction:: recv + +:func:`~torch.distributed.deprecated.isend` and :func:`~torch.distributed.deprecated.irecv` +return distributed request objects when used. In general, the type of this object is unspecified +as they should never be created manually, but they are guaranteed to support two methods: + +* ``is_completed()`` - returns True if the operation has finished +* ``wait()`` - will block the process until the operation is finished. + ``is_completed()`` is guaranteed to return True once it returns. + +When using the MPI backend, :func:`~torch.distributed.deprecated.isend` and :func:`~torch.distributed.deprecated.irecv` +support non-overtaking, which has some guarantees on supporting message order. For more detail, see +http://mpi-forum.org/docs/mpi-2.2/mpi22-report/node54.htm#Node54 + +.. autofunction:: isend + +.. autofunction:: irecv + +Collective functions +-------------------- + +.. autofunction:: broadcast + +.. autofunction:: all_reduce + +.. autofunction:: reduce + +.. autofunction:: all_gather + +.. autofunction:: gather + +.. autofunction:: scatter + +.. 
autofunction:: barrier + +Multi-GPU collective functions +------------------------------ + +If you have more than one GPU on each node, when using the NCCL backend, +:func:`~torch.distributed.deprecated.broadcast_multigpu` +:func:`~torch.distributed.deprecated.all_reduce_multigpu` +:func:`~torch.distributed.deprecated.reduce_multigpu` and +:func:`~torch.distributed.deprecated.all_gather_multigpu` support distributed collective +operations among multiple GPUs within each node. These functions can potentially +improve the overall distributed training performance and be easily used by +passing a list of tensors. Each Tensor in the passed tensor list needs +to be on a separate GPU device of the host where the function is called. Note +that the length of the tensor list needs to be identical among all the +distributed processes. Also note that currently the multi-GPU collective +functions are only supported by the NCCL backend. + +For example, if the system we use for distributed training has 2 nodes, each +of which has 8 GPUs. On each of the 16 GPUs, there is a tensor that we would +like to all-reduce. The following code can serve as a reference: + +Code running on Node 0 + +:: + + import torch + import torch.distributed.deprecated as dist + + dist.init_process_group(backend="nccl", + init_method="file:///distributed_test", + world_size=2, + rank=0) + tensor_list = [] + for dev_idx in range(torch.cuda.device_count()): + tensor_list.append(torch.FloatTensor([1]).cuda(dev_idx)) + + dist.all_reduce_multigpu(tensor_list) + +Code running on Node 1 + +:: + + import torch + import torch.distributed.deprecated as dist + + dist.init_process_group(backend="nccl", + init_method="file:///distributed_test", + world_size=2, + rank=1) + tensor_list = [] + for dev_idx in range(torch.cuda.device_count()): + tensor_list.append(torch.FloatTensor([1]).cuda(dev_idx)) + + dist.all_reduce_multigpu(tensor_list) + +After the call, all 16 tensors on the two nodes will have the all-reduced value +of 16 + +.. autofunction:: broadcast_multigpu + +.. autofunction:: all_reduce_multigpu + +.. autofunction:: reduce_multigpu + +.. autofunction:: all_gather_multigpu + + +Launch utility +-------------- + +The `torch.distributed.deprecated` package also provides a launch utility in +`torch.distributed.deprecated.launch`. + +.. automodule:: torch.distributed.launch diff --git a/docs/source/index.rst b/docs/source/index.rst index 5b37b3301f3db2..7732366d38a61c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. optim torch.autograd torch.distributed + torch.distributed.deprecated torch.distributions torch.multiprocessing bottleneck From f84693efa96317ffb3a1c68045fa924d2d6bf2b2 Mon Sep 17 00:00:00 2001 From: Duc Ngo Date: Tue, 11 Sep 2018 04:35:44 -0700 Subject: [PATCH 026/237] nomnigraph - Improvements to subgraph matching APIs (#11418) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11418 Several improvements that aim to make the APIs more straightforward to use - Get rid of helper methods subgraph and nonTerminal . Users now should create a NNMatchGraph directly via graph's createNode and createEdge API - Get rid of operatorSubgraph helper method - invertGraphTraversal flag applies to both the match graph and the scanned graph. This allows user to create match graph in the same direction as the scanned graph, thus reduce confusion. 
- additional parameters of matchNode (count, includeInSubgraph, nonTerminal) are removed from the constructors and moved into setter methods. (We no longer enforce that MatchNode is immutable but this helps improve code clarity). - Tests are updated to reflect the changes Follow up changes: - Possibly clean up the tests further. This change aims to minimally modify the unit tests. - Help a validity check that enforce the current limitation of the match graph (single source node), and throws if the match graph does not satisfy the criteria. - Have the single source node be detected automatically and callers just need to pass in the matchGraph instead of the source node reference. Differential Revision: D9732565 fbshipit-source-id: ae8320e2bc89b867f6bb4b1c1aad635f4b219fa1 --- .../nomnigraph/Representations/NeuralNet.cc | 15 ++- .../nomnigraph/Representations/NeuralNet.h | 17 ++- .../Transformations/SubgraphMatcher.h | 100 +++++++++--------- caffe2/core/nomnigraph/tests/NeuralNetTest.cc | 31 +++--- .../nomnigraph/tests/SubgraphMatcherTest.cc | 62 ++++++----- caffe2/python/pybind_state_nomni.cc | 24 +++-- 6 files changed, 127 insertions(+), 122 deletions(-) diff --git a/caffe2/core/nomnigraph/Representations/NeuralNet.cc b/caffe2/core/nomnigraph/Representations/NeuralNet.cc index 2765b48435e7bd..28f33a43cbff2f 100644 --- a/caffe2/core/nomnigraph/Representations/NeuralNet.cc +++ b/caffe2/core/nomnigraph/Representations/NeuralNet.cc @@ -216,17 +216,14 @@ NNNodeMatchCriteria criteriaSingleConsumer() { "Single consumer"); } -NNNodeMatchCriteria matchTensor() { - return matchOp("matchTensor"); +NNNodeMatchCriteria matchTensor(const std::string& debugString) { + return matchOp(debugString); } -NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria, - int count) { - return subgraph( - g, matchTensor(), {subgraph(g, root, childrenCriteria)}, count); +NNMatchNode matchExternalTensorNode(const std::string& debugString) { + return NNMatchNode(matchTensor(debugString)) + .nonTerminal() + .excludeFromSubgraph(); } } // namespace nn diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index f4c3940acd0711..09273a5fe15981 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -474,8 +474,6 @@ NNNodeMatchCriteria matchOp(const std::string& debugString = "matchOp") { debugString); } -CAFFE2_API NNNodeMatchCriteria matchTensor(); - template NNNodeMatchCriteria matchOp( const std::function predicate, @@ -489,6 +487,12 @@ NNNodeMatchCriteria matchOp( debugString); }; +CAFFE2_API NNNodeMatchCriteria +matchTensor(const std::string& debugString = "matchTensor"); + +CAFFE2_API NNMatchNode +matchExternalTensorNode(const std::string& debugString = "matchExternalTensor"); + struct CAFFE2_API NNNodeMatch { static bool isMatch( const NNGraph::NodeRef& node, @@ -500,15 +504,6 @@ struct CAFFE2_API NNNodeMatch { using NNSubgraphMatcher = nom::matcher::SubgraphMatcher; -// This helper method makes it easy to create matching criteria in NNGraph. -// For example, operatorSubgraph(opMatch, ...) will refer to a tree like this: -// ... 
-> opMatch -> opMatch_Output -CAFFE2_API NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria = {}, - int count = 1); - } // namespace nn } // namespace repr diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index f11975284b217c..66494dba3275ce 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -32,15 +32,8 @@ template class CAFFE2_API MatchNode { public: static const int kStarCount = -1; - MatchNode( - const NodeMatchCriteria& criteria, - bool includeInSubgraph = true, - int count = 1, - bool nonTerminal = false) - : criteria_(criteria), - includeInSubgraph_(includeInSubgraph), - count_(count), - nonTerminal_(nonTerminal) {} + + MatchNode(const NodeMatchCriteria& criteria) : criteria_(criteria) {} MatchNode() = default; MatchNode(const MatchNode&) = default; @@ -55,6 +48,25 @@ class CAFFE2_API MatchNode { return count_; } + MatchNode& count(int count) { + count_ = count; + return *this; + } + + MatchNode& starCount() { + return count(kStarCount); + } + + MatchNode& nonTerminal() { + nonTerminal_ = true; + return *this; + } + + MatchNode& excludeFromSubgraph() { + includeInSubgraph_ = false; + return *this; + } + bool isNonTerminal() const { return nonTerminal_; } @@ -65,9 +77,9 @@ class CAFFE2_API MatchNode { private: NodeMatchCriteria criteria_; - bool includeInSubgraph_; - int count_; - bool nonTerminal_; + int count_ = 1; + bool includeInSubgraph_ = true; + bool nonTerminal_ = false; }; template @@ -76,38 +88,12 @@ using MatchGraph = Graph>; template using MatchNodeRef = typename MatchGraph::NodeRef; -template -MatchNodeRef subgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - const std::vector>& children, - int count = 1, - bool includeInSubgraph = true) { - auto result = graph.createNode( - MatchNode(root, includeInSubgraph, count, false)); - for (auto child : children) { - graph.createEdge(result, child); - } - return result; -} - -// Note that for nonTerminalSubgraph, the default value for includeInSubgraph -// is false since we typically do not want to include a nonTerminal node -// in the matched subgraph. -template -MatchNodeRef nonTerminalSubgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - int count = 1, - bool includeInSubgraph = false) { - return graph.createNode( - MatchNode(root, includeInSubgraph, count, true)); -} - // TODO: Reuse convertToDotString once convertToDotString can work // with subgraph. template -std::string debugString(MatchNodeRef rootCriteriaRef) { +std::string debugString( + MatchNodeRef rootCriteriaRef, + bool invertGraphTraversal) { std::ostringstream out; auto rootNode = rootCriteriaRef->data(); out << "{rootCriteria = '" << rootNode.getCriteria() << "'"; @@ -117,11 +103,14 @@ std::string debugString(MatchNodeRef rootCriteriaRef) { if (rootNode.isNonTerminal()) { out << ", nonTerminal = " << rootNode.isNonTerminal(); } - auto outEdges = rootCriteriaRef->getOutEdges(); - if (!outEdges.empty()) { + auto edges = invertGraphTraversal ? rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + if (!edges.empty()) { out << ", childrenCriteria = ["; - for (auto& child : outEdges) { - out << debugString(child->head()) << ", "; + for (auto& child : edges) { + auto nextNode = invertGraphTraversal ? 
child->tail() : child->head(); + out << debugString(nextNode, invertGraphTraversal) + << ", "; } out << "]"; } @@ -294,7 +283,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " is not the same as " << matchedNode << " which previously matched criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -307,7 +297,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " does not match criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -326,8 +317,10 @@ struct SubgraphMatcher { invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); int numEdges = edges.size(); - const auto outEdges = rootCriteriaRef->getOutEdges(); - int numChildrenCriteria = outEdges.size(); + const auto criteriaEdges = invertGraphTraversal + ? rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + int numChildrenCriteria = criteriaEdges.size(); // The current algorithm implies that the ordering of the children is // important. The children nodes will be matched with the children subgraph @@ -336,7 +329,9 @@ struct SubgraphMatcher { int currentEdgeIdx = 0; for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; criteriaIdx++) { - auto childrenCriteriaRef = outEdges[criteriaIdx]->head(); + auto childrenCriteriaRef = invertGraphTraversal + ? criteriaEdges[criteriaIdx]->tail() + : criteriaEdges[criteriaIdx]->head(); int expectedCount = childrenCriteriaRef->data().getCount(); bool isStarCount = @@ -374,7 +369,7 @@ struct SubgraphMatcher { debugMessage << "Child node at " << child << " does not match child criteria " << debugString( - childrenCriteriaRef) + childrenCriteriaRef, invertGraphTraversal) << ". 
We expected " << expectedCount << " matches but only found " << countMatch << "."; return SubgraphMatchResultType::notMatched(debugMessage.str()); @@ -399,7 +394,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Expected " << expectedCount << " matches for child criteria " - << debugString(childrenCriteriaRef) + << debugString( + childrenCriteriaRef, invertGraphTraversal) << " but only found " << countMatch; return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { diff --git a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc index 32c0f20f3571bf..874da120b5be8f 100644 --- a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc +++ b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc @@ -43,27 +43,28 @@ TEST(NeuralNetGraph, ReplaceGraph) { */ auto mg = NNMatchGraph(); - // clang-format off - auto matchSumOutput = operatorSubgraph(mg, - matchOp(), { - nonTerminalSubgraph(mg, matchTensor(), 2) - });; - auto pattern = subgraph(mg, - matchOp(), { - matchSumOutput - }); - // clang-format on + auto matchSumInput = + mg.createNode(std::move(matchExternalTensorNode().count(2))); + auto matchSum = mg.createNode(matchOp("matchSum")); + mg.createEdge(matchSumInput, matchSum); + + auto matchSumOutput = mg.createNode(matchTensor("matchSumOutput")); + mg.createEdge(matchSum, matchSumOutput); + + auto matchRelu = mg.createNode(matchOp("matchRelu")); + mg.createEdge(matchSumOutput, matchRelu); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, pattern).isMatch()); + auto matchRoot = matchRelu; + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, matchRoot).isMatch()); EXPECT_FALSE( - NNSubgraphMatcher::isSubgraphMatch(reluOutput, pattern).isMatch()); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, pattern).isMatch()); + NNSubgraphMatcher::isSubgraphMatch(reluOutput, matchRoot).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, matchRoot).isMatch()); - EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, pattern).isMatch()); + EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, matchRoot).isMatch()); NNSubgraphMatcher::replaceSubgraph( graph, - pattern, + matchRoot, [&matchSumOutput]( NNGraph& g, NNGraph::NodeRef relu, diff --git a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc index 7ed996aa5f1bdc..ee677665c6546d 100644 --- a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc +++ b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc @@ -41,27 +41,22 @@ TestMatchGraph::NodeRef Tree( const Criteria& root, const std::vector& children = {}, int count = 1) { - return subgraph(graph, root, children, count); + auto result = graph.createNode(std::move(TestMatchNode(root).count(count))); + for (auto& child : children) { + graph.createEdge(result, child); + } + return result; } TestMatchGraph::NodeRef NonTerminal(const Criteria& root, int count = 1) { - return nonTerminalSubgraph(graph, root, count); + return graph.createNode( + std::move(TestMatchNode(root).count(count).nonTerminal())); } Criteria any() { return Criteria("*"); } -// Make it more concise to create matching criteria in dataflow graph. -// For example, operatorTree("opA", ...) will refer to a tree like this: -// ... 
-> opA -> opA_Output -TestMatchGraph::NodeRef operatorTree( - const Criteria& root, - const std::vector& childrenCriteria = {}, - int count = 1) { - return Tree(any(), {Tree(root, childrenCriteria)}, count); -} - std::map TestGraphNodePrinter( TestGraph::NodeRef node) { std::map labelMap; @@ -185,20 +180,35 @@ struct DataFlowTestGraphCriteria { TestMatchGraph::NodeRef matchOpG; DataFlowTestGraphCriteria() { - // clang-format off - matchOpCOutput = operatorTree("opC", { - NonTerminal(Criteria("input"), TestMatchNode::kStarCount) - }); - matchOpG = Tree( - Criteria("opG"),{ - operatorTree("opF", { - operatorTree("opB", { - matchOpCOutput, matchOpCOutput, - }) - }), - NonTerminal(any()) // matches dataI - }); - // clang-format on + auto matchOpCInputs = + graph.createNode(std::move(TestMatchNode(Criteria("input")) + .starCount() + .nonTerminal() + .excludeFromSubgraph())); + auto matchOpC = graph.createNode(Criteria("opC")); + graph.createEdge(matchOpCInputs, matchOpC); + + matchOpCOutput = graph.createNode(any()); + graph.createEdge(matchOpC, matchOpCOutput); + + auto matchOpB = graph.createNode(Criteria("opB")); + graph.createEdge(matchOpCOutput, matchOpB); + graph.createEdge(matchOpCOutput, matchOpB); + + auto matchOpBOutput = graph.createNode(any()); + graph.createEdge(matchOpB, matchOpBOutput); + + auto matchOpF = graph.createNode(Criteria("opF")); + graph.createEdge(matchOpBOutput, matchOpF); + + auto matchOpFOutput = graph.createNode(any()); + graph.createEdge(matchOpF, matchOpFOutput); + + matchOpG = graph.createNode(Criteria("opG")); + auto matchDataI = graph.createNode( + std::move(TestMatchNode(any()).nonTerminal().excludeFromSubgraph())); + graph.createEdge(matchOpFOutput, matchOpG); + graph.createEdge(matchDataI, matchOpG); } }; diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index fbfe143f66cee0..a8cbb661d69c21 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -233,9 +233,11 @@ void addNomnigraphMethods(pybind11::module& m) { auto nnOp = nn::get(node); return opName == nnOp->getName(); }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("node"), @@ -243,9 +245,11 @@ void addNomnigraphMethods(pybind11::module& m) { .def( "createNode", [](nn::NNMatchGraph* g, nom::repr::Tensor& tensor, bool strict) { - return g->createNode( - nom::matcher::MatchNode( - nn::matchTensor(), true, 1, !strict)); + auto node = nn::NNMatchNode(nn::matchTensor()); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("tensor"), @@ -255,9 +259,11 @@ void addNomnigraphMethods(pybind11::module& m) { [](nn::NNMatchGraph* g, bool strict) { auto match = nn::NNNodeMatchCriteria( [](NNGraph::NodeRef node) { return true; }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("strict") = false) From 0ddbe668cda54271aeca99adefcf052f163847eb Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 05:56:17 -0700 Subject: [PATCH 027/237] Improve shape analysis to cover all most commonly used ops (#11358) Summary: 
[Here's a list](https://gist.github.com/apaszke/f0821840bdcc67a977832dc58acc1b85) of ops that are in `register_aten_ops.cpp`, but aren't supported in shape prop. Everything else should work now. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11358 Differential Revision: D9753693 Pulled By: apaszke fbshipit-source-id: efeae0126ce16cb56b8797fc5246405588bcae3c --- test/test_jit.py | 58 +- tools/jit/gen_jit_dispatch.py | 6 +- tools/jit/templates/register_aten_ops.cpp | 4 - torch/csrc/jit/graph_executor.cpp | 5 +- torch/csrc/jit/ivalue.h | 29 + torch/csrc/jit/passes/shape_analysis.cpp | 805 ++++++++++++++++++++-- torch/csrc/jit/pybind.h | 7 +- torch/csrc/jit/python_ir.cpp | 8 +- torch/csrc/jit/type.h | 2 +- 9 files changed, 834 insertions(+), 90 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 382daa03ec1614..ffd2c853481eb9 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -7059,6 +7059,19 @@ def forward(self, x, y): 'test_split_dim_neg0', } +EXCLUDE_TYPE_CHECK = { + # slogdet tests use itemgetter to select its only differentiable output, + # but this happens outside of the graph we handle, so there are fewer + # reference outputs than graph outputs. + 'test_slogdet_1x1_neg_det', + 'test_slogdet_1x1_pos_det', + 'test_slogdet_distinct_singular_values', + 'test_slogdet_neg_det', + 'test_slogdet_pos_det', + 'test_slogdet_symmetric', + 'test_slogdet_symmetric_pd', +} + # known to be failing in script EXCLUDE_SCRIPT = { # TODO: Fix var/std @@ -7182,7 +7195,9 @@ def traced_fn(*inputs, **kwargs): fn_tensors, inputs_tensors = partial_apply_nontensors(fn, inputs, **kwargs) traced = torch.jit.trace(fn_tensors, inputs_tensors) self.assertExportImport(traced.graph, inputs_tensors) - return traced(*inputs_tensors) + output = traced(*inputs_tensors) + traced_fn.last_graph = traced.graph_for(*inputs_tensors) + return output return traced_fn script_template = ''' @@ -7222,12 +7237,30 @@ def script_fn(*args, **kwargs): script = script_template.format(', '.join(formals), call) CU = torch.jit.CompilationUnit(script) self.assertExportImport(CU.the_method.graph, tensors) - - return output_process_fn(CU.the_method(*tensors)) + output = output_process_fn(CU.the_method(*tensors)) + script_fn.last_graph = CU.the_method.graph_for(*tensors) + return output return script_fn -def check_against_reference(self, func, reference_func, args, kwargs=None, allow_unused=True): +def check_output_types(self, func, ref_outputs, args, kwargs): + graph = getattr(func, 'last_graph', None) + if not isinstance(ref_outputs, tuple): + ref_outputs = (ref_outputs,) + types = [o.type() for o in graph.outputs()] + self.assertEqual(len(types), len(ref_outputs)) + for i, (t, ref_out) in enumerate(zip(types, ref_outputs)): + if isinstance(ref_out, list): + assert len(ref_out) > 0 + elem = ref_out[0] + assert isinstance(elem, torch.Tensor) + self.assertTrue(t.isSubtypeOf(torch._C.ListType.ofTensors())) + else: + ref_type = torch._C.Type.inferFrom(ref_out) + self.assertTrue(ref_type.isSubtypeOf(t)) + + +def check_against_reference(self, func, reference_func, args, kwargs=None, allow_unused=True, check_types=True): kwargs = kwargs if kwargs else {} def allSum(vs): @@ -7252,6 +7285,9 @@ def clone_inputs(requires_grad): outputs_test = func(*nograd_inputs, **kwargs) self.assertEqual(outputs, outputs_test) + if check_types: + check_output_types(self, func, outputs_test, nograd_inputs, kwargs) + # test single grad case outputs = reference_func(*recording_inputs, **kwargs) grads = 
torch.autograd.grad(allSum(outputs), recording_tensors, @@ -7577,15 +7613,19 @@ def fn(*inputs, **kwargs): output = getattr(inputs[0], name)(*inputs[1:], **kwargs) return output_process_fn(output) + check_types = test_name not in EXCLUDE_TYPE_CHECK + if not is_inplace and name not in EXCLUDE_GRADCHECK and not exclude_tensor_method(name, test_name): if test_name not in EXCLUDE_TRACED: check_against_reference(self, create_traced_fn(self, fn), - fn, (self_variable,) + args_variable, kwargs_variable) + fn, (self_variable,) + args_variable, kwargs_variable, + check_types=check_types) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, create_script_fn(self, name, 'method', output_process_fn), - fn, (self_variable,) + args_variable, kwargs_variable) + fn, (self_variable,) + args_variable, kwargs_variable, + check_types=check_types) # functional interface tests if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL: @@ -7597,12 +7637,14 @@ def fn(*inputs, **kwargs): f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(self, fn), fn, f_args_variable, kwargs_variable) + check_against_reference(self, create_traced_fn(self, fn), fn, + f_args_variable, kwargs_variable, check_types=check_types) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, create_script_fn(self, name, 'functional', output_process_fn), - fn, f_args_variable, kwargs_variable) + fn, f_args_variable, kwargs_variable, + check_types=check_types) check(name) inplace_name = name + '_' diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index ff7fce56e91552..31ab5ed110f0de 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -56,11 +56,11 @@ def jit_type_of(arg): # map from aten 'simple_type' to the function that will turn a tensor into # that type FROM_IVALUE = { - 'Device': 'as_device({}.toIntList()->elements())', + 'Device': '{}.to()', 'IntList': '{}.toIntList()->elements()', - 'Layout': 'static_cast({}.toInt())', + 'Layout': '{}.to()', 'Scalar': '{}.toScalar()', - 'ScalarType': 'static_cast({}.toInt())', + 'ScalarType': '{}.to()', 'Tensor': '{}.toTensor()', 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 4ac499b99faaed..de205de52a484e 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -50,10 +50,6 @@ std::array as_bool_array(at::ArrayRef vec) { return res; } -at::Device as_device(ArrayRef elements) { - return at::Device(static_cast(elements[0]), elements[1]); -} - RegisterOperators reg({ ${constructors} }); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index cdfeab0e730179..d77f1b2be1aa28 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -379,6 +379,10 @@ struct GraphExecutorImpl { // Phase 2. Propagate detailed information about the spec through the // graph (enabled more specializations in later passes). + // Shape propagation sometimes depends on certain arguments being + // constants, and constant propagation doesn't need shape information + // anyway, so it's better to run it first. + ConstantPropagation(opt_graph); PropagateInputShapes(*opt_graph, spec); // Phase 3. Run differentiable optimizations (i.e. 
simple graph rewrites that @@ -427,7 +431,6 @@ struct GraphExecutorImpl { EliminateDeadCode(graph); EliminateCommonSubexpression(graph); UnrollLoops(graph); - ConstantPropagation(graph); PeepholeOptimize(graph); CheckInplace(graph); BatchMM(graph); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 7d9bea6e678eec..3602e7deb8eb89 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -358,6 +358,35 @@ DEFINE_TO(std::vector, toTensorListRef) #undef DEFINE_TO +#define DEFINE_TO_WITH_BODY(type, body) \ +template<> \ +inline type IValue::to() && { \ + body(std::move(*this)); \ +} \ +template<> \ +inline type IValue::to() const & { \ + body((*this)); \ +} + +#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); +#define LAYOUT_BODY(this) return static_cast(this.toInt()); +#define DEVICE_BODY(this) \ + /* NB: const_list might be a move of the vector, so we need to */ \ + /* assign it to prevent its deallocation. */ \ + auto && const_list = this.toIntList(); \ + const auto & elems = const_list->elements(); \ + JIT_ASSERT(elems.size() == 2); \ + return at::Device(static_cast(elems[0]), elems[1]); + +DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) +DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) +DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) + +#undef DEFINE_TO_WITH_BODY +#undef SCALAR_TYPE_BODY +#undef LAYOUT_BODY +#undef DEVICE_BODY + inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { as_intrusive_ptr = v.release(); diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 7064e2b4b365a1..7499546f27684f 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -208,8 +208,7 @@ bool PropagateShapeOnNodeByRunningIt(Node* node) { // However, we allow primitive returns because we want to support mixed // primitive/tensor outputs. -bool PropagateTensorShapeOnNode( - Node * node, bool insert_expands, std::vector types); +bool PropagateTensorShapeOnNode(Node * node, bool insert_expands); bool PropagateCompleteShapeOnNode( Node * node, bool insert_expands, std::vector types); @@ -306,8 +305,30 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } case prim::ImplicitTensorToNum: case prim::TensorToNum: - case prim::NumToTensor: return; // correct num type is already set + case prim::NumToTensor: { + if (node->input()->type()->isSubtypeOf(IntType::get())) { + node->output()->setType(TensorType::create(at::kLong, -1, 0)); + } else { + JIT_ASSERT(node->input()->type()->isSubtypeOf(FloatType::get())); + node->output()->setType(TensorType::create(at::kDouble, -1, 0)); + } + return; + } + case prim::TupleConstruct: { + // We refresh the tuple type, because the input types could have been refined. 
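+      // (e.g. an input that was still Dynamic when the tuple was constructed may now
+      // carry a concrete TensorType, and the element types should reflect that.)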
+ node->output()->setType(TupleType::create(fmap(node->inputs(), [](Value *v) { return v->type(); }))); + return; + } + case prim::TupleUnpack: { + auto tuple_type = node->input()->type()->cast(); + JIT_ASSERT(tuple_type && tuple_type->elements().size() == node->outputs().size()); + auto elems = tuple_type->elements(); + for (size_t i = 0; i < node->outputs().size(); ++i) { + node->output(i)->setType(elems[i]); + } + return; + } case prim::Constant: { if(node->output()->type()->isSubtypeOf(DynamicType::get())) { node->output()->inferTypeFrom(node->t(attr::value)); @@ -344,10 +365,8 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } } - if (auto maybe_tensor_types = gatherTensorTypes(node)) { - if (PropagateTensorShapeOnNode(node, insert_expands, std::move(*maybe_tensor_types))) { - return; - } + if (PropagateTensorShapeOnNode(node, insert_expands)) { + return; } if (PropagateShapeOnNodeByRunningIt(node)) { @@ -356,9 +375,11 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { return setUnshapedType(node); } -bool PropagateTensorShapeOnNode(Node * node, bool insert_expands, - std::vector tensor_types) { +bool PropagateTensorShapeOnNode(Node * node, bool insert_expands) { static const auto broadcast = [](std::vector& tensor_types) -> TensorTypePtr { + if (tensor_types.size() == 1) { + return tensor_types[0]; + } JIT_ASSERT(!tensor_types.empty()); auto any_type = tensor_types[0]; auto max_dims = any_type->dim(); @@ -367,85 +388,727 @@ bool PropagateTensorShapeOnNode(Node * node, bool insert_expands, } return TensorType::create(any_type->scalarType(), any_type->device(), max_dims); }; + + using type_vec_t = std::vector; + // Formula is expected to return a vector of length equal to the number of tensor + // outputs of the node, or an empty vector which implies that it failed to propagate. 
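+  // For example, the formula registered below for simple unary ops just returns the
+  // (possibly refined) TensorType of the node's first input, unchanged.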
+ using formula_t = std::function; + static std::mutex shape_formulas_mutex; + static std::vector> shape_formulas; + struct register_formula_for { + register_formula_for(OperatorSet operators, formula_t formula) { + std::unique_lock lock {shape_formulas_mutex}; + shape_formulas.emplace_back(std::move(operators), std::move(formula)); + } + }; + + // Requirements: + // dims : preserved + // scalar type : preserved + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + static const register_formula_for simple_unary_ops {{ + "aten::abs(Tensor self) -> Tensor", + "aten::acos(Tensor self) -> Tensor", + "aten::neg(Tensor self) -> Tensor", + "aten::t(Tensor self) -> Tensor", + "aten::sigmoid(Tensor self) -> Tensor", + "aten::tanh(Tensor self) -> Tensor", + "aten::exp(Tensor self) -> Tensor", + "aten::relu(Tensor self) -> Tensor", + "aten::asin(Tensor self) -> Tensor", + "aten::atan(Tensor self) -> Tensor", + "aten::ceil(Tensor self) -> Tensor", + "aten::clone(Tensor self) -> Tensor", + "aten::contiguous(Tensor self) -> Tensor", + "aten::bernoulli(Tensor self) -> Tensor", + "aten::celu(Tensor self, Scalar alpha) -> Tensor", + "aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", + "aten::clamp_max(Tensor self, Scalar max) -> Tensor", + "aten::clamp_min(Tensor self, Scalar min) -> Tensor", + "aten::alpha_dropout(Tensor input, float p, int train) -> Tensor", + "aten::bernoulli(Tensor self, float p, Generator generator) -> Tensor", + "aten::cos(Tensor self) -> Tensor", + "aten::cosh(Tensor self) -> Tensor", + "aten::digamma(Tensor self) -> Tensor", + "aten::dropout(Tensor input, float p, int train) -> Tensor", + "aten::elu(Tensor self, Scalar alpha, Scalar scale, Scalar input_scale) -> Tensor", + "aten::erf(Tensor self) -> Tensor", + "aten::erfc(Tensor self) -> Tensor", + "aten::erfinv(Tensor self) -> Tensor", + "aten::exp(Tensor self) -> Tensor", + "aten::expm1(Tensor self) -> Tensor", + "aten::log(Tensor self) -> Tensor", + "aten::log10(Tensor self) -> Tensor", + "aten::log1p(Tensor self) -> Tensor", + "aten::log2(Tensor self) -> Tensor", + "aten::log_sigmoid(Tensor self) -> Tensor", + "aten::log_softmax(Tensor self, int dim) -> Tensor", + "aten::floor(Tensor self) -> Tensor", + "aten::frac(Tensor self) -> Tensor", + "aten::flip(Tensor self, int[] dims) -> Tensor", + "aten::feature_alpha_dropout(Tensor input, float p, int train) -> Tensor", + "aten::feature_dropout(Tensor input, float p, int train) -> Tensor", + "aten::hardshrink(Tensor self, Scalar lambd) -> Tensor", + "aten::hardtanh(Tensor self, Scalar min_val, Scalar max_val) -> Tensor", + "aten::glu(Tensor self, int dim) -> Tensor", + "aten::inverse(Tensor self) -> Tensor", + "aten::leaky_relu(Tensor self, Scalar negative_slope) -> Tensor", + "aten::lgamma(Tensor self) -> Tensor", + "aten::mvlgamma(Tensor self, int p) -> Tensor", + "aten::normal(float mean, Tensor std, *, Generator generator) -> Tensor", + "aten::normal(Tensor mean, float std, *, Generator generator) -> Tensor", + "aten::permute(Tensor self, int[] dims) -> Tensor", + "aten::pin_memory(Tensor self) -> Tensor", + "aten::pinverse(Tensor self, float rcond) -> Tensor", + "aten::reciprocal(Tensor self) -> Tensor", + "aten::relu(Tensor self) -> Tensor", + "aten::round(Tensor self) -> Tensor", + "aten::rrelu(Tensor self, Scalar lower, Scalar upper, int training, Generator generator) -> Tensor", + "aten::rsqrt(Tensor self) -> Tensor", + "aten::selu(Tensor self) -> Tensor", + 
"aten::sigmoid(Tensor self) -> Tensor", + "aten::sign(Tensor self) -> Tensor", + "aten::sin(Tensor self) -> Tensor", + "aten::sinh(Tensor self) -> Tensor", + "aten::softmax(Tensor self, int dim) -> Tensor", + "aten::softplus(Tensor self, Scalar beta, Scalar threshold) -> Tensor", + "aten::softshrink(Tensor self, Scalar lambd) -> Tensor", + "aten::sqrt(Tensor self) -> Tensor", + "aten::tan(Tensor self) -> Tensor", + "aten::tanh(Tensor self) -> Tensor", + "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", + "aten::transpose(Tensor self, int dim0, int dim1) -> Tensor", + "aten::tril(Tensor self, int diagonal) -> Tensor", + "aten::triu(Tensor self, int diagonal) -> Tensor", + "aten::trunc(Tensor self) -> Tensor", + "aten::rot90(Tensor self, int k, int[] dims) -> Tensor", + "aten::narrow(Tensor self, int dim, int start, int length) -> Tensor", + "aten::slice(Tensor self, int dim, int start, int end, int step) -> Tensor", + "aten::alias(Tensor self) -> Tensor", + "aten::detach(Tensor self) -> Tensor", + "aten::cumprod(Tensor self, int dim) -> Tensor", + "aten::cumsum(Tensor self, int dim) -> Tensor", + + "aten::empty_like(Tensor self) -> Tensor", + "aten::full_like(Tensor self, Scalar fill_value) -> Tensor", + "aten::ones_like(Tensor self) -> Tensor", + "aten::rand_like(Tensor self) -> Tensor", + "aten::randint_like(Tensor self, int high) -> Tensor", + "aten::randint_like(Tensor self, int low, int high) -> Tensor", + "aten::randn_like(Tensor self) -> Tensor", + "aten::zeros_like(Tensor self) -> Tensor", + }, [](Node * node) -> type_vec_t { + auto input_type = node->input(0)->type()->cast(); + return input_type ? type_vec_t{input_type} : type_vec_t{}; + }}; + + // Requirements: + // dims : broadcast all tensor args + // scalar type : always matching and preserved + // device : always matching and preserved + // tensor inputs : * + // tensor outputs : 1 + static const register_formula_for broadcasting_ops {{ + // Tensor-Tensor operators + "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::mul(Tensor self, Tensor other) -> Tensor", + "aten::div(Tensor self, Tensor other) -> Tensor", + "aten::pow(Tensor self, Tensor exponent) -> Tensor", + "aten::min(Tensor self, Tensor other) -> Tensor", + "aten::max(Tensor self, Tensor other) -> Tensor", + "aten::fmod(Tensor self, Tensor other) -> Tensor", + "aten::remainder(Tensor self, Tensor other) -> Tensor", + "aten::lerp(Tensor self, Tensor end, Scalar weight) -> Tensor", + "aten::max(Tensor self, Tensor other) -> Tensor", + "aten::min(Tensor self, Tensor other) -> Tensor", + "aten::__and__(Tensor self, Tensor other) -> Tensor", + "aten::__or__(Tensor self, Tensor other) -> Tensor", + "aten::__xor__(Tensor self, Tensor other) -> Tensor", + "aten::__lshift__(Tensor self, Tensor other) -> Tensor", + "aten::__rshift__(Tensor self, Tensor other) -> Tensor", + "aten::__iand__(Tensor self, Tensor other) -> Tensor", + "aten::__ior__(Tensor self, Tensor other) -> Tensor", + "aten::__ixor__(Tensor self, Tensor other) -> Tensor", + "aten::__ilshift__(Tensor self, Tensor other) -> Tensor", + "aten::__irshift__(Tensor self, Tensor other) -> Tensor", + + // Tensor-Scalar operators + "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::mul(Tensor self, Scalar other) -> Tensor", + "aten::div(Tensor self, Scalar other) -> Tensor", + "aten::pow(Tensor self, Scalar exponent) 
-> Tensor", + "aten::fmod(Tensor self, Scalar other) -> Tensor", + "aten::remainder(Tensor self, Scalar other) -> Tensor", + "aten::add(Scalar other, Tensor self) -> Tensor", + "aten::sub(Scalar other, Tensor self) -> Tensor", + "aten::mul(Scalar other, Tensor self) -> Tensor", + "aten::div(Scalar other, Tensor self) -> Tensor", + "aten::pow(Scalar base, Tensor self) -> Tensor", + "aten::__and__(Tensor self, Scalar other) -> Tensor", + "aten::__or__(Tensor self, Scalar other) -> Tensor", + "aten::__xor__(Tensor self, Scalar other) -> Tensor", + "aten::__lshift__(Tensor self, Scalar other) -> Tensor", + "aten::__rshift__(Tensor self, Scalar other) -> Tensor", + "aten::__iand__(Tensor self, Scalar other) -> Tensor", + "aten::__ior__(Tensor self, Scalar other) -> Tensor", + "aten::__ixor__(Tensor self, Scalar other) -> Tensor", + "aten::__ilshift__(Tensor self, Scalar other) -> Tensor", + "aten::__irshift__(Tensor self, Scalar other) -> Tensor", + + // Ops with Tensor-Tensor overloads only + "aten::atan2(Tensor self, Tensor other) -> Tensor", + + // Non-binary ops + "aten::where(Tensor condition, Tensor self, Tensor other) -> Tensor", + "aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value) -> Tensor", + "aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto maybe_tensor_types = gatherTensorTypes(node)) { + return {broadcast(*maybe_tensor_types)}; + } + return {}; + }}; + + static const auto any_tensor_type = [](Node * node) -> TensorTypePtr { + for (Value * input : node->inputs()) { + if (auto type = input->type()->cast()) { + return type; + } + } + return nullptr; + }; + + // Requirements: + // dims : always matching and preserved + // scalar type : always matching and preserved + // device : always matching and preserved + // tensor inputs : 2 + // tensor outputs : 1 + static const register_formula_for binary_ops_strict_match {{ + "aten::normal(Tensor mean, Tensor std, *, Generator generator) -> Tensor", + "aten::bernoulli(Tensor self, Tensor p, Generator generator) -> Tensor", + "aten::mm(Tensor self, Tensor mat2) -> Tensor", + "aten::bmm(Tensor self, Tensor mat2) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = any_tensor_type(node)) { + return {type}; + } + return {}; + }}; + + // Requirements: + // dims : all tensor args are broadcast + // scalar type : byte/uint8 + // device : always matching and preserved + // tensor inputs : * + // tensor outputs : 1 + static const register_formula_for comparison_ops {{ + "aten::lt(Tensor self, Tensor other) -> Tensor", + "aten::le(Tensor self, Tensor other) -> Tensor", + "aten::gt(Tensor self, Tensor other) -> Tensor", + "aten::ge(Tensor self, Tensor other) -> Tensor", + "aten::eq(Tensor self, Tensor other) -> Tensor", + "aten::ne(Tensor self, Tensor other) -> Tensor", + "aten::lt(Tensor self, Scalar other) -> Tensor", + "aten::le(Tensor self, Scalar other) -> Tensor", + "aten::gt(Tensor self, Scalar other) -> Tensor", + "aten::ge(Tensor self, Scalar other) -> Tensor", + "aten::eq(Tensor self, Scalar other) -> Tensor", + "aten::ne(Tensor self, Scalar other) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto maybe_tensor_types = gatherTensorTypes(node)) { + return {broadcast(*maybe_tensor_types)->toScalarType(at::kByte)}; + } + return {}; + }}; + + // Requirements: + // dims : preserved from the first argument + // scalar type : preserved from the first argument (doesn't have to match other arguments) + // device : 
always matching and preserved + // tensor inputs : * + // tensor outputs : 1 + // NB: those ops (with slight adjustments) are good candidates for restarts. + // Knowing the type and device of weights or biases is usually enough to + // infer the output type. + static const register_formula_for nn_ops_first_input_preserving {{ + "aten::batch_norm(Tensor input, Tensor weight, Tensor bias, Tensor running_mean, Tensor running_var, int training, float momentum, float eps, int cudnn_enabled) -> Tensor", + "aten::conv1d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor", + "aten::conv2d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor", + "aten::conv3d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor", + "aten::conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad) -> Tensor", + "aten::conv_transpose1d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] output_padding, int groups, int[] dilation) -> Tensor", + "aten::conv_transpose2d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] output_padding, int groups, int[] dilation) -> Tensor", + "aten::conv_transpose3d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] output_padding, int groups, int[] dilation) -> Tensor", + "aten::convolution(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int transposed, int[] output_padding, int groups) -> Tensor", + "aten::adaptive_avg_pool1d(Tensor self, int[] output_size) -> Tensor", + "aten::adaptive_avg_pool2d(Tensor self, int[] output_size) -> Tensor", + "aten::adaptive_avg_pool3d(Tensor self, int[] output_size) -> Tensor", + "aten::avg_pool1d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int ceil_mode, int count_include_pad) -> Tensor", + "aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int ceil_mode, int count_include_pad) -> Tensor", + "aten::avg_pool3d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int ceil_mode, int count_include_pad) -> Tensor", + "aten::max_pool1d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, int ceil_mode) -> Tensor", + "aten::max_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, int ceil_mode) -> Tensor", + "aten::max_pool3d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, int ceil_mode) -> Tensor", + "aten::max_unpool2d(Tensor self, Tensor indices, int[] output_size) -> Tensor", + "aten::max_unpool3d(Tensor self, Tensor indices, int[] output_size, int[] stride, int[] padding) -> Tensor", + "aten::reflection_pad1d(Tensor self, int[] padding) -> Tensor", + "aten::reflection_pad2d(Tensor self, int[] padding) -> Tensor", + "aten::replication_pad1d(Tensor self, int[] padding) -> Tensor", + "aten::replication_pad2d(Tensor self, int[] padding) -> Tensor", + "aten::replication_pad3d(Tensor self, int[] padding) -> Tensor", + "aten::upsample_bilinear2d(Tensor self, int[] output_size, int align_corners) -> Tensor", + "aten::upsample_linear1d(Tensor self, int[] output_size, int align_corners) -> Tensor", + "aten::upsample_nearest1d(Tensor self, int[] output_size) -> Tensor", + "aten::upsample_nearest2d(Tensor self, int[] output_size) -> Tensor", + "aten::upsample_nearest3d(Tensor self, int[] output_size) -> Tensor", + 
"aten::upsample_trilinear3d(Tensor self, int[] output_size, int align_corners) -> Tensor", + "aten::prelu(Tensor self, Tensor weight) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->input(0)->type()->cast()) { + return {type}; + } + return {}; + }}; + + // Requirements: + // dims : 0 + // scalar type : preserved + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + static const register_formula_for all_reduce_ops {{ + "aten::argmax(Tensor self) -> Tensor", + "aten::argmin(Tensor self) -> Tensor", + "aten::det(Tensor self) -> Tensor", + "aten::logdet(Tensor self) -> Tensor", + "aten::max(Tensor self) -> Tensor", + "aten::min(Tensor self) -> Tensor", + "aten::mean(Tensor self) -> Tensor", + "aten::median(Tensor self) -> Tensor", + "aten::norm(Tensor self, Scalar p) -> Tensor", + "aten::std(Tensor self, int unbiased) -> Tensor", + "aten::sum(Tensor self) -> Tensor", + "aten::trace(Tensor self) -> Tensor", + "aten::var(Tensor self, int unbiased) -> Tensor", + "aten::all(Tensor self) -> Tensor", + "aten::any(Tensor self) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->input(0)->type()->cast()) { + return {type->withDim(0)}; + } + return {}; + }}; + + // Requirements: + // dims : 0 + // scalar type : preserved if floating point, otherwise long/int64 + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + static const register_formula_for all_reduce_ops_with_integer_upcast {{ + "aten::sum(Tensor self) -> Tensor", + "aten::prod(Tensor self) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->input(0)->type()->cast()) { + return {at::isFloatingType(type->scalarType()) ? 
type->withDim(0) : type->withDim(0)->toScalarType(at::kLong)}; + } + return {}; + }}; + + static const auto multidim_reduce_with_postprocess = + [](Node * node, size_t num_reduced_dim, bool upcast_integer) -> type_vec_t { + auto maybe_keepdim = node->get(attr::keepdim); + if (!maybe_keepdim) return {}; + if (auto type = node->input(0)->type()->cast()) { + if (upcast_integer && !at::isFloatingType(type->scalarType())) { + type = type->toScalarType(at::kLong); + } + if (*maybe_keepdim) { + return {type}; + } else if (type->dim() > num_reduced_dim) { + return {type->withDim(type->dim() - num_reduced_dim)}; + } + } + return {}; + }; + + // Requirements: + // dims : preserved if keepdim == false, 1 smaller otherwise + // scalar type : preserved for first output, byte/uint8 for second output if exists + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 or 2 + // Additionally: + // - First input should be the only tensor input + // - Has a bool keepdim argument + static const register_formula_for dim_reduce_ops {{ + "aten::argmax(Tensor self, int dim, int keepdim) -> Tensor", + "aten::argmin(Tensor self, int dim, int keepdim) -> Tensor", + "aten::max_values(Tensor self, int dim, int keepdim) -> Tensor", + "aten::min_values(Tensor self, int dim, int keepdim) -> Tensor", + "aten::mean(Tensor self, int dim, int keepdim) -> Tensor", + "aten::norm(Tensor self, Scalar p, int dim, int keepdim) -> Tensor", + "aten::std(Tensor self, int dim, int unbiased, int keepdim) -> Tensor", + "aten::var(Tensor self, int dim, int unbiased, int keepdim) -> Tensor", + "aten::logsumexp(Tensor self, int dim, int keepdim) -> Tensor", + "aten::all(Tensor self, int dim, int keepdim) -> Tensor", + "aten::any(Tensor self, int dim, int keepdim) -> Tensor", + + // Ops returning indices as second output + "aten::kthvalue(Tensor self, int k, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::max(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::min(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::median(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::mode(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + }, [](Node * node) -> type_vec_t { + // NB: Note that while this function is generally meant to be used with ops that + // have a single output, we will fix up its return right below. 
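+    // (For the two-output ops above, the second output holds indices: it shares the
+    // shape of the values but always has scalar type long, which is patched in below.)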
+ auto output_types = multidim_reduce_with_postprocess(node, /*num_reduce_dim=*/1, /*integer_upcast=*/false); + if (!output_types.empty() && node->outputs().size() == 2) { + output_types.push_back(output_types.back()->toScalarType(at::kLong)); + } + return output_types; + }}; + + // Requirements: + // dims : preserved if keepdim == false, 1 smaller otherwise + // scalar type : preserved if floating point, otherwise long/int64 + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + // - has a bool keepdim argument + static const register_formula_for dim_reduce_ops_with_integer_upcast {{ + "aten::prod(Tensor self, int dim, int keepdim) -> Tensor", + }, [](Node * node) -> type_vec_t { + return multidim_reduce_with_postprocess(node, /*num_reduce_dim=*/1, /*integer_upcast=*/true); + }}; + + // Requirements: + // dims : preserved if keepdim == false, 1 smaller otherwise + // scalar type : preserved if floating point, otherwise long/int64 + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - has bool keepdim and int[] dim arguments + static const register_formula_for multidim_reduce_ops_with_integer_upcast {{ + "aten::sum(Tensor self, int[] dim, int keepdim) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto dim = node->get>(attr::dim)) { + // TODO: can dim contain duplicates? + return multidim_reduce_with_postprocess(node, /*num_reduce_dim=*/dim->size(), /*integer_upcast=*/true); + } + return {}; + }}; + + static const auto get_device_int = [](at::optional dev) -> at::optional { + if (!dev) return {}; + if (dev->is_cpu()) { + return {-1}; + } + return dev->has_index() ? at::optional{dev->index()} : at::nullopt; + }; + static const auto factory_with_ndim = [](Node * node, int dim) -> type_vec_t{ + auto maybe_layout = node->get(attr::layout); + if (!maybe_layout || maybe_layout != at::kStrided) return {}; + auto maybe_device = get_device_int(node->get(attr::device)); + if (!maybe_device) return {}; + auto maybe_scalar_type = node->get(attr::dtype); + if (!maybe_scalar_type) return {}; + return {TensorType::create(*maybe_scalar_type, *maybe_device, dim)}; + }; + + // Requirements: + // dims : preserved + // scalar type : equal to value of dtype + // device : equal to value of device + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - has ScalarType dtype, Layeout layout and Device device arguments + static const register_formula_for like_factories_with_options {{ + "aten::empty_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::full_like(Tensor self, Scalar fill_value, *, int dtype, int layout, int[] device) -> Tensor", + "aten::ones_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::rand_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint_like(Tensor self, int high, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint_like(Tensor self, int low, int high, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randn_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::zeros_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->namedInput(attr::self)->type()->cast()) { + return factory_with_ndim(node, type->dim()); + } + return {}; + }}; + + // Requirements: + // dims : equal to number of elements in size + // scalar type : equal 
to value of dtype + // device : equal to value of device + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - has int[] size, ScalarType dtype, Layeout layout and Device device arguments + static const register_formula_for size_factories_with_options {{ + "aten::empty(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::full(int[] size, Scalar fill_value, *, int dtype, int layout, int[] device) -> Tensor", + "aten::ones(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::rand(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randn(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::zeros(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint(int high, int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint(int low, int high, int[] size, *, int dtype, int layout, int[] device) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto maybe_size = node->get>(attr::size)) { + return factory_with_ndim(node, maybe_size->size()); + } + return {}; + }}; + + static const auto get_cast_scalar_type = [](Node *node) -> at::ScalarType { + switch (node->kind()) { + case aten::_cast_Byte: return at::kByte; + case aten::_cast_Char: return at::kChar; + case aten::_cast_Double: return at::kDouble; + case aten::_cast_Float: return at::kFloat; + case aten::_cast_Half: return at::kHalf; + case aten::_cast_Int: return at::kInt; + case aten::_cast_Long: return at::kLong; + case aten::_cast_Short: return at::kShort; + default: AT_ASSERTM(false, "unknown node kind in get_cast_scalar_type: ", node->kind().toQualString()); + } + }; + static const register_formula_for cast_ops {{ + "aten::_cast_Byte(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Char(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Double(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Float(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Half(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Int(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Long(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Short(Tensor self, int non_blocking) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->namedInput(attr::self)->type()->cast()) { + return {type->toScalarType(get_cast_scalar_type(node))}; + } + return {}; + }}; + + // First, try to match one of the registered formulas to their operator sets. + for (auto & entry : shape_formulas) { + if (entry.first.find(node)) { + auto types = entry.second(node); + if (types.empty()) { + return false; + } else { + auto outputs = node->outputs(); + JIT_ASSERT(types.size() == outputs.size()); + for (size_t i = 0; i < types.size(); ++i) { + JIT_ASSERT(outputs[i]->type()->isSubtypeOf(DynamicType::get())); + outputs[i]->setType(types[i]); + } + return true; + } + } + } + + // This section implements shape prop for an assorted set of nodes that only + // need partial information about their input types. 
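+  // For example, aten::dot below only needs the dtype and device of one of its
+  // inputs to conclude that it returns a zero-dimensional tensor of that kind.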
+ const auto input_type = [node](size_t index) { + return node->input(index)->type()->cast(); + }; + if (node->matches("aten::masked_select(Tensor self, Tensor mask) -> Tensor")) { + auto type = input_type(0); + auto mask_type = input_type(1); + if (type && mask_type) { + if (type->dim() == 0 && mask_type->dim() == 0) { + node->output()->setType(type->withDim(0)); + } else { + node->output()->setType(type->withDim(1)); + } + return true; + } + if (auto type = input_type(0)) { + node->output()->setType(type->withDim(1)); + return true; + } + } else if (node->matches("aten::dot(Tensor self, Tensor tensor) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(0)); + return true; + } + } else if (node->matches("aten::mv(Tensor self, Tensor vec) -> Tensor") || + node->matches("aten::addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta, Scalar alpha) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(1)); + return true; + } + } else if (node->matches("aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta, Scalar alpha) -> Tensor") || + node->matches("aten::addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta, Scalar alpha) -> Tensor") || + node->matches("aten::addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta, Scalar alpha) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(2)); + return true; + } + } else if (node->matches("aten::baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta, Scalar alpha) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(3)); + return true; + } + } else if (node->matches("aten::index_select(Tensor self, int dim, Tensor index) -> Tensor")) { + auto type = input_type(0); + auto index_type = input_type(1); + // index_select behaves very weirdly when self.dim() == 0. It allows both 0D and 1D + // indices, and returns a value that has as many dimensions as index. + if (type && index_type) { + if (type->dim() == 0) { + node->output()->setType(type->withDim(index_type->dim())); + } else { + node->output()->setType(type); + } + return true; + } + } else if (node->matches("aten::gather(Tensor self, int dim, Tensor index) -> Tensor")) { + auto type = input_type(0); + auto index_type = input_type(1); + // Gather has this annoying edge case where index always needs to match the number of + // dims of self, **except** when self is 1D and index is 0D in which case we return + // a 0D output. 
+ if (type && index_type) { + if (index_type->dim() == 0) { + node->output()->setType(type->withDim(0)); + } else { + node->output()->setType(type); + } + return true; + } + } else if (node->matches("aten::embedding(Tensor weight, Tensor indices, int padding_idx, int scale_grad_by_freq, int sparse) -> Tensor")) { + auto weight_type = input_type(0); + auto indices_type = input_type(1); + if (weight_type && indices_type) { + node->output()->setType(weight_type->withDim(indices_type->dim() + 1)); + return true; + } + } else if (node->matches("aten::bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor bias) -> Tensor")) { + if (auto type = input_type(0)) { + node->output()->setType(type); + return true; + } + if (auto type = input_type(1)) { + node->output()->setType(type); + return true; + } + } else if (node->matches("aten::dist(Tensor self, Tensor other, Scalar p) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(0)); + return true; + } + } + + // The code below implements formulas that need type information for all their + // tensor inputs, and have exactly one output. + std::vector tensor_types; + static const auto reshape_prop = + [](Node * node, Symbol shape_input, const std::vector& tensor_types) -> TensorTypePtr { + if (auto shape = node->get>(shape_input)) { + return tensor_types.at(0)->withDim(shape->size()); + } + return nullptr; + }; const auto getSingleOutputType = [&]() -> TypePtr { - if (node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || - node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || - node->matches("aten::mul(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::pow(Tensor self, Tensor exponent) -> Tensor") || - node->matches("aten::min(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::max(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta, Scalar alpha) -> Tensor")) { - return broadcast(tensor_types); - } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor") || - node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor") || - node->matches("aten::mul(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor") || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor")) { - return tensor_types.at(0); - } else if (node->matches("aten::lt(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::le(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::eq(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::ne(Tensor self, Tensor other) -> Tensor")) { - return broadcast(tensor_types)->toScalarType(at::kByte); - } else if (node->matches("aten::lt(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::le(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::gt(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::ge(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::eq(Tensor self, Scalar other) -> Tensor") || - 
node->matches("aten::ne(Tensor self, Scalar other) -> Tensor")) { - return tensor_types.at(0)->toScalarType(at::kByte); - } else if (node->matches("aten::neg(Tensor self) -> Tensor") || - node->matches("aten::t(Tensor self) -> Tensor") || - node->matches("aten::sigmoid(Tensor self) -> Tensor") || - node->matches("aten::tanh(Tensor self) -> Tensor") || - node->matches("aten::exp(Tensor self) -> Tensor") || - node->matches("aten::relu(Tensor self) -> Tensor") || - node->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor") || - node->matches("aten::narrow(Tensor self, int dim, int start, int length) -> Tensor") || - node->matches("aten::index_select(Tensor self, int dim, Tensor index) -> Tensor")) { - return tensor_types.at(0); - } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { return tensor_types.at(0)->toScalarType(tensor_types.at(1)->scalarType()); - } else if (node->matches("aten::sum(Tensor self) -> Tensor")) { - // TODO: this depends on the dtype argument. why don't we have access to it in here? - // TODO: integral types are upcast + } else if (node->matches("aten::view_as(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::expand_as(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::reshape_as(Tensor self, Tensor other) -> Tensor")) { + return tensor_types.at(0)->withDim(tensor_types.at(1)->dim()); + } else if (node->matches("aten::view(Tensor self, int[] size) -> Tensor") || + node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor") || + node->matches("aten::as_strided(Tensor self, int[] size, int[] stride) -> Tensor") || + node->matches("aten::as_strided(Tensor self, int[] size, int[] stride, int storage_offset) -> Tensor")) { + return reshape_prop(node, attr::size, tensor_types); + } else if (node->matches("aten::reshape(Tensor self, int[] shape) -> Tensor")) { + return reshape_prop(node, attr::shape, tensor_types); + } else if (node->matches("aten::repeat(Tensor self, int[] repeats) -> Tensor")) { + return reshape_prop(node, attr::repeats, tensor_types); + } else if (node->matches("aten::unsqueeze(Tensor self, int dim) -> Tensor")) { auto & t = tensor_types.at(0); - return TensorType::create(t->scalarType(), t->device(), 0); - } else if (node->matches("aten::sum(Tensor self, int[] dim, int keepdim) -> Tensor", - /*with_const=*/attr::keepdim)) { + return t->withDim(t->dim() + 1); + } else if (node->matches("aten::select(Tensor self, int dim, int index) -> Tensor") || + node->matches("aten::diagonal(Tensor self, int offset, int dim1, int dim2) -> Tensor")) { auto & t = tensor_types.at(0); - bool keepdim = node->get(attr::keepdim).value(); - if (!keepdim) { - if (auto dims = node->get>(attr::dim)) { - // TODO: do we need to account for duplicates in dim here? - return t->withDim(t->dim() - dims->size()); + return t->dim() > 0 ? 
t->withDim(t->dim() - 1) : nullptr; + } else if (node->matches("aten::matmul(Tensor self, Tensor other) -> Tensor")) { + int dim1 = tensor_types.at(0)->dim(); + int dim2 = tensor_types.at(1)->dim(); + if (dim1 == 1 && dim2 == 1) { + // Dot product + return tensor_types.at(0)->withDim(0); + } else if (dim1 == 2 && dim2 == 2) { + // Matrix multiply + return tensor_types.at(0); + } else if (dim1 == 1 && dim2 == 2) { + // Unsqueeze + matrix multiply + squeeze + return tensor_types.at(0); + } else if (dim1 == 2 && dim2 == 1) { + // Matrix vector multiply + return tensor_types.at(1); + } else { + // Batched matrix multiply (possibly with squeeze + unsqueeze if one argument is 1D) + auto type = broadcast(tensor_types); + if (tensor_types.at(0)->dim() == 1 || tensor_types.at(1)->dim() == 1) { + type = type->withDim(type->dim() - 1); } - return nullptr; + return type; + } + } else if (node->matches("aten::nonzero(Tensor self) -> Tensor")) { + return tensor_types.at(0)->toScalarType(at::kLong); + } else if (node->matches("aten::take(Tensor self, Tensor index) -> Tensor")) { + return tensor_types.at(1)->toScalarType(tensor_types.at(0)->scalarType()); + } else if (node->matches("aten::diagflat(Tensor self, int offset) -> Tensor")) { + return tensor_types.at(0)->withDim(2); + } else if (node->matches("aten::diag(Tensor self, int diagonal) -> Tensor")) { + auto & t = tensor_types.at(0); + if (t->dim() == 1) { + return t->withDim(2); + } else if (t->dim() == 2) { + return t->withDim(1); } else { - return t; + return nullptr; } - return nullptr; - } else if (node->matches("aten::unsqueeze(Tensor self, int dim) -> Tensor")) { + } else if (node->matches("aten::unfold(Tensor self, int dimension, int size, int step) -> Tensor")) { auto & t = tensor_types.at(0); - return t->withDim(t->dim() + 1); - } else if (node->matches("aten::view(Tensor self, int[] size) -> Tensor", /*with_const=*/attr::size) || - node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor", /*with_const=*/attr::size)) { - return tensor_types.at(0)->withDim(node->get>(attr::size)->size()); + return t->dim() == 0 ? 
t : t->withDim(t->dim() + 1); + } else if (node->matches("aten::polygamma(int n, Tensor self) -> Tensor")) { + return tensor_types.at(0); } return nullptr; }; + if (auto maybe_tensor_types = gatherTensorTypes(node)) { + tensor_types = std::move(*maybe_tensor_types); + } else { + return false; + } if (node->outputs().size() == 1) { if (auto type = getSingleOutputType()) { node->output()->setType(type); return true; } } - setUnshapedType(node); return false; } diff --git a/torch/csrc/jit/pybind.h b/torch/csrc/jit/pybind.h index 30a03bdd350ca1..f517302ec121f3 100644 --- a/torch/csrc/jit/pybind.h +++ b/torch/csrc/jit/pybind.h @@ -24,7 +24,12 @@ template <> struct type_caster { PYBIND11_TYPE_CASTER(torch::jit::IValue, _("IValue")); bool load(handle src, bool) { - return false; + try { + value = torch::jit::toIValue(src); + return true; + } catch (std::exception& e) { + return false; + } } static handle cast(torch::jit::IValue src, return_value_policy /* policy */, handle /* parent */) { diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 416de46f1e6953..11ec332251b6c5 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -440,6 +440,11 @@ void initPythonIRBindings(PyObject * module_) { .def("__repr__",[](Type & t) { return t.python_str(); }) + .def("str",[](Type & t) { + std::ostringstream s; + s << t; + return s.str(); + }) .def("kind",[](Type& t_) { Type * t = &t_; switch(t->kind()) { @@ -486,7 +491,8 @@ void initPythonIRBindings(PyObject * module_) { }) .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { return self->isSubtypeOf(other); - }); + }) + .def_static("inferFrom", inferTypeFrom); py::class_>(m, "NumberType") .def_static("get", &NumberType::get); diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index f3be38cbb2b005..e812f2e02fdee5 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -270,7 +270,7 @@ struct TORCH_API CompleteTensorType : public TensorType { if (rhs->kind() == TypeKind::DynamicType) return true; if (rhs->kind() == TypeKind::TensorType) - return *dynamic_cast(this) == *rhs; + return *expect() == *rhs; return *this == *rhs; } std::string str() const override { From 120d7694327dafcfca5fa4447f438772c9b1c729 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 05:58:11 -0700 Subject: [PATCH 028/237] Add support for tracing strings (#11506) Summary: This enabled `torch.einsum` both in tracing and in script mode. It's used all over Pyro at the moment, and is needed for any use of the JIT in there. Fixes #11157. 
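A minimal usage sketch of what this enables (the equation string and shapes below are illustrative, not taken from the patch): tracing a function that calls torch.einsum now records the equation instead of failing.

    import torch

    def outer(x, y):
        return torch.einsum('i,j->ij', (x, y))

    # the equation should show up as a string constant in the traced graph
    traced = torch.jit.trace(outer, (torch.randn(3), torch.randn(4)))
    print(traced.graph)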
zdevito fritzo neerajprad Pull Request resolved: https://github.com/pytorch/pytorch/pull/11506 Differential Revision: D9764787 Pulled By: apaszke fbshipit-source-id: 9b5251b9e7c5897034602bd07ff67b425d33326c --- test/expect/TestJit.test_cpp_cuda.expect | 12 ++++++------ test/expect/TestScript.test_string_cu.expect | 4 ++-- test/test_jit.py | 15 +++++++++++++++ tools/jit/gen_jit_dispatch.py | 4 +++- torch/csrc/jit/constants.cpp | 4 ++-- torch/csrc/jit/interned_strings.h | 1 - torch/csrc/jit/ir.cpp | 2 +- torch/csrc/jit/operator.cpp | 1 + torch/csrc/jit/tracer.cpp | 2 +- 9 files changed, 31 insertions(+), 14 deletions(-) diff --git a/test/expect/TestJit.test_cpp_cuda.expect b/test/expect/TestJit.test_cpp_cuda.expect index 54a3c16d459c69..50d2cbe963833c 100644 --- a/test/expect/TestJit.test_cpp_cuda.expect +++ b/test/expect/TestJit.test_cpp_cuda.expect @@ -105,19 +105,19 @@ graph(%0 : Float(2, 3, 4) %3 : Float(2, 3, 4) %4 : Float(2, 3, 4)) { %5 : int = prim::Constant[value=1]() - %6 : Float(2, 3, 4), %7 : Float(2, 3, 4) = prim::GradOf[name=aten::add](%0) + %6 : Float(2, 3, 4), %7 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%0) block0() { %8 : Float(2, 3, 4) = aten::mul(%0, %5) -> (%0, %8) } - %9 : Float(2, 3, 4), %10 : Float(2, 3, 4) = prim::GradOf[name=aten::mul](%6) + %9 : Float(2, 3, 4), %10 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%6) block0() { %11 : Float(2, 3, 4) = aten::mul(%6, %2) %12 : Float(2, 3, 4) = aten::mul(%6, %4) -> (%11, %12) } %13 : Dynamic = prim::AutogradAdd(%1, %9) - %14 : Float(2, 3, 4), %15 : Float(2, 3, 4) = prim::GradOf[name=aten::mul](%13) + %14 : Float(2, 3, 4), %15 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%13) block0() { %16 : Float(2, 3, 4) = aten::mul(%13, %3) %17 : Float(2, 3, 4) = aten::mul(%13, %2) @@ -146,19 +146,19 @@ graph(%0 : Float(2, 3, 4) %2 : Float(2, 3, 4) %3 : Float(2, 3, 4)) { %4 : int = prim::Constant[value=1]() - %5 : Float(2, 3, 4), %6 : Float(2, 3, 4) = prim::GradOf[name=aten::add](%0) + %5 : Float(2, 3, 4), %6 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%0) block0() { %7 : Float(2, 3, 4) = aten::mul(%0, %4) -> (%0, %7) } - %8 : Float(2, 3, 4), %9 : Float(2, 3, 4) = prim::GradOf[name=aten::mul](%5) + %8 : Float(2, 3, 4), %9 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%5) block0() { %10 : Float(2, 3, 4) = aten::mul(%5, %2) %11 : Float(2, 3, 4) = aten::mul(%5, %3) -> (%10, %11) } %12 : Dynamic = prim::AutogradAdd(%1, %8) - %13 : Float(2, 3, 4), %14 : Float(2, 3, 4) = prim::GradOf[name=aten::add](%12) + %13 : Float(2, 3, 4), %14 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%12) block0() { %15 : Float(2, 3, 4) = aten::mul(%12, %4) -> (%12, %15) diff --git a/test/expect/TestScript.test_string_cu.expect b/test/expect/TestScript.test_string_cu.expect index 0fd663ca5cc661..cefcd07bbba130 100644 --- a/test/expect/TestScript.test_string_cu.expect +++ b/test/expect/TestScript.test_string_cu.expect @@ -1,7 +1,7 @@ graph(%a : Dynamic) { %2 : int = prim::Constant[value=2]() - %1 : string = prim::Constant[string=a\n\tb\n]() - %3 : string = prim::Constant[string=aa]() + %1 : string = prim::Constant[value="a\n\tb\n"]() + %3 : string = prim::Constant[value="aa"]() = prim::Print(%a, %1, %2, %3) return (%a); } diff --git a/test/test_jit.py b/test/test_jit.py index ffd2c853481eb9..1b880fe3b0fdec 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -230,6 +230,9 @@ def getExportImportCopy(self, m): os.unlink(f.name) return imported + def assertGraphContains(self, graph, kind): + self.assertTrue(any(n.kind() == kind for n in 
graph.nodes())) + def assertExpectedONNXGraph(self, trace, *args, **kwargs): torch.onnx._optimize_trace(trace, operator_export_type=OperatorExportTypes.ONNX) self.assertExpectedGraph(trace, *args, **kwargs) @@ -1322,6 +1325,18 @@ def foo(a): x = torch.randn(5, 5) self.assertEqual(foo(x), x + x + x) + def test_einsum(self): + def outer(x, y): + return torch.einsum('i,j->ij', (x, y)) + + traced = torch.jit.trace(outer, (torch.randn(4), torch.randn(5))) + script = torch.jit.script(outer) + fns = [traced, script] + x, y = torch.randn(10), torch.randn(2) + for fn in [traced, script]: + self.assertGraphContains(fn.graph, kind='aten::einsum') + self.assertEqual(fn(x, y), outer(x, y)) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "calls .cuda()") @skipIfRocm diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 31ab5ed110f0de..2f24986c39cdc3 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -25,6 +25,7 @@ 'std::array': 'bool[2]', 'std::array': 'bool[3]', 'std::array': 'bool[4]', + 'std::string': 'str', 'Scalar': 'Scalar', 'Tensor': 'Tensor', 'TensorList': 'Tensor[]', @@ -66,6 +67,7 @@ def jit_type_of(arg): 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 'int64_t': '{}.toInt()', + 'std::string': '{}.toString()->string()', 'std::array': 'as_bool_array<2>({}.toIntList()->elements())', 'std::array': 'as_bool_array<3>({}.toIntList()->elements())', 'std::array': 'as_bool_array<4>({}.toIntList()->elements())', @@ -121,7 +123,7 @@ def is_magic_method(api_name): return api_name.startswith('__') and api_name.endswith('__') -blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'std::string', 'void*'} +blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'void*'} default_only_types = {'Generator'} diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index d7876411c687a6..8bf1ab566b31a9 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -33,7 +33,7 @@ Value* insertConstant( })); n->output()->setType(ListType::ofTensors()); } else if(val.isString()) { - n->s_(attr::string, val.toString()->string()); + n->s_(attr::value, val.toString()->string()); n->output()->setType(StringType::get()); } else { throw constant_not_supported_error("Unsupported value kind: " + val.tagKind()); @@ -86,7 +86,7 @@ RegisterOperators reg({ return 0; }; } else if (type == StringType::get()) { - auto s = node->s(attr::string); + auto s = node->s(attr::value); return [s](Stack& stack) { push(stack, s); return 0; diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 6d8f409fa3a2c0..60e086db187472 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -109,7 +109,6 @@ namespace torch { namespace jit { _(attr, transA) \ _(attr, transB) \ _(attr, name) \ - _(attr, string) \ _(attr, a) \ _(attr, b) diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index a65a743ddb24cf..48da3ec9cea913 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -125,7 +125,7 @@ void printAttributes(std::ostream & out, const Node * n, bool ignore_subgraph=fa printPrimList(out,n->is(name)); break; case AttributeKind::s: - out << escapeString(n->s(name)); + out << "\"" << escapeString(n->s(name)) << "\""; break; case AttributeKind::ss: printPrimList(out,n->ss(name)); diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index c9c4ceb0d2d8d3..75e5833535bcfc 
100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -54,6 +54,7 @@ struct SchemaParser { {"Layout", IntType::get() }, {"Device", ListType::ofInts() }, {"Scalar", NumberType::get() }, + {"str", StringType::get() }, {"float", FloatType::get() }, {"int", IntType::get() }, {"bool", IntType::get() }, // TODO: add separate bool type diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 03e31440ef5c95..4f496420956b0b 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -39,8 +39,8 @@ void addInputs(Node *n, const char * name, int64_t value) { detail::g void addInputs(Node *n, const char * name, bool value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, double value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Scalar& value) { detail::genericAddInput(n, value); } +void addInputs(Node *n, const char * name, const std::string& value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Tensor& value) { n->addInput(getValueTrace(value)); } -void addInputs(Node *n, const char * name, const std::string& value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::Generator * value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } From 80fa8e10075f4c8e0de1f209c323bcdb06f03dee Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Tue, 11 Sep 2018 06:50:33 -0700 Subject: [PATCH 029/237] Add .expand() method to distribution classes (#11341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This adds a `.expand` method for distributions that is akin to the `torch.Tensor.expand` method for tensors. It returns a new distribution instance with batch dimensions expanded to the desired `batch_shape`. Since this calls `torch.Tensor.expand` on the distribution's parameters, it does not allocate new memory for the expanded distribution instance's parameters. e.g. ```python >>> d = dist.Normal(torch.zeros(100, 1), torch.ones(100, 1)) >>> d.sample().shape torch.Size([100, 1]) >>> d.expand([100, 10]).sample().shape torch.Size([100, 10]) ``` We have already been using the `.expand` method in Pyro in our [patch](https://github.com/uber/pyro/blob/dev/pyro/distributions/torch.py#L10) of `torch.distributions`. We use this in our models to enable dynamic broadcasting. This has also been requested by a few users on the distributions slack, and we believe will be useful to the larger community. Note that currently, there is no convenient and efficient way to expand distribution instances: - Many distributions use `TransformedDistribution` (or wrap over another distribution instance. e.g. `OneHotCategorical` uses a `Categorical` instance) under the hood, or have lazy parameters. This makes it difficult to collect all the relevant parameters, broadcast them and construct new instances. - In the few cases where this is even possible, the resulting implementation would be inefficient since we will go through a lot of broadcasting and args validation logic in `__init__.py` that can be avoided. The `.expand` method allows for a safe and efficient way to expand distribution instances. 
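As a concrete sketch of the pattern this patch adds, the body of `Normal.expand` boils down to the following (the comments are explanatory additions, not part of the patch):

```python
def expand(self, batch_shape, _instance=None):
    # a bare __new__-based instance of type(self), or the instance a subclass
    # passed in; raises for subclasses with a custom __init__ that did not
    # override .expand themselves
    new = self._get_checked_instance(Normal, _instance)
    batch_shape = torch.Size(batch_shape)
    # expanded views share storage with the original parameters
    new.loc = self.loc.expand(batch_shape)
    new.scale = self.scale.expand(batch_shape)
    # Distribution.__init__ with validate_args=False skips re-validation
    super(Normal, new).__init__(batch_shape, validate_args=False)
    new._validate_args = self._validate_args
    return new
```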
Additionally, this bypasses `__init__.py` (using `__new__` and populating relevant attributes) since we do not need to do any broadcasting or args validation (which was already done when the instance was first created). This can result in significant savings as compared to constructing new instances via `__init__` (that said, the `sample` and `log_prob` methods will probably be the rate determining steps in many applications). e.g. ```python >>> a = dist.Bernoulli(torch.ones([10000, 1]), validate_args=True) >>> %timeit a.expand([10000, 100]) 15.2 µs ± 224 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) >>> %timeit dist.Bernoulli(torch.ones([10000, 100]), validate_args=True) 11.8 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` cc. fritzo, apaszke, vishwakftw, alicanb Pull Request resolved: https://github.com/pytorch/pytorch/pull/11341 Differential Revision: D9728485 Pulled By: soumith fbshipit-source-id: 3b94c23bc6a43ee704389e6287aa83d1e278d52f --- test/test_distributions.py | 60 +++++++++++++++++++ torch/distributions/bernoulli.py | 13 ++++ torch/distributions/beta.py | 8 +++ torch/distributions/binomial.py | 14 +++++ torch/distributions/categorical.py | 15 +++++ torch/distributions/cauchy.py | 9 +++ torch/distributions/chi2.py | 6 ++ torch/distributions/dirichlet.py | 8 +++ torch/distributions/distribution.py | 28 +++++++++ torch/distributions/exponential.py | 8 +++ torch/distributions/fishersnedecor.py | 11 ++++ torch/distributions/gamma.py | 10 +++- torch/distributions/geometric.py | 11 ++++ torch/distributions/gumbel.py | 11 +++- torch/distributions/half_cauchy.py | 12 +++- torch/distributions/half_normal.py | 12 +++- torch/distributions/independent.py | 10 ++++ torch/distributions/laplace.py | 9 +++ torch/distributions/log_normal.py | 12 +++- torch/distributions/logistic_normal.py | 18 +++++- .../lowrank_multivariate_normal.py | 14 +++++ torch/distributions/multinomial.py | 9 +++ torch/distributions/multivariate_normal.py | 19 ++++++ torch/distributions/negative_binomial.py | 14 +++++ torch/distributions/normal.py | 9 +++ torch/distributions/one_hot_categorical.py | 8 +++ torch/distributions/pareto.py | 11 ++-- torch/distributions/poisson.py | 8 +++ torch/distributions/relaxed_bernoulli.py | 27 ++++++++- torch/distributions/relaxed_categorical.py | 24 +++++++- torch/distributions/studentT.py | 11 ++++ torch/distributions/uniform.py | 10 +++- torch/distributions/weibull.py | 18 +++++- 33 files changed, 446 insertions(+), 21 deletions(-) diff --git a/test/test_distributions.py b/test/test_distributions.py index 3aa414a77515e7..60bcec1605b6e5 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -793,6 +793,47 @@ def test_has_examples(self): self.assertIn(Dist, distributions_with_examples, "Please add {} to the EXAMPLES list in test_distributions.py".format(Dist.__name__)) + def test_distribution_expand(self): + shapes = [torch.Size(), torch.Size((2,)), torch.Size((2, 1))] + for Dist, params in EXAMPLES: + if Dist.__name__ == "TransformedDistribution": + continue + for param in params: + for shape in shapes: + d = Dist(**param) + expanded_shape = shape + d.batch_shape + original_shape = d.batch_shape + d.event_shape + expected_shape = shape + original_shape + expanded = d.expand(batch_shape=expanded_shape) + sample = expanded.sample() + actual_shape = expanded.sample().shape + self.assertEqual(expanded.__class__, d.__class__) + self.assertEqual(d.sample().shape, original_shape) + self.assertEqual(expanded.log_prob(sample), 
d.log_prob(sample)) + self.assertEqual(actual_shape, expected_shape) + + def test_distribution_subclass_expand(self): + expand_by = torch.Size((2,)) + for Dist, params in EXAMPLES: + if Dist.__name__ == "TransformedDistribution": + continue + + class SubClass(Dist): + pass + + for param in params: + d = SubClass(**param) + expanded_shape = expand_by + d.batch_shape + original_shape = d.batch_shape + d.event_shape + expected_shape = expand_by + original_shape + expanded = d.expand(batch_shape=expanded_shape) + sample = expanded.sample() + actual_shape = expanded.sample().shape + self.assertEqual(expanded.__class__, d.__class__) + self.assertEqual(d.sample().shape, original_shape) + self.assertEqual(expanded.log_prob(sample), d.log_prob(sample)) + self.assertEqual(actual_shape, expected_shape) + def test_bernoulli(self): p = torch.tensor([0.7, 0.2, 0.4], requires_grad=True) r = torch.tensor(0.3, requires_grad=True) @@ -2178,6 +2219,25 @@ def test_independent_shape(self): except NotImplementedError: pass + def test_independent_expand(self): + for Dist, params in EXAMPLES: + if Dist.__name__ == "TransformedDistribution": + continue + for param in params: + base_dist = Dist(**param) + for reinterpreted_batch_ndims in range(len(base_dist.batch_shape) + 1): + for s in [torch.Size(), torch.Size((2,)), torch.Size((2, 3))]: + indep_dist = Independent(base_dist, reinterpreted_batch_ndims) + expanded_shape = s + indep_dist.batch_shape + expanded = indep_dist.expand(expanded_shape) + expanded_sample = expanded.sample() + expected_shape = expanded_shape + indep_dist.event_shape + self.assertEqual(expanded_sample.shape, expected_shape) + self.assertEqual(expanded.log_prob(expanded_sample), + indep_dist.log_prob(expanded_sample)) + self.assertEqual(expanded.event_shape, indep_dist.event_shape) + self.assertEqual(expanded.batch_shape, expanded_shape) + def test_cdf_icdf_inverse(self): # Tests the invertibility property on the distributions for Dist, params in EXAMPLES: diff --git a/torch/distributions/bernoulli.py b/torch/distributions/bernoulli.py index 9db9691c4ddb9e..067a6eba9a43c4 100644 --- a/torch/distributions/bernoulli.py +++ b/torch/distributions/bernoulli.py @@ -46,6 +46,19 @@ def __init__(self, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(Bernoulli, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Bernoulli, _instance) + batch_shape = torch.Size(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(Bernoulli, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/beta.py b/torch/distributions/beta.py index f23415d5bf0cbd..35a94e8545190b 100644 --- a/torch/distributions/beta.py +++ b/torch/distributions/beta.py @@ -36,6 +36,14 @@ def __init__(self, concentration1, concentration0, validate_args=None): self._dirichlet = Dirichlet(concentration1_concentration0) super(Beta, self).__init__(self._dirichlet._batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Beta, _instance) + batch_shape = torch.Size(batch_shape) + new._dirichlet = self._dirichlet.expand(batch_shape) + super(Beta, 
new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.concentration1 / (self.concentration1 + self.concentration0) diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py index acbb636ce5ee16..ad56856c90aef1 100644 --- a/torch/distributions/binomial.py +++ b/torch/distributions/binomial.py @@ -51,6 +51,20 @@ def __init__(self, total_count=1, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(Binomial, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Binomial, _instance) + batch_shape = torch.Size(batch_shape) + new.total_count = self.total_count.expand(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(Binomial, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 6dc046dfab42d4..267bed8826e7de 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -53,6 +53,21 @@ def __init__(self, probs=None, logits=None, validate_args=None): batch_shape = self._param.size()[:-1] if self._param.ndimension() > 1 else torch.Size() super(Categorical, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Categorical, _instance) + batch_shape = torch.Size(batch_shape) + param_shape = batch_shape + torch.Size((self._num_events,)) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(param_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(param_shape) + new._param = new.logits + new._num_events = self._num_events + super(Categorical, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index dec9cfafe134fb..da31c78abbb413 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -36,6 +36,15 @@ def __init__(self, loc, scale, validate_args=None): batch_shape = self.loc.size() super(Cauchy, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Cauchy, _instance) + batch_shape = torch.Size(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + super(Cauchy, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.loc.new_tensor(nan).expand(self._extended_shape()) diff --git a/torch/distributions/chi2.py b/torch/distributions/chi2.py index fcb0c5b7837727..fe8c5956fb5b68 100644 --- a/torch/distributions/chi2.py +++ b/torch/distributions/chi2.py @@ -1,3 +1,4 @@ +import torch from torch.distributions import constraints from torch.distributions.gamma import Gamma @@ -21,6 +22,11 @@ class Chi2(Gamma): def __init__(self, df, validate_args=None): super(Chi2, self).__init__(0.5 * df, 0.5, validate_args=validate_args) + def expand(self, 
batch_shape, _instance=None): + new = self._get_checked_instance(Chi2, _instance) + batch_shape = torch.Size(batch_shape) + return super(Chi2, self).expand(batch_shape, new) + @property def df(self): return self.concentration * 2 diff --git a/torch/distributions/dirichlet.py b/torch/distributions/dirichlet.py index ca014e0d9da40d..94d66d60c0848c 100644 --- a/torch/distributions/dirichlet.py +++ b/torch/distributions/dirichlet.py @@ -58,6 +58,14 @@ def __init__(self, concentration, validate_args=None): batch_shape, event_shape = concentration.shape[:-1], concentration.shape[-1:] super(Dirichlet, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Dirichlet, _instance) + batch_shape = torch.Size(batch_shape) + new.concentration = self.concentration.expand(batch_shape + self.event_shape) + super(Dirichlet, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=()): shape = self._extended_shape(sample_shape) concentration = self.concentration.expand(shape) diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index 3d1aeb9cadc9c9..8d357a4e9371a0 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -35,6 +35,27 @@ def __init__(self, batch_shape=torch.Size(), event_shape=torch.Size(), validate_ if not constraint.check(getattr(self, param)).all(): raise ValueError("The parameter {} has invalid values".format(param)) + def expand(self, batch_shape, _instance=None): + """ + Returns a new distribution instance (or populates an existing instance + provided by a derived class) with batch dimensions expanded to + `batch_shape`. This method calls :class:`~torch.Tensor.expand` on + the distribution's parameters. As such, this does not allocate new + memory for the expanded distribution instance. Additionally, + this does not repeat any args checking or parameter broadcasting in + `__init__.py`, when an instance is first created. + + Args: + batch_shape (torch.Size): the desired expanded size. + _instance: new instance provided by subclasses that + need to override `.expand`. + + Returns: + New distribution instance with batch dimensions expanded to + `batch_size`. + """ + raise NotImplementedError + @property def batch_shape(self): """ @@ -228,6 +249,13 @@ def _validate_sample(self, value): if not self.support.check(value).all(): raise ValueError('The value argument must be within the support') + def _get_checked_instance(self, cls, _instance=None): + if _instance is None and type(self).__init__ != cls.__init__: + raise NotImplementedError("Subclass {} of {} that defines a custom __init__ method " + "must also define a custom .expand() method.". 
+ format(self.__class__.__name__, cls.__name__)) + return self.__new__(type(self)) if _instance is None else _instance + def __repr__(self): param_names = [k for k, _ in self.arg_constraints.items() if k in self.__dict__] args_string = ', '.join(['{}: {}'.format(p, self.__dict__[p] diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index 85decc0e239fe5..a2fde7b382593c 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -41,6 +41,14 @@ def __init__(self, rate, validate_args=None): batch_shape = torch.Size() if isinstance(rate, Number) else self.rate.size() super(Exponential, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Exponential, _instance) + batch_shape = torch.Size(batch_shape) + new.rate = self.rate.expand(batch_shape) + super(Exponential, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) return self.rate.new(shape).exponential_() / self.rate diff --git a/torch/distributions/fishersnedecor.py b/torch/distributions/fishersnedecor.py index 202621218eea7f..6fe09a7cff14db 100644 --- a/torch/distributions/fishersnedecor.py +++ b/torch/distributions/fishersnedecor.py @@ -37,6 +37,17 @@ def __init__(self, df1, df2, validate_args=None): batch_shape = self.df1.size() super(FisherSnedecor, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(FisherSnedecor, _instance) + batch_shape = torch.Size(batch_shape) + new.df1 = self.df1.expand(batch_shape) + new.df2 = self.df2.expand(batch_shape) + new._gamma1 = self._gamma1.expand(batch_shape) + new._gamma2 = self._gamma2.expand(batch_shape) + super(FisherSnedecor, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): df2 = self.df2.clone() diff --git a/torch/distributions/gamma.py b/torch/distributions/gamma.py index 3f96fc031b2345..655ddac5f0c643 100644 --- a/torch/distributions/gamma.py +++ b/torch/distributions/gamma.py @@ -1,7 +1,6 @@ from numbers import Number import torch -from torch.autograd import Function from torch.distributions import constraints from torch.distributions.exp_family import ExponentialFamily from torch.distributions.utils import _finfo, broadcast_all, lazy_property @@ -48,6 +47,15 @@ def __init__(self, concentration, rate, validate_args=None): batch_shape = self.concentration.size() super(Gamma, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Gamma, _instance) + batch_shape = torch.Size(batch_shape) + new.concentration = self.concentration.expand(batch_shape) + new.rate = self.rate.expand(batch_shape) + super(Gamma, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) value = _standard_gamma(self.concentration.expand(shape)) / self.rate.expand(shape) diff --git a/torch/distributions/geometric.py b/torch/distributions/geometric.py index 196a28b29e7d75..a84963664b4cf3 100644 --- a/torch/distributions/geometric.py +++ b/torch/distributions/geometric.py @@ -45,6 +45,17 @@ def __init__(self, probs=None, logits=None, 
validate_args=None): batch_shape = probs_or_logits.size() super(Geometric, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Geometric, _instance) + batch_shape = torch.Size(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + else: + new.logits = self.logits.expand(batch_shape) + super(Geometric, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return 1. / self.probs - 1. diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index e4e96aa9eb4373..f6d10aa4b0efb9 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -31,15 +31,22 @@ def __init__(self, loc, scale, validate_args=None): self.loc, self.scale = broadcast_all(loc, scale) finfo = _finfo(self.loc) if isinstance(loc, Number) and isinstance(scale, Number): - batch_shape = torch.Size() base_dist = Uniform(finfo.tiny, 1 - finfo.eps) else: - batch_shape = self.scale.size() base_dist = Uniform(self.loc.new(self.loc.size()).fill_(finfo.tiny), 1 - finfo.eps) transforms = [ExpTransform().inv, AffineTransform(loc=0, scale=-torch.ones_like(self.scale)), ExpTransform().inv, AffineTransform(loc=loc, scale=-self.scale)] super(Gumbel, self).__init__(base_dist, transforms, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Gumbel, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + transforms = self.transforms + super(Gumbel, new).__init__(base_dist, transforms, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.loc + self.scale * euler_constant diff --git a/torch/distributions/half_cauchy.py b/torch/distributions/half_cauchy.py index 77a50d3f03c49f..8b81ea8b8c862d 100644 --- a/torch/distributions/half_cauchy.py +++ b/torch/distributions/half_cauchy.py @@ -1,5 +1,6 @@ import math +import torch from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform @@ -28,9 +29,18 @@ class HalfCauchy(TransformedDistribution): has_rsample = True def __init__(self, scale, validate_args=None): - super(HalfCauchy, self).__init__(Cauchy(0, scale), AbsTransform(), + base_dist = Cauchy(0, scale) + super(HalfCauchy, self).__init__(base_dist, AbsTransform(), validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(HalfCauchy, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(HalfCauchy, new).__init__(base_dist, AbsTransform(), validate_args=False) + new._validate_args = self._validate_args + return new + @property def scale(self): return self.base_dist.scale diff --git a/torch/distributions/half_normal.py b/torch/distributions/half_normal.py index 059f3837604a63..d18cb3b971cd2c 100644 --- a/torch/distributions/half_normal.py +++ b/torch/distributions/half_normal.py @@ -1,5 +1,6 @@ import math +import torch from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform @@ -28,9 +29,18 @@ class HalfNormal(TransformedDistribution): has_rsample = True def __init__(self, scale, validate_args=None): - super(HalfNormal, self).__init__(Normal(0, scale), AbsTransform(), + base_dist = Normal(0, scale) + 
super(HalfNormal, self).__init__(base_dist, AbsTransform(), validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(HalfNormal, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(HalfNormal, new).__init__(base_dist, AbsTransform(), validate_args=False) + new._validate_args = self._validate_args + return new + @property def scale(self): return self.base_dist.scale diff --git a/torch/distributions/independent.py b/torch/distributions/independent.py index 938e478472a9b7..ed8edb43f11644 100644 --- a/torch/distributions/independent.py +++ b/torch/distributions/independent.py @@ -46,6 +46,16 @@ def __init__(self, base_distribution, reinterpreted_batch_ndims, validate_args=N self.reinterpreted_batch_ndims = reinterpreted_batch_ndims super(Independent, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Independent, _instance) + batch_shape = torch.Size(batch_shape) + new.base_dist = self.base_dist.expand(batch_shape + + self.event_shape[:self.reinterpreted_batch_ndims]) + new.reinterpreted_batch_ndims = self.reinterpreted_batch_ndims + super(Independent, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def has_rsample(self): return self.base_dist.has_rsample diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index d3c09faec43051..cc5b079fbdbdcb 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -43,6 +43,15 @@ def __init__(self, loc, scale, validate_args=None): batch_shape = self.loc.size() super(Laplace, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Laplace, _instance) + batch_shape = torch.Size(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + super(Laplace, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) u = self.loc.new(shape).uniform_(_finfo(self.loc).eps - 1, 1) diff --git a/torch/distributions/log_normal.py b/torch/distributions/log_normal.py index 9487dc9b5a9679..274bcd83c47a83 100644 --- a/torch/distributions/log_normal.py +++ b/torch/distributions/log_normal.py @@ -1,3 +1,4 @@ +import torch from torch.distributions import constraints from torch.distributions.transforms import ExpTransform from torch.distributions.normal import Normal @@ -27,7 +28,16 @@ class LogNormal(TransformedDistribution): has_rsample = True def __init__(self, loc, scale, validate_args=None): - super(LogNormal, self).__init__(Normal(loc, scale), ExpTransform(), validate_args=validate_args) + base_dist = Normal(loc, scale) + super(LogNormal, self).__init__(base_dist, ExpTransform(), validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LogNormal, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(LogNormal, new).__init__(base_dist, ExpTransform(), validate_args=False) + new._validate_args = self._validate_args + return new @property def loc(self): diff --git a/torch/distributions/logistic_normal.py b/torch/distributions/logistic_normal.py index 
39b2f3e488f183..909a30d3db8a7c 100644 --- a/torch/distributions/logistic_normal.py +++ b/torch/distributions/logistic_normal.py @@ -32,12 +32,24 @@ class LogisticNormal(TransformedDistribution): has_rsample = True def __init__(self, loc, scale, validate_args=None): - super(LogisticNormal, self).__init__( - Normal(loc, scale), StickBreakingTransform(), - validate_args=validate_args) + base_dist = Normal(loc, scale) + super(LogisticNormal, self).__init__(base_dist, + StickBreakingTransform(), + validate_args=validate_args) # Adjust event shape since StickBreakingTransform adds 1 dimension self._event_shape = torch.Size([s + 1 for s in self._event_shape]) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LogisticNormal, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape + self.base_dist.batch_shape[-1:]) + super(LogisticNormal, new).__init__(base_dist, + StickBreakingTransform(), + validate_args=False) + new._event_shape = self._event_shape + new._validate_args = self._validate_args + return new + @property def loc(self): return self.base_dist.loc diff --git a/torch/distributions/lowrank_multivariate_normal.py b/torch/distributions/lowrank_multivariate_normal.py index 4e3e25ae947fad..5e6239a69d0d37 100644 --- a/torch/distributions/lowrank_multivariate_normal.py +++ b/torch/distributions/lowrank_multivariate_normal.py @@ -116,6 +116,20 @@ def __init__(self, loc, cov_factor, cov_diag, validate_args=None): super(LowRankMultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LowRankMultivariateNormal, _instance) + batch_shape = torch.Size(batch_shape) + loc_shape = batch_shape + self.event_shape + new.loc = self.loc.expand(loc_shape) + new.cov_diag = self.cov_diag.expand(loc_shape) + new.cov_factor = self.cov_factor.expand(loc_shape + self.cov_factor.shape[-1:]) + new._capacitance_tril = self._capacitance_tril.expand(batch_shape + self._capacitance_tril.shape[-2:]) + super(LowRankMultivariateNormal, new).__init__(batch_shape, + self.event_shape, + validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.loc diff --git a/torch/distributions/multinomial.py b/torch/distributions/multinomial.py index dd1e20d373469b..d02266c0d8db50 100644 --- a/torch/distributions/multinomial.py +++ b/torch/distributions/multinomial.py @@ -58,6 +58,15 @@ def __init__(self, total_count=1, probs=None, logits=None, validate_args=None): event_shape = self._categorical.param_shape[-1:] super(Multinomial, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Multinomial, _instance) + batch_shape = torch.Size(batch_shape) + new.total_count = self.total_count + new._categorical = self._categorical.expand(batch_shape) + super(Multinomial, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index 51de7816cafaff..855c727dc9b951 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -146,6 +146,25 @@ def __init__(self, loc, covariance_matrix=None, precision_matrix=None, 
scale_tri batch_shape, event_shape = self.loc.shape[:-1], self.loc.shape[-1:] super(MultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(MultivariateNormal, _instance) + batch_shape = torch.Size(batch_shape) + loc_shape = batch_shape + self.event_shape + cov_shape = batch_shape + self.event_shape + self.event_shape + new.loc = self.loc.expand(loc_shape) + new._unbroadcasted_scale_tril = self._unbroadcasted_scale_tril.expand(cov_shape) + if 'covariance_matrix' in self.__dict__: + new.covariance_matrix = self.covariance_matrix.expand(cov_shape) + if 'scale_tril' in self.__dict__: + new.scale_tril = self.scale_tril.expand(cov_shape) + if 'precision_matrix' in self.__dict__: + new.precision_matrix = self.precision_matrix.expand(cov_shape) + super(MultivariateNormal, new).__init__(batch_shape, + self.event_shape, + validate_args=False) + new._validate_args = self._validate_args + return new + @lazy_property def scale_tril(self): return self._unbroadcasted_scale_tril.expand( diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py index de0b748f6a6fdf..01a009b9869d0a 100644 --- a/torch/distributions/negative_binomial.py +++ b/torch/distributions/negative_binomial.py @@ -38,6 +38,20 @@ def __init__(self, total_count, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(NegativeBinomial, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(NegativeBinomial, _instance) + batch_shape = torch.Size(batch_shape) + new.total_count = self.total_count.expand(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(NegativeBinomial, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 0f1375e6640f53..98a5318d7b9829 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -48,6 +48,15 @@ def __init__(self, loc, scale, validate_args=None): batch_shape = self.loc.size() super(Normal, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Normal, _instance) + batch_shape = torch.Size(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + super(Normal, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) with torch.no_grad(): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index fbfec01ab7159a..c28949b079a115 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -38,6 +38,14 @@ def __init__(self, probs=None, logits=None, validate_args=None): event_shape = self._categorical.param_shape[-1:] super(OneHotCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(OneHotCategorical, 
_instance) + batch_shape = torch.Size(batch_shape) + new._categorical = self._categorical.expand(batch_shape) + super(OneHotCategorical, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) diff --git a/torch/distributions/pareto.py b/torch/distributions/pareto.py index 82fff3c34b02cb..6fa499642065de 100644 --- a/torch/distributions/pareto.py +++ b/torch/distributions/pareto.py @@ -1,7 +1,3 @@ -from numbers import Number - -import math - import torch from torch.distributions import constraints from torch.distributions.exponential import Exponential @@ -32,6 +28,13 @@ def __init__(self, scale, alpha, validate_args=None): transforms = [ExpTransform(), AffineTransform(loc=0, scale=self.scale)] super(Pareto, self).__init__(base_dist, transforms, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Pareto, _instance) + base_dist = self.base_dist.expand(batch_shape) + super(Pareto, new).__init__(base_dist, self.transforms, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): # mean is inf for alpha <= 1 diff --git a/torch/distributions/poisson.py b/torch/distributions/poisson.py index 6be54070a55f2c..954ed6e0d32064 100644 --- a/torch/distributions/poisson.py +++ b/torch/distributions/poisson.py @@ -43,6 +43,14 @@ def __init__(self, rate, validate_args=None): batch_shape = self.rate.size() super(Poisson, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Poisson, _instance) + batch_shape = torch.Size(batch_shape) + new.rate = self.rate.expand(batch_shape) + super(Poisson, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) with torch.no_grad(): diff --git a/torch/distributions/relaxed_bernoulli.py b/torch/distributions/relaxed_bernoulli.py index 6b6c540ec48c00..643fddc2c96b68 100644 --- a/torch/distributions/relaxed_bernoulli.py +++ b/torch/distributions/relaxed_bernoulli.py @@ -46,6 +46,20 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(LogitRelaxedBernoulli, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LogitRelaxedBernoulli, _instance) + batch_shape = torch.Size(batch_shape) + new.temperature = self.temperature + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(LogitRelaxedBernoulli, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) @@ -99,8 +113,17 @@ class RelaxedBernoulli(TransformedDistribution): has_rsample = True def __init__(self, temperature, probs=None, logits=None, validate_args=None): - super(RelaxedBernoulli, self).__init__(LogitRelaxedBernoulli(temperature, probs, logits), - SigmoidTransform(), validate_args=validate_args) + base_dist = LogitRelaxedBernoulli(temperature, probs, logits) + super(RelaxedBernoulli, self).__init__(base_dist, + SigmoidTransform(), + 
validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(RelaxedBernoulli, _instance) + base_dist = self.base_dist.expand(batch_shape) + super(RelaxedBernoulli, new).__init__(base_dist, SigmoidTransform(), validate_args=False) + new._validate_args = self._validate_args + return new @property def temperature(self): diff --git a/torch/distributions/relaxed_categorical.py b/torch/distributions/relaxed_categorical.py index 0c35defaf35bd7..a43d631bec9b2b 100644 --- a/torch/distributions/relaxed_categorical.py +++ b/torch/distributions/relaxed_categorical.py @@ -41,6 +41,15 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): event_shape = self._categorical.param_shape[-1:] super(ExpRelaxedCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(ExpRelaxedCategorical, _instance) + batch_shape = torch.Size(batch_shape) + new.temperature = self.temperature + new._categorical = self._categorical.expand(batch_shape) + super(ExpRelaxedCategorical, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) @@ -100,8 +109,19 @@ class RelaxedOneHotCategorical(TransformedDistribution): has_rsample = True def __init__(self, temperature, probs=None, logits=None, validate_args=None): - super(RelaxedOneHotCategorical, self).__init__(ExpRelaxedCategorical(temperature, probs, logits), - ExpTransform(), validate_args=validate_args) + base_dist = ExpRelaxedCategorical(temperature, probs, logits) + super(RelaxedOneHotCategorical, self).__init__(base_dist, + ExpTransform(), + validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(RelaxedOneHotCategorical, _instance) + base_dist = self.base_dist.expand(batch_shape) + super(RelaxedOneHotCategorical, new).__init__(base_dist, + ExpTransform(), + validate_args=False) + new._validate_args = self._validate_args + return new @property def temperature(self): diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py index c7738afa9cd63b..cefa4684298261 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -45,6 +45,17 @@ def __init__(self, df, loc=0., scale=1., validate_args=None): batch_shape = torch.Size() if isinstance(df, Number) else self.df.size() super(StudentT, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(StudentT, _instance) + batch_shape = torch.Size(batch_shape) + new.df = self.df.expand(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + new._chi2 = self._chi2.expand(batch_shape) + super(StudentT, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): # NOTE: This does not agree with scipy implementation as much as other distributions. # (see https://github.com/fritzo/notebooks/blob/master/debug-student-t.ipynb). 
Using DoubleTensor diff --git a/torch/distributions/uniform.py b/torch/distributions/uniform.py index fa39f800e97df7..ce9cf496656346 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -1,4 +1,3 @@ -import math from numbers import Number import torch @@ -50,6 +49,15 @@ def __init__(self, low, high, validate_args=None): if self._validate_args and not torch.lt(self.low, self.high).all(): raise ValueError("Uniform is not defined when low>= high") + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Uniform, _instance) + batch_shape = torch.Size(batch_shape) + new.low = self.low.expand(batch_shape) + new.high = self.high.expand(batch_shape) + super(Uniform, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @constraints.dependent_property def support(self): return constraints.interval(self.low, self.high) diff --git a/torch/distributions/weibull.py b/torch/distributions/weibull.py index 4e196174f84e98..0636a74d113f82 100644 --- a/torch/distributions/weibull.py +++ b/torch/distributions/weibull.py @@ -1,5 +1,3 @@ -from numbers import Number -import math import torch from torch.distributions import constraints from torch.distributions.exponential import Exponential @@ -32,7 +30,21 @@ def __init__(self, scale, concentration, validate_args=None): base_dist = Exponential(self.scale.new(self.scale.size()).fill_(1.0)) transforms = [PowerTransform(exponent=self.concentration_reciprocal), AffineTransform(loc=0, scale=self.scale)] - super(Weibull, self).__init__(base_dist, transforms, validate_args=validate_args) + super(Weibull, self).__init__(base_dist, + transforms, + validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Weibull, _instance) + batch_shape = torch.Size(batch_shape) + new.scale = self.scale.expand(batch_shape) + new.concentration = self.concentration.expand(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(Weibull, new).__init__(base_dist, + self.transforms, + validate_args=False) + new._validate_args = self._validate_args + return new @property def mean(self): From 86ab92b0a9b5c2b76bf41aad00f9fb9e21d62eda Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Tue, 11 Sep 2018 07:23:23 -0700 Subject: [PATCH 030/237] Move TensorImpl / UndefinedTensor(Impl) to core (#11441) Summary: Moves TensorImpl to core. Renames UndefinedTensor to UndefinedTensorImpl and moves to core. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11441 Differential Revision: D9736620 Pulled By: gchanan fbshipit-source-id: 0322ae3b903e338de253b35a0d74a9d3e219204b --- aten/src/ATen/SparseTensorImpl.h | 2 +- aten/src/ATen/TensorImpl.h | 204 +----------------- aten/src/ATen/Utils.h | 2 +- aten/src/ATen/WrapDimUtils.h | 2 +- aten/src/ATen/WrapDimUtilsMulti.h | 2 +- aten/src/ATen/copy_wrapper.py | 2 +- aten/src/ATen/{ => core}/TensorImpl.cpp | 2 +- aten/src/ATen/core/TensorImpl.h | 204 ++++++++++++++++++ .../UndefinedTensorImpl.cpp} | 20 +- .../UndefinedTensorImpl.h} | 10 +- aten/src/ATen/function_wrapper.py | 22 +- aten/src/ATen/gen.py | 2 +- aten/src/ATen/templates/SparseTypeDerived.cpp | 2 +- aten/src/ATen/templates/Tensor.h | 12 +- aten/src/ATen/templates/TypeDefault.cpp | 4 +- aten/src/ATen/templates/TypeDerived.cpp | 4 +- aten/src/ATen/test/undefined_tensor_test.cpp | 4 +- aten/src/TH/THHalf.h | 2 +- aten/src/TH/THTensor.cpp | 2 +- aten/src/TH/generic/THTensor.cpp | 8 +- aten/src/TH/generic/THTensor.h | 2 +- aten/src/THC/generic/THCTensor.cpp | 6 +- .../src/THC/generic/THCTensorMathPointwise.cu | 4 +- .../complex_registration_extension.cpp | 4 +- torch/csrc/jit/ivalue.h | 6 +- 25 files changed, 269 insertions(+), 265 deletions(-) rename aten/src/ATen/{ => core}/TensorImpl.cpp (98%) create mode 100644 aten/src/ATen/core/TensorImpl.h rename aten/src/ATen/{UndefinedTensor.cpp => core/UndefinedTensorImpl.cpp} (55%) rename aten/src/ATen/{UndefinedTensor.h => core/UndefinedTensorImpl.h} (72%) diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 835b45c2a541ee..77ad805ac99e6f 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/Tensor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/Error.h" namespace at { diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index e0a649a49b6ccd..34ece0fc0d03fb 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -1,204 +1,2 @@ #pragma once - -#include -#include - -#include "ATen/core/Storage.h" -#include "ATen/core/optional.h" -#include "ATen/core/TensorTypeId.h" -#include "ATen/core/TensorTypeIdRegistration.h" -#include "ATen/core/LegacyTypeDispatch.h" -#include "ATen/core/Backend.h" - -struct THTensor; - -namespace at { -class Scalar; -struct Type; -struct Storage; -struct Tensor; -} // namespace at - -namespace at { -struct AT_API TensorImpl : public c10::intrusive_ptr_target { - TensorImpl() = delete; - TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); - - virtual void release_resources() override; - - Type & type() const { - // NB: It's valid to use getTypeRaw here, because the TensorImpl - // could not have been created without initializing the Type first. - // TODO: This is not actually true via the Caffe2 codepath! Make - // it so. 
- return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); - } - - TensorTypeId type_id() const { return type_id_; } - virtual IntList sizes() const; - virtual IntList strides() const; - virtual int64_t dim() const; - virtual const Storage& storage() const; - friend struct Type; - - virtual int64_t numel() const { -#ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); -#endif - return numel_; - } - - virtual bool is_contiguous() const { -#ifdef DEBUG - AT_ASSERT(compute_contiguous() == is_contiguous_); -#endif - return is_contiguous_; - } - - // this is called by the generated wrapper code when there are conditions - // when this output tensor should be zero dimensional. e.g. when all inputs - // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. - // we also prevent this from getting marked as a zero dim tensor if it is not - // the right shape afterall. - virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); - - // True if a tensor was auto-wrapped from a C++ or Python number. - // Wrapped numbers do not participate in the result type computation for - // mixed-type operations if there are any Tensors that are not wrapped - // numbers. Otherwise, they behave like their non-wrapped equivalents. - // See [Result type computation] in TensorIterator.h. - bool is_wrapped_number() const { - return is_wrapped_number_; - } - void set_wrapped_number(bool value) { - AT_ASSERT(dim() == 0); - is_wrapped_number_ = value; - } - - // ~~~~~ Autograd API ~~~~~ - // Some methods below are defined in TensorImpl.cpp because Tensor is an - // incomplete type. - - virtual void set_requires_grad(bool requires_grad) { - AT_ERROR("set_requires_grad is not implemented for Tensor"); - } - virtual bool requires_grad() const { - AT_ERROR("requires_grad is not implemented for Tensor"); - } - - virtual Tensor& grad(); - virtual const Tensor& grad() const; - - // TODO: make these protected - // Note: storage->size() may be greater than the recorded size - // of a tensor - at::Storage storage_; - - template - inline T * data() const { - return storage_.data() + storage_offset_; - } - - template - inline T * unsafe_data() const { - return storage_.unsafe_data() + storage_offset_; - } - - inline at::ScalarType scalar_type() const { - return scalar_type_; - } - - virtual int64_t storage_offset() const { - return storage_offset_; - } - - // represents that numel() == 0. 
- inline bool is_empty() const { - return numel() == 0; - } - - virtual void resize_dim(int64_t ndim) { - // NB: This is *truly* a resize; calling code (e.g., squeeze) - // assumes that old values are preserved - sizes_.resize(ndim); - strides_.resize(ndim); - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_size(int64_t dim, int64_t new_size) { - sizes_[dim] = new_size; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_stride(int64_t dim, int64_t new_stride) { - strides_[dim] = new_stride; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_storage_offset(int64_t storage_offset) { - storage_offset_ = storage_offset; - refresh_numel(); - refresh_contiguous(); - } - - // WARNING: This function does not check if the requested - // sizes/strides are in bounds for the storage that is allocated; - // this is the responsibility of the caller - void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { - AT_CHECK( - new_size.size() == new_stride.size(), - "dimensionality of sizes (", - new_size.size(), - ") must match dimensionality of strides (", - new_stride.size(), - ")"); - sizes_ = new_size.vec(); - strides_ = new_stride.vec(); - refresh_numel(); - refresh_contiguous(); - } - - virtual int64_t size(int64_t d) const; - virtual int64_t stride(int64_t d) const; - - bool is_variable() const { return is_variable_; }; - - private: - int64_t storage_offset_; - std::vector sizes_; - std::vector strides_; - - bool is_contiguous_; - int64_t numel_; - - int64_t compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - n *= s; - } - return n; - } - bool compute_contiguous() const; - - protected: - void refresh_numel() { - numel_ = compute_numel(); - } - void refresh_contiguous() { - is_contiguous_ = compute_contiguous(); - } - TensorTypeId type_id_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; - bool is_variable_ = false; - bool is_wrapped_number_ = false; - - private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); -}; -} // namespace at +#include diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index fff88e39aba053..cea2f83d3e9180 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -2,7 +2,7 @@ #include "ATen/core/ATenGeneral.h" #include "ATen/StorageImpl.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "ATen/Formatting.h" diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 8e9db589c5267a..467a5664f6e05b 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/core/WrapDimMinimal.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index f3d3a81a365c26..4d3df92fe0bc57 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -1,6 +1,6 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/WrapDimUtils.h" #include #include diff --git a/aten/src/ATen/copy_wrapper.py b/aten/src/ATen/copy_wrapper.py index e4651f8c846070..a746f5543901ae 100644 --- a/aten/src/ATen/copy_wrapper.py +++ b/aten/src/ATen/copy_wrapper.py @@ -238,7 +238,7 @@ def create(all_types, backend): top_env['copy_includes'].append( '#include "ATen/{}.h"'.format(the_type['Type'])) 
top_env['copy_includes'].append( - '#include "ATen/TensorImpl.h"') + '#include "ATen/core/TensorImpl.h"') # Code generation for the_type in all_types: diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp similarity index 98% rename from aten/src/ATen/TensorImpl.cpp rename to aten/src/ATen/core/TensorImpl.cpp index f4ecaf0b6253fa..d5e5781b038ca5 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h new file mode 100644 index 00000000000000..e0a649a49b6ccd --- /dev/null +++ b/aten/src/ATen/core/TensorImpl.h @@ -0,0 +1,204 @@ +#pragma once + +#include +#include + +#include "ATen/core/Storage.h" +#include "ATen/core/optional.h" +#include "ATen/core/TensorTypeId.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/LegacyTypeDispatch.h" +#include "ATen/core/Backend.h" + +struct THTensor; + +namespace at { +class Scalar; +struct Type; +struct Storage; +struct Tensor; +} // namespace at + +namespace at { +struct AT_API TensorImpl : public c10::intrusive_ptr_target { + TensorImpl() = delete; + TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + + virtual void release_resources() override; + + Type & type() const { + // NB: It's valid to use getTypeRaw here, because the TensorImpl + // could not have been created without initializing the Type first. + // TODO: This is not actually true via the Caffe2 codepath! Make + // it so. + return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); + } + + TensorTypeId type_id() const { return type_id_; } + virtual IntList sizes() const; + virtual IntList strides() const; + virtual int64_t dim() const; + virtual const Storage& storage() const; + friend struct Type; + + virtual int64_t numel() const { +#ifdef DEBUG + AT_ASSERT(compute_numel() == numel_); +#endif + return numel_; + } + + virtual bool is_contiguous() const { +#ifdef DEBUG + AT_ASSERT(compute_contiguous() == is_contiguous_); +#endif + return is_contiguous_; + } + + // this is called by the generated wrapper code when there are conditions + // when this output tensor should be zero dimensional. e.g. when all inputs + // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. + // we also prevent this from getting marked as a zero dim tensor if it is not + // the right shape afterall. + virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); + + // True if a tensor was auto-wrapped from a C++ or Python number. + // Wrapped numbers do not participate in the result type computation for + // mixed-type operations if there are any Tensors that are not wrapped + // numbers. Otherwise, they behave like their non-wrapped equivalents. + // See [Result type computation] in TensorIterator.h. + bool is_wrapped_number() const { + return is_wrapped_number_; + } + void set_wrapped_number(bool value) { + AT_ASSERT(dim() == 0); + is_wrapped_number_ = value; + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. 
+ + virtual void set_requires_grad(bool requires_grad) { + AT_ERROR("set_requires_grad is not implemented for Tensor"); + } + virtual bool requires_grad() const { + AT_ERROR("requires_grad is not implemented for Tensor"); + } + + virtual Tensor& grad(); + virtual const Tensor& grad() const; + + // TODO: make these protected + // Note: storage->size() may be greater than the recorded size + // of a tensor + at::Storage storage_; + + template + inline T * data() const { + return storage_.data() + storage_offset_; + } + + template + inline T * unsafe_data() const { + return storage_.unsafe_data() + storage_offset_; + } + + inline at::ScalarType scalar_type() const { + return scalar_type_; + } + + virtual int64_t storage_offset() const { + return storage_offset_; + } + + // represents that numel() == 0. + inline bool is_empty() const { + return numel() == 0; + } + + virtual void resize_dim(int64_t ndim) { + // NB: This is *truly* a resize; calling code (e.g., squeeze) + // assumes that old values are preserved + sizes_.resize(ndim); + strides_.resize(ndim); + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_size(int64_t dim, int64_t new_size) { + sizes_[dim] = new_size; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_stride(int64_t dim, int64_t new_stride) { + strides_[dim] = new_stride; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_storage_offset(int64_t storage_offset) { + storage_offset_ = storage_offset; + refresh_numel(); + refresh_contiguous(); + } + + // WARNING: This function does not check if the requested + // sizes/strides are in bounds for the storage that is allocated; + // this is the responsibility of the caller + void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { + AT_CHECK( + new_size.size() == new_stride.size(), + "dimensionality of sizes (", + new_size.size(), + ") must match dimensionality of strides (", + new_stride.size(), + ")"); + sizes_ = new_size.vec(); + strides_ = new_stride.vec(); + refresh_numel(); + refresh_contiguous(); + } + + virtual int64_t size(int64_t d) const; + virtual int64_t stride(int64_t d) const; + + bool is_variable() const { return is_variable_; }; + + private: + int64_t storage_offset_; + std::vector sizes_; + std::vector strides_; + + bool is_contiguous_; + int64_t numel_; + + int64_t compute_numel() const { + int64_t n = 1; + for (auto s : sizes()) { + n *= s; + } + return n; + } + bool compute_contiguous() const; + + protected: + void refresh_numel() { + numel_ = compute_numel(); + } + void refresh_contiguous() { + is_contiguous_ = compute_contiguous(); + } + TensorTypeId type_id_; + // INVARIANT: When storage is non-null, this scalar type must + // agree with the scalar type in storage + ScalarType scalar_type_; + bool is_variable_ = false; + bool is_wrapped_number_ = false; + + private: + TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); +}; +} // namespace at diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/core/UndefinedTensorImpl.cpp similarity index 55% rename from aten/src/ATen/UndefinedTensor.cpp rename to aten/src/ATen/core/UndefinedTensorImpl.cpp index 956c70b9f178af..d270e379a3415e 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/core/UndefinedTensorImpl.cpp @@ -1,40 +1,40 @@ -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/core/Error.h" namespace at { // should this use the globalContext? Can it get a context passed in somehow? 
-UndefinedTensor::UndefinedTensor() +UndefinedTensorImpl::UndefinedTensorImpl() : TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { } -IntList UndefinedTensor::sizes() const { +IntList UndefinedTensorImpl::sizes() const { AT_ERROR("sizes() called on undefined Tensor"); } -int64_t UndefinedTensor::size(int64_t d) const { +int64_t UndefinedTensorImpl::size(int64_t d) const { AT_ERROR("size(dim) called on an undefined Tensor"); } -int64_t UndefinedTensor::stride(int64_t d) const { +int64_t UndefinedTensorImpl::stride(int64_t d) const { AT_ERROR("stride(dim) called on an undefined Tensor"); } -int64_t UndefinedTensor::dim() const { +int64_t UndefinedTensorImpl::dim() const { AT_ERROR("dim() called on undefined Tensor"); } -const Storage& UndefinedTensor::storage() const { +const Storage& UndefinedTensorImpl::storage() const { AT_ERROR("storage() called on undefined Tensor"); } -int64_t UndefinedTensor::storage_offset() const { +int64_t UndefinedTensorImpl::storage_offset() const { AT_ERROR("storage_offset() called on an undefined Tensor"); } -IntList UndefinedTensor::strides() const { +IntList UndefinedTensorImpl::strides() const { AT_ERROR("strides() called on undefined Tensor"); } -UndefinedTensor UndefinedTensor::_singleton; +UndefinedTensorImpl UndefinedTensorImpl::_singleton; } diff --git a/aten/src/ATen/UndefinedTensor.h b/aten/src/ATen/core/UndefinedTensorImpl.h similarity index 72% rename from aten/src/ATen/UndefinedTensor.h rename to aten/src/ATen/core/UndefinedTensorImpl.h index 86faf028802c1e..6c734950d90cad 100644 --- a/aten/src/ATen/UndefinedTensor.h +++ b/aten/src/ATen/core/UndefinedTensorImpl.h @@ -1,13 +1,13 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { -struct AT_API UndefinedTensor final : public TensorImpl { +struct AT_API UndefinedTensorImpl final : public TensorImpl { public: // Without this, we get: - // error: identifier "at::UndefinedTensor::_singleton" is undefined in device code + // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in device code // (ostensibly because the constexpr tricks MSVC into trying to compile this // function for device as well). #ifdef _WIN32 @@ -25,8 +25,8 @@ struct AT_API UndefinedTensor final : public TensorImpl { const Storage& storage() const override; int64_t storage_offset() const override; private: - UndefinedTensor(); - static UndefinedTensor _singleton; + UndefinedTensorImpl(); + static UndefinedTensorImpl _singleton; public: friend struct UndefinedType; }; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 85b99469cdee15..b830aa3e5fe65e 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -187,7 +187,7 @@ def TypedDict(name, attrs, total=True): # type: ignore }""") BUFFER_DEFINITION = CodeTemplate("""\ -auto ${name}_ = c10::make_intrusive( +auto ${name}_ = c10::make_intrusive( ${Backend}TensorId(), ScalarType::${ScalarName}, ${THTensor}_new(), false).release(); auto ${name} = Tensor(${name}_, false);""") @@ -330,17 +330,17 @@ def __init__(self, reason): CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') ALLOC_NOARGS_WRAP = { - 'THTensor*': 'c10::make_intrusive' + 'THTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THBoolTensor*': 'c10::make_intrusive' + 'THBoolTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Byte, allocator(), false).release()', - 'THIndexTensor*': 'c10::make_intrusive' + 'THIndexTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()', - 'THIntegerTensor*': 'c10::make_intrusive' + 'THIntegerTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Int, allocator(), false).release()', - 'THDenseTensor*': 'c10::make_intrusive' + 'THDenseTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THDenseIndexTensor*': 'c10::make_intrusive' + 'THDenseIndexTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()' } @@ -1295,11 +1295,12 @@ def allocate_arg(env, arg, output_count): tensor_arg = '{}_'.format(name) if arg.get('mask', False): allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) - tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensor::singleton() : (TensorImpl*){}_' + tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*){}_' .format(name, name)) + intrusive_ptr_type = 'c10::intrusive_ptr' return [ 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor(c10::intrusive_ptr::reclaim({}));'.format(name, tensor_arg), + 'auto {} = Tensor({}::reclaim({}));'.format(name, intrusive_ptr_type, tensor_arg), ] def resize_arg(arg): @@ -1509,7 +1510,8 @@ def emit_body(env, option): env, arguments=[call]) return_tensor = ( "return Tensor(" + - "c10::intrusive_ptr::reclaim((${wrapped_tensor})${maybe_scalar}));") + "c10::intrusive_ptr::reclaim(" + + "(${wrapped_tensor})${maybe_scalar}));") body.append(CodeTemplate(return_tensor).substitute( env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) # return the same underlying Tensor type for both real and accreal; this ensures diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 3f962961f55812..025faffe5d2fe8 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -236,7 +236,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['DenseBackend'] = backend env['storage_tensor_headers'] = [] if density != 'Sparse': - env['storage_tensor_headers'] = ['#include "ATen/TensorImpl.h"'] + env['storage_tensor_headers'] = ['#include "ATen/core/TensorImpl.h"'] # used for generating switch logic for external functions tag = density_tag + backend + scalar_name diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 064da2bc2186e0..4bc8265a258f70 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -11,7 +11,7 @@ #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index e54c6e4b62fc40..b80e354b60b347 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -9,9 +9,9 @@ #include "ATen/core/SparseTensorRef.h" #include 
"ATen/core/Storage.h" #include "ATen/core/TensorAccessor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/optional.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/core/Error.h" namespace at { @@ -41,7 +41,7 @@ namespace at { // special care must be taken to handle this. struct AT_API Tensor { Tensor(){}; - Tensor(c10::intrusive_ptr tensor_impl) + Tensor(c10::intrusive_ptr tensor_impl) : tensor_impl_(std::move(tensor_impl)) { if (tensor_impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); @@ -61,7 +61,7 @@ struct AT_API Tensor { TensorImpl * unsafeReleaseTensorImpl() { return tensor_impl_.release(); } - const c10::intrusive_ptr& getIntrusivePtr() const { + const c10::intrusive_ptr& getIntrusivePtr() const { return tensor_impl_; } @@ -259,7 +259,7 @@ struct AT_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { @@ -287,6 +287,6 @@ struct AT_API WeakTensor { } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 4e9c454977d588..530aa09fddb734 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -110,8 +110,8 @@ Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) co return Storage(scalarType(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { - auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); - if (retain && tensor_impl.get() != UndefinedTensor::singleton()) { + auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); + if (retain && tensor_impl.get() != UndefinedTensorImpl::singleton()) { c10::raw::intrusive_ptr::incref(tensor_impl.get()); } return Tensor(std::move(tensor_impl)); diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 5b26400ea52f12..bc2e12acb14f29 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -10,11 +10,11 @@ $th_headers $storage_tensor_headers #include "ATen/${Generator}.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index 1b605e7271c6c7..e47772a7e8a80b 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -2,7 +2,7 @@ #include "catch.hpp" #include "ATen/ATen.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "test_seed.h" @@ -48,5 +48,5 @@ TEST_CASE( "undefined tensor test", "[]" ) { Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); REQUIRE(!to_move.defined()); - REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensor::singleton()); + REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); } diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index fb68639ec44752..68a884f22bd0ad 100644 --- 
a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -4,7 +4,7 @@ #include #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 0c731779b95685..5f861457c58ef4 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -8,7 +8,7 @@ #include -// NB: This is NOT valid on UndefinedTensor +// NB: This is NOT valid on UndefinedTensorImpl void THTensor_free(THTensor *self) { if (!self) return; diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 3f373ee2119c26..5eac1e319347d1 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -54,13 +54,13 @@ scalar_t *THTensor_(data)(const THTensor *self) { /* Empty init */ THTensor *THTensor_(new)(void) { - return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); THTensor_(setStorageNd)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), @@ -75,7 +75,7 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -547,7 +547,7 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) } } -// NB: It is INVALID to call this on an UndefinedTensor +// NB: It is INVALID to call this on an UndefinedTensorImpl void THTensor_(retain)(THTensor *self) { c10::raw::intrusive_ptr::incref(self); diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h index d5316919d4e31e..27807ea57a7dd0 100644 --- a/aten/src/TH/generic/THTensor.h +++ b/aten/src/TH/generic/THTensor.h @@ -5,7 +5,7 @@ /* a la lua? dim, storageoffset, ... et les methodes ? 
*/ #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index db2b44511c2329..483a1fbec593e7 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -63,13 +63,13 @@ scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self) /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); THCTensor_(setStorageNd)(state, self, THTensor_getStoragePtr(tensor), @@ -85,7 +85,7 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 2d69e60a40fed3..f5e1443b0eb5e7 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -229,9 +229,9 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, #endif namespace { -c10::intrusive_ptr retainTensorImpl(THCTensor* self) { +c10::intrusive_ptr retainTensorImpl(THCTensor* self) { c10::raw::intrusive_ptr::incref(self); - return c10::intrusive_ptr::reclaim(self); + return c10::intrusive_ptr::reclaim(self); } } diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index 4f7cd29cc5a180..acfaccced2be09 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -10,7 +10,7 @@ #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" #include "ATen/TensorImpl.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" @@ -53,7 +53,7 @@ struct CPUComplexFloatType : public at::CPUTypeDefault { numel, getCPUAllocator(), /* resizable */ true)}; - Tensor t{c10::make_intrusive( + Tensor t{c10::make_intrusive( std::move(s), at::CPUTensorId(), /* is_variable */ false)}; diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 3602e7deb8eb89..d32a24b54917a8 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -118,14 +118,14 @@ struct TORCH_API IValue final { bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); + at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); clearToNone(); return t; } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != 
at::UndefinedTensor::singleton())); - auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); + JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != at::UndefinedTensorImpl::singleton())); + auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); if (is_intrusive_ptr) { c10::raw::intrusive_ptr::incref(tensor_impl.get()); } From de460c7ad312a1f30d6329406b9b2fb53c530a0a Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Tue, 11 Sep 2018 08:39:20 -0700 Subject: [PATCH 031/237] Improvements on conv/pool/fold/stft/ParamDict docs (#11106) Summary: Also fixes some incorrect formula rendering. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11106 Differential Revision: D9752433 Pulled By: SsnL fbshipit-source-id: 535fc8498638e8b645757fc7535d8771992b7d21 --- aten/src/ATen/native/native_functions.yaml | 2 +- docs/source/conf.py | 2 +- torch/_tensor_docs.py | 9 +- torch/_torch_docs.py | 191 ++++++++++++--------- torch/functional.py | 18 +- torch/nn/functional.py | 95 +++++----- torch/nn/modules/container.py | 8 +- torch/nn/modules/conv.py | 75 ++++---- torch/nn/modules/fold.py | 28 ++- torch/nn/modules/instancenorm.py | 27 +++ torch/nn/modules/linear.py | 20 ++- torch/nn/modules/module.py | 2 +- torch/nn/modules/pooling.py | 157 ++++++++--------- torch/nn/parallel/data_parallel.py | 7 +- torch/nn/utils/rnn.py | 9 +- 15 files changed, 360 insertions(+), 290 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2b71982be8cf6a..44fea84527acc2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -374,7 +374,7 @@ - func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor -- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) -> Tensor +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad=0) -> Tensor - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor) diff --git a/docs/source/conf.py b/docs/source/conf.py index cbcdf707f2fdb4..ca578c40e3df5a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -229,7 +229,7 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { 'python': ('https://docs.python.org/', None), - 'numpy': ('http://docs.scipy.org/doc/numpy/', None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), } # -- A patch that prevents Sphinx from cross-referencing ivar tags ------- diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index f7ace5e0edcac2..c3039583b04fb7 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1412,14 +1412,7 @@ def callable(a, b) -> number r""" narrow(dimension, start, length) -> Tensor -Returns a new tensor that is a narrowed version of :attr:`self` tensor. The -dimension :attr:`dim` is narrowed from :attr:`start` to :attr:`start + length`. The -returned tensor and :attr:`self` tensor share the same underlying storage. - -Args: - dimension (int): the dimension along which to narrow - start (int): the starting dimension - length (int): the distance to the ending dimension +See :func:`torch.narrow` Example:: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index ea6016f778f711..f66a4e1c17d5bc 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -117,7 +117,7 @@ def parse_kwargs(desc): and returns a new resulting tensor. .. 
math:: - out = input + value + \text{out} = \text{input} + \text{value} If :attr:`input` is of type FloatTensor or DoubleTensor, :attr:`value` must be a real number, otherwise it should be an integer. @@ -147,7 +147,7 @@ def parse_kwargs(desc): :ref:`broadcastable `. .. math:: - out = input + value \times other + \text{out} = \text{input} + \text{value} \times \text{other} If :attr:`other` is of type FloatTensor or DoubleTensor, :attr:`value` must be a real number, otherwise it should be an integer. @@ -197,7 +197,7 @@ def parse_kwargs(desc): and :attr:`out` will be a :math:`(n \times p)` tensor. .. math:: - out = \beta\ mat + \alpha\ (\sum_{i=0}^{b} batch1_i \mathbin{@} batch2_i) + out = \beta\ \text{mat} + \alpha\ (\sum_{i=0}^{b} \text{batch1}_i \mathbin{@} \text{batch2}_i) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers. @@ -229,7 +229,7 @@ def parse_kwargs(desc): multiply the result by the scalar :attr:`value` and add it to :attr:`tensor`. .. math:: - out_i = tensor_i + value \times \frac{tensor1_i}{tensor2_i} + \text{out}_i = \text{tensor}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be :ref:`broadcastable `. @@ -239,7 +239,7 @@ def parse_kwargs(desc): Args: tensor (Tensor): the tensor to be added - value (Number, optional): multiplier for :math:`tensor1 ./ tensor2` + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` tensor1 (Tensor): the numerator tensor tensor2 (Tensor): the denominator tensor out (Tensor, optional): the output tensor @@ -264,7 +264,7 @@ def parse_kwargs(desc): and add it to :attr:`tensor`. .. math:: - out_i = tensor_i + value \times tensor1_i \times tensor2_i + \text{out}_i = \text{tensor}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be :ref:`broadcastable `. @@ -306,7 +306,7 @@ def parse_kwargs(desc): :attr:`mat1` and :attr`mat2` and the added matrix :attr:`mat` respectively. .. math:: - out = \beta\ mat + \alpha\ (mat1_i \mathbin{@} mat2_i) + \text{out} = \beta\ \text{mat} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers. @@ -346,7 +346,7 @@ def parse_kwargs(desc): :attr:`mat` and :attr:`vec` and the added tensor :attr:`tensor` respectively. .. math:: - out = \beta\ tensor + \alpha\ (mat \mathbin{@} vec) + \text{out} = \beta\ \text{tensor} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers @@ -380,7 +380,7 @@ def parse_kwargs(desc): :attr:`mat` respectively. .. 
math:: - out = \beta\ mat + \alpha\ (vec1 \otimes vec2) + \text{out} = \beta\ \text{mat} + \alpha\ (\text{vec1} \otimes \text{vec2}) If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector of size `m`, then :attr:`mat` must be @@ -394,7 +394,7 @@ def parse_kwargs(desc): Args: beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) mat (Tensor): matrix to be added - alpha (Number, optional): multiplier for :math:`vec1 \otimes vec2` (:math:`\alpha`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) vec1 (Tensor): the first vector of the outer product vec2 (Tensor): the second vector of the outer product out (Tensor, optional): the output tensor @@ -427,7 +427,7 @@ def parse_kwargs(desc): other (Tensor): second tensor to compare atol (float, optional): absolute tolerance. Default: 1e-08 rtol (float, optional): relative tolerance. Default: 1e-05 - equal_nan (float, optional): if ``True``, then two ``NaN``s will be compared as ``True``. Default: ``False`` + equal_nan (float, optional): if ``True``, then two ``NaN`` s will be compared as equal. Default: ``False`` Example:: @@ -560,7 +560,7 @@ def parse_kwargs(desc): same as the scaling factors used in :meth:`torch.addbmm`. .. math:: - out_i = \beta\ mat_i + \alpha\ (batch1_i \mathbin{@} batch2_i) + \text{out}_i = \beta\ \text{mat}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers. @@ -568,7 +568,7 @@ def parse_kwargs(desc): Args: beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) mat (Tensor): the tensor to be added - alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) batch1 (Tensor): the first batch of matrices to be multiplied batch2 (Tensor): the second batch of matrices to be multiplied out (Tensor, optional): the output tensor @@ -609,7 +609,7 @@ def parse_kwargs(desc): Example:: - >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] >>> a tensor([[ 0.1737, 0.0950, 0.3609], [ 0.7148, 0.0289, 0.2676], @@ -642,8 +642,8 @@ def parse_kwargs(desc): tensor of size 0. If :attr:`minlength` is specified, the number of bins is at least :attr:`minlength` and if :attr:`input` is empty, then the result is tensor of size :attr:`minlength` filled with zeros. If ``n`` is the value at position ``i``, -:math:`out[n] += weights[i]` if :attr:`weights` is specified else -:math:`out[n] += 1`. +``out[n] += weights[i]`` if :attr:`weights` is specified else +``out[n] += 1``. Arguments: input (Tensor): 1-d int tensor @@ -651,9 +651,9 @@ def parse_kwargs(desc): Should be of same size as input tensor. minlength (int): optional, minimum number of bins. Should be non-negative. -Shape: - output (Tensor): ``Size([max(input) + 1])`` if :attr:`input` is non-empty, else - ``Size(0)`` +Returns: + output (Tensor): a tensor of shape ``Size([max(input) + 1])`` if + :attr:`input` is non-empty, else ``Size(0)`` Example:: @@ -685,7 +685,7 @@ def parse_kwargs(desc): :math:`(b \times n \times p)` tensor. .. math:: - out_i = batch1_i \mathbin{@} batch2_i + \text{out}_i = \text{batch1}_i \mathbin{@} \text{batch2}_i .. note:: This function does not :ref:`broadcast `. 
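For instance (shapes only; inputs are random, and the commented-out call is assumed to raise a size-mismatch error):

    >>> batch1 = torch.randn(10, 3, 4)
    >>> batch2 = torch.randn(10, 4, 5)
    >>> torch.bmm(batch1, batch2).size()
    torch.Size([10, 3, 5])
    >>> # torch.bmm(batch1, torch.randn(1, 4, 5))  # batch sizes must match exactly; bmm does not broadcast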
For broadcasting matrix products, see :func:`torch.matmul`. @@ -1259,7 +1259,7 @@ def parse_kwargs(desc): and returns a new resulting tensor. .. math:: - out_i = \frac{input_i}{value} + \text{out}_i = \frac{\text{input}_i}{\text{value}} If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, :attr:`value` should be a real number, otherwise it should be an integer @@ -1285,7 +1285,7 @@ def parse_kwargs(desc): :ref:`broadcastable `. .. math:: - out_i = \frac{input_i}{other_i} + \text{out}_i = \frac{\text{input}_i}{\text{other}_i} Args: input (Tensor): the numerator tensor @@ -1341,16 +1341,17 @@ def parse_kwargs(desc): (Tensor, Tensor): A tuple containing - **e** (*Tensor*): Shape :math:`(n \times 2)`. Each row is an eigenvalue of ``a``, - where the first element is the real part and the second element is the imaginary part. - The eigenvalues are not necessarily ordered. + where the first element is the real part and the second element is the imaginary part. + The eigenvalues are not necessarily ordered. - **v** (*Tensor*): If ``eigenvectors=False``, it's an empty tensor. - Otherwise, this tensor of shape :math:`(n \times n)` can be used to compute normalized (unit length) - eigenvectors of corresponding eigenvalues ``e`` as follows. - If the corresponding e[j] is a real number, column v[:, j] is the eigenvector corresponding to - eigenvalue e[j]. - If the corresponding e[j] and e[j + 1] eigenvalues form a complex conjugate pair, then the true eigenvectors - can be computed as - :math:`eigenvector[j] = v[:, j] + i * v[:, j + 1], eigenvector[j + 1] = v[:, j] - i * v[:, j + 1]`. + Otherwise, this tensor of shape :math:`(n \times n)` can be used to compute normalized (unit length) + eigenvectors of corresponding eigenvalues ``e`` as follows. + If the corresponding e[j] is a real number, column v[:, j] is the eigenvector corresponding to + eigenvalue e[j]. + If the corresponding e[j] and e[j + 1] eigenvalues form a complex conjugate pair, then the true eigenvectors + can be computed as + :math:`\text{eigenvector}[j] = v[:, j] + i \times v[:, j + 1]`, + :math:`\text{eigenvector}[j + 1] = v[:, j] - i \times v[:, j + 1]`. """) add_docstr(torch.eq, @@ -1645,7 +1646,7 @@ def parse_kwargs(desc): If :attr:`input` is an n-dimensional tensor with size :math:`(x_0, x_1..., x_{i-1}, x_i, x_{i+1}, ..., x_{n-1})` -and :attr:`dim` :math:`= i`, then :attr:`index` must be an :math:`n`-dimensional tensor with +and ``dim = i``, then :attr:`index` must be an :math:`n`-dimensional tensor with size :math:`(x_0, x_1, ..., x_{i-1}, y, x_{i+1}, ..., x_{n-1})` where :math:`y \geq 1` and :attr:`out` will have the same size as :attr:`index`. @@ -1667,7 +1668,7 @@ def parse_kwargs(desc): r""" ge(input, other, out=None) -> Tensor -Computes :math:`input \geq other` element-wise. +Computes :math:`\text{input} \geq \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -1708,7 +1709,7 @@ def parse_kwargs(desc): .. math:: \begin{array}{ll} - \min_X & \|X\|_2 & \mbox{subject to} & AX = B. + \min_X & \|X\|_2 & \text{subject to} & AX = B. \end{array} Returned tensor :math:`X` has shape :math:`(\max(m, n) \times k)`. The first :math:`n` @@ -1821,7 +1822,7 @@ def parse_kwargs(desc): .. note:: - The `out` keyword only supports 2D matrix inputs, that is, + The :attr:`out` keyword only supports 2D matrix inputs, that is, `B, A` must be 2D matrices. .. 
note:: @@ -1832,10 +1833,10 @@ def parse_kwargs(desc): `A.contiguous().transpose(-1, -2).strides()` respectively. Args: - B (Tensor): input matrix of size :math:`(*, m, k)` , where `*` - is zero or more batch dimensions. + B (Tensor): input matrix of size :math:`(*, m, k)` , where :math:`*` + is zero or more batch dimensions. A (Tensor): input square matrix of size :math:`(*, m, m)`, where - `*` is zero or more batch dimensions. + :math:`*` is zero or more batch dimensions. out ((Tensor, Tensor), optional): optional output tuple. Example:: @@ -1893,7 +1894,7 @@ def parse_kwargs(desc): r""" gt(input, other, out=None) -> Tensor -Computes :math:`input > other` element-wise. +Computes :math:`\text{input} > \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -2054,7 +2055,7 @@ def parse_kwargs(desc): r""" le(input, other, out=None) -> Tensor -Computes :math:`input \leq other` element-wise. +Computes :math:`\text{input} \leq \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -2082,7 +2083,7 @@ def parse_kwargs(desc): on a scalar :attr:`weight` and returns the resulting :attr:`out` tensor. .. math:: - out_i = start_i + weight \times (end_i - start_i) + \text{out}_i = \text{start}_i + \text{weight} \times (\text{end}_i - \text{start}_i) The shapes of :attr:`start` and :attr:`end` must be :ref:`broadcastable `. @@ -2299,7 +2300,7 @@ def parse_kwargs(desc): r""" lt(input, other, out=None) -> Tensor -Computes :math:`input < other` element-wise. +Computes :math:`\text{input} < \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -2471,7 +2472,7 @@ def parse_kwargs(desc): but they must be :ref:`broadcastable `. .. math:: - out_i = \max(tensor_i, other_i) + \text{out}_i = \max(\text{tensor}_i, \text{other}_i) .. note:: When the shapes do not match, the shape of the returned output tensor follows the :ref:`broadcasting rules `. @@ -2647,7 +2648,7 @@ def parse_kwargs(desc): but they must be :ref:`broadcastable `. .. math:: - out_i = \min(tensor_i, other_i) + \text{out}_i = \min(\text{tensor}_i, \text{other}_i) .. note:: When the shapes do not match, the shape of the returned output tensor follows the :ref:`broadcasting rules `. @@ -2803,7 +2804,7 @@ def parse_kwargs(desc): :attr:`value` and returns a new resulting tensor. .. math:: - out_i = value \times input_i + \text{out}_i = \text{value} \times \text{input}_i If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, :attr:`value` should be a real number, otherwise it should be an integer @@ -2830,7 +2831,7 @@ def parse_kwargs(desc): :ref:`broadcastable `. .. math:: - out_i = input_i \times other_i + \text{out}_i = \text{input}_i \times \text{other}_i Args: input (Tensor): the first multiplicand tensor @@ -2874,7 +2875,7 @@ def parse_kwargs(desc): If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`. If :attr:`input` is a matrix with `m` rows, :attr:`out` is an matrix of shape -:math:`(m \times num\_samples)`. +:math:`(m \times \text{num\_samples})`. If replacement is ``True``, samples are drawn with replacement. @@ -2953,6 +2954,32 @@ def parse_kwargs(desc): [1.0311, 0.3901, 0.5049]]) """) +add_docstr(torch.narrow, + r""" +narrow(input, dimension, start, length) -> Tensor + +Returns a new tensor that is a narrowed version of :attr:`input` tensor. 
The +dimension :attr:`dim` is input from :attr:`start` to :attr:`start + length`. The +returned tensor and :attr:`self` tensor share the same underlying storage. + +Args: + input (Tensor): the tensor to narrow + dimension (int): the dimension along which to narrow + start (int): the starting dimension + length (int): the distance to the ending dimension + +Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) +""") + add_docstr(torch.ne, r""" ne(input, other, out=None) -> Tensor @@ -2984,7 +3011,7 @@ def parse_kwargs(desc): Returns a new tensor with the negative of the elements of :attr:`input`. .. math:: - out = -1 \times input + \text{out} = -1 \times \text{input} Args: input (Tensor): the input tensor @@ -3313,13 +3340,13 @@ def parse_kwargs(desc): Cholesky factor :attr:`u`: returns matrix `inv` If :attr:`upper` is ``True`` or not provided, :attr:`u` is upper -triangular such that: +triangular such that the returned tensor is .. math:: inv = (u^T u)^{-1} If :attr:`upper` is ``False``, :attr:`u` is lower triangular -such that: +such that the tensor value is .. math:: inv = (uu^{T})^{-1} @@ -3412,12 +3439,12 @@ def parse_kwargs(desc): When :attr:`exponent` is a scalar value, the operation applied is: .. math:: - out_i = x_i ^ {exponent} + \text{out}_i = x_i ^ \text{exponent} When :attr:`exponent` is a tensor, the operation applied is: .. math:: - out_i = x_i ^ {exponent_i} + \text{out}_i = x_i ^ {\text{exponent}_i} When :attr:`exponent` is a tensor, the shapes of :attr:`input` and :attr:`exponent` must be :ref:`broadcastable `. @@ -3623,7 +3650,7 @@ def parse_kwargs(desc): >>> torch.rand(2, 3) tensor([[ 0.8237, 0.5781, 0.6879], [ 0.3816, 0.7249, 0.0998]]) -""") +""".format(**factory_common_args)) add_docstr(torch.rand_like, r""" @@ -3824,13 +3851,13 @@ def parse_kwargs(desc): r""" range(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor -Returns a 1-D tensor of size :math:`\left\lfloor \frac{{end - start}}{{step}} \right\rfloor + 1` +Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is the gap between two values in the tensor. .. math:: - \text{{out}}_{{i+1}} = \text{{out}}_i + step. - + \text{out}_{i+1} = \text{out}_i + \text{step}. +""" + r""" .. warning:: This function is deprecated in favor of :func:`torch.arange`. @@ -3856,7 +3883,7 @@ def parse_kwargs(desc): r""" arange(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor -Returns a 1-D tensor of size :math:`\left\lfloor \frac{{end - start}}{{step}} \right\rfloor` +Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor` with values from the interval ``[start, end)`` taken with common difference :attr:`step` beginning from `start`. @@ -3865,8 +3892,8 @@ def parse_kwargs(desc): in such cases. .. math:: - \text{{out}}_{{i+1}} = \text{{out}}_{{i}} + \text{{step}} - + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} +""" + r""" Args: start (Number): the starting value for the set of points. Default: ``0``. 
end (Number): the ending value for the set of points @@ -4293,8 +4320,8 @@ def parse_kwargs(desc): When :attr:`dim` is given, a squeeze operation is done only in the given dimension. If `input` is of shape: :math:`(A \times 1 \times B)`, -`squeeze(input, 0)` leaves the tensor unchanged, but :func:`squeeze(input, 1)` will -squeeze the tensor to the shape :math:`(A \times B)`. +``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` +will squeeze the tensor to the shape :math:`(A \times B)`. .. note:: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. @@ -4504,7 +4531,7 @@ def parse_kwargs(desc): :math:`m` dimensional vector. This function calculates all eigenvalues (and vectors) of :attr:`input` -such that :math:`input = V diag(e) V^T`. +such that :math:`\text{input} = V \text{diag}(e) V^T`. The boolean argument :attr:`eigenvectors` defines computation of eigenvectors or eigenvalues only. @@ -4530,10 +4557,10 @@ def parse_kwargs(desc): (Tensor, Tensor): A tuple containing - **e** (*Tensor*): Shape :math:`(m)`. Each element is an eigenvalue of ``input``, - The eigenvalues are in ascending order. + The eigenvalues are in ascending order. - **V** (*Tensor*): Shape :math:`(m \times m)`. - If ``eigenvectors=False``, it's a tensor filled with zeros. - Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. + If ``eigenvectors=False``, it's a tensor filled with zeros. + Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. Examples:: @@ -4561,7 +4588,7 @@ def parse_kwargs(desc): Expects :attr:`input` to be a matrix (2-D tensor) and transposes dimensions 0 and 1. -Can be seen as a short-hand function for :meth:`transpose(input, 0, 1)` +Can be seen as a short-hand function for ``transpose(input, 0, 1)``. Args: input (Tensor): the input tensor @@ -4907,28 +4934,28 @@ def parse_kwargs(desc): r""" trtrs(b, A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor) -Solves a system of equations with a triangular coefficient matrix `A` -and multiple right-hand sides `b`. +Solves a system of equations with a triangular coefficient matrix :math:`A` +and multiple right-hand sides :attr:`b`. -In particular, solves :math:`AX = b` and assumes `A` is upper-triangular +In particular, solves :math:`AX = b` and assumes :math:`A` is upper-triangular with the default keyword arguments. Args: A (Tensor): the input triangular coefficient matrix - b (Tensor): multiple right-hand sides. Each column of `b` is a + b (Tensor): multiple right-hand sides. Each column of :math:`b` is a right-hand side for the system of equations. upper (bool, optional): whether to solve the upper-triangular system of equations (default) or the lower-triangular system of equations. Default: True. - transpose (bool, optional): whether `A` should be transposed before + transpose (bool, optional): whether :math:`A` should be transposed before being sent into the solver. Default: False. - unitriangular (bool, optional): whether `A` is unit triangular. - If True, the diagonal elements of `A` are assumed to be - 1 and not referenced from `A`. Default: False. + unitriangular (bool, optional): whether :math:`A` is unit triangular. + If True, the diagonal elements of :math:`A` are assumed to be + 1 and not referenced from :math:`A`. Default: False. 
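As a minimal sketch (random but deliberately well-conditioned data), solving an upper-triangular system and checking the residual:

    >>> A = torch.randn(3, 3).triu()
    >>> A += 5 * torch.eye(3)          # keep the triangular system well-conditioned
    >>> b = torch.randn(3, 2)          # two right-hand sides
    >>> X, M = torch.trtrs(b, A)       # solves A @ X = b with A upper-triangular
    >>> torch.allclose(A.mm(X), b)
    True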
Returns: - A tuple (X, M) where `M` is a clone of `A` and `X` is the solution to - `AX = b` (or whatever variant of the system of equations, depending on - the keyword arguments.) + A tuple :math:`(X, M)` where :math:`M` is a clone of :math:`A` and :math:`X` + is the solution to :math:`AX = b` (or whatever variant of the system of + equations, depending on the keyword arguments.) Shape: - A: :math:`(N, N)` @@ -5968,17 +5995,17 @@ def parse_kwargs(desc): meshgrid(seq) -> seq Take a sequence of :math:`N` tensors, each of which can be either scalar or 1-dimensional -vector, and create :math:`N` N-dimensional grids, where the :math:`i`th grid is defined by -expanding the :math:`i`th input over dimensions defined by other inputs. +vector, and create :math:`N` N-dimensional grids, where the :math:`i` th grid is defined by +expanding the :math:`i` th input over dimensions defined by other inputs. Arguments: seq (sequence of Tensors): sequence of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically. Returns: - seq (sequence of Tensors): If the input has :math:`k` tensors of size - :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also has :math:`k` tensors, - where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. + seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, + then the output would also has :math:`k` tensors, where all tensors are + of size :math:`(N_1, N_2, \ldots , N_k)`. Example:: diff --git a/torch/functional.py b/torch/functional.py index d84597f3fff437..e31903db5a7129 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -50,7 +50,7 @@ def split(tensor, split_size_or_sections, dim=0): If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if - the tensor size along the given dimension :attr:`dim= is not divisible by + the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split @@ -287,8 +287,8 @@ def stft(input, n_fft, hop_length=None, win_length=None, window=None, .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length}}% - window[k]\ input[m \times hop_length + k]\ % - e^{- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}}, + \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % + \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When @@ -345,7 +345,7 @@ def stft(input, n_fft, hop_length=None, win_length=None, window=None, win_length (int): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. - Default: ``None`` (treated as window of all :math:`1`s) + Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` @@ -452,7 +452,7 @@ def unique(input, sorted=False, return_inverse=False, dim=None): def argmax(input, dim=None, keepdim=False): - """Returns the indices of the maximum values of a tensor across a dimension. + r"""Returns the indices of the maximum values of a tensor across a dimension. 
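For example, with a fixed input:

    >>> a = torch.tensor([[1., 5., 2.],
    ...                   [7., 0., 3.]])
    >>> torch.argmax(a, dim=1).tolist()   # per-row index of the maximum
    [1, 0]
    >>> torch.argmax(a).item()            # with no dim, index into the flattened tensor
    3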
This is the second value returned by :meth:`torch.max`. See its documentation for the exact semantics of this method. @@ -483,7 +483,7 @@ def argmax(input, dim=None, keepdim=False): def argmin(input, dim=None, keepdim=False): - """Returns the indices of the minimum values of a tensor across a dimension. + r"""Returns the indices of the minimum values of a tensor across a dimension. This is the second value returned by :meth:`torch.min`. See its documentation for the exact semantics of this method. @@ -514,7 +514,7 @@ def argmin(input, dim=None, keepdim=False): def tensordot(a, b, dims=2): - """Returns a contraction of a and b over multiple dimensions. + r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalizes the matrix product. @@ -531,7 +531,7 @@ def tensordot(a, b, dims=2): .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} - = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} * b_{k_0,...,k_{d-1}, i_d,...,i_n}. + = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes @@ -569,7 +569,7 @@ def tensordot(a, b, dims=2): def argsort(input, dim=None, descending=False): - """Returns the indices that sort a tensor along a given dimension in ascending + r"""Returns the indices that sort a tensor along a given dimension in ascending order by value. This is the second value returned by :meth:`torch.sort`. See its documentation diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 47b7375bd1f456..d9889d631f7fb8 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -68,16 +68,16 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv1d` for details and output shape. Args: - input: input tensor of shape :math:`minibatch \times in\_channels \times iW` - weight: filters of shape :math:`out\_channels \times \frac{in\_channels}{groups} \times kW` - bias: optional bias of shape (:math:`out\_channels`). Default: ``None`` + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` + weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: ``None`` stride: the stride of the convolving kernel. Can be a single number or a one-element tuple `(sW,)`. Default: 1 padding: implicit zero paddings on both sides of the input. Can be a single number or a one-element tuple `(padW,)`. Default: 0 dilation: the spacing between kernel elements. Can be a single number or a one-element tuple `(dW,)`. Default: 1 - groups: split input into groups, :math:`in\_channels` should be divisible by + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 Examples:: @@ -96,16 +96,16 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv2d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iH \times iW`) - weight: filters of shape (:math:`out\_channels \times \frac{in\_channels}{groups} \times kH \times kW`) - bias: optional bias tensor of shape (:math:`out\_channels`). 
Default: ``None`` + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` + weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kH \times kW)` + bias: optional bias tensor of shape :math:`(\text{out\_channels})`. Default: ``None`` stride: the stride of the convolving kernel. Can be a single number or a tuple `(sH, sW)`. Default: 1 padding: implicit zero paddings on both sides of the input. Can be a single number or a tuple `(padH, padW)`. Default: 0 dilation: the spacing between kernel elements. Can be a single number or a tuple `(dH, dW)`. Default: 1 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 Examples:: @@ -114,7 +114,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> filters = torch.randn(8,4,3,3) >>> inputs = torch.randn(1,4,5,5) >>> F.conv2d(inputs, filters, padding=1) -""") +""") # noqa: E501 conv3d = _add_docstr(torch.conv3d, r""" conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor @@ -125,16 +125,16 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv3d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iT \times iH \times iW`) - weight: filters of shape (:math:`out\_channels \times \frac{in\_channels}{groups} \times kT \times kH \times kW`) - bias: optional bias tensor of shape (:math:`out\_channels`). Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` + weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kT \times kH \times kW)` + bias: optional bias tensor of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple `(sT, sH, sW)`. Default: 1 padding: implicit zero paddings on both sides of the input. Can be a single number or a tuple `(padT, padH, padW)`. Default: 0 dilation: the spacing between kernel elements. Can be a single number or a tuple `(dT, dH, dW)`. Default: 1 - groups: split input into groups, :math:`in\_channels` should be divisible by + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 Examples:: @@ -142,7 +142,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> filters = torch.randn(33, 16, 3, 3, 3) >>> inputs = torch.randn(20, 16, 50, 10, 20) >>> F.conv3d(inputs, filters) -""") +""") # noqa: E501 conv_transpose1d = _add_docstr(torch.conv_transpose1d, r""" conv_transpose1d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) -> Tensor @@ -153,9 +153,9 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose1d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iW`) - weight: filters of shape (:math:`in\_channels \times \frac{out\_channels}{groups} \times kW`) - bias: optional bias of shape (:math:`out\_channels`). 
Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` + weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple ``(sW,)``. Default: 1 padding: ``kernel_size - 1 - padding`` zero-padding will be added to both @@ -163,7 +163,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): ``(padW,)``. Default: 0 output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple ``(out_padW)``. Default: 0 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 dilation: the spacing between kernel elements. Can be a single number or a tuple ``(dW,)``. Default: 1 @@ -184,9 +184,9 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose2d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iH \times iW`) - weight: filters of shape (:math:`in\_channels \times \frac{out\_channels}{groups} \times kH \times kW`) - bias: optional bias of shape (:math:`out\_channels`). Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` + weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kH \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple ``(sH, sW)``. Default: 1 padding: ``kernel_size - 1 - padding`` zero-padding will be added to both @@ -195,7 +195,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple ``(out_padH, out_padW)``. Default: 0 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 dilation: the spacing between kernel elements. Can be a single number or a tuple ``(dH, dW)``. Default: 1 @@ -206,7 +206,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> inputs = torch.randn(1, 4, 5, 5) >>> weights = torch.randn(4, 8, 3, 3) >>> F.conv_transpose2d(inputs, weights, padding=1) -""") +""") # noqa: E501 conv_transpose3d = _add_docstr(torch.conv_transpose3d, r""" conv_transpose3d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) -> Tensor @@ -217,9 +217,9 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose3d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iT \times iH \times iW`) - weight: filters of shape (:math:`in\_channels \times \frac{out\_channels}{groups} \times kT \times kH \times kW`) - bias: optional bias of shape (:math:`out\_channels`). 
Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` + weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kT \times kH \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple ``(sT, sH, sW)``. Default: 1 padding: ``kernel_size - 1 - padding`` zero-padding will be added to both @@ -228,7 +228,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple ``(out_padT, out_padH, out_padW)``. Default: 0 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 dilation: the spacing between kernel elements. Can be a single number or a tuple `(dT, dH, dW)`. Default: 1 @@ -238,20 +238,18 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> inputs = torch.randn(20, 16, 50, 10, 20) >>> weights = torch.randn(16, 33, 3, 3, 3) >>> F.conv_transpose3d(inputs, weights) -""") - +""") # noqa: E501 -def conv_tbc(input, weight, bias, pad=0): - r"""Applies a 1-dimensional sequence convolution over an input sequence. - Input and output dimensions are (Time, Batch, Channels) - hence TBC. +conv_tbc = _add_docstr(torch.conv_tbc, r""" +Applies a 1-dimensional sequence convolution over an input sequence. +Input and output dimensions are (Time, Batch, Channels) - hence TBC. - Args: - input: input tensor of shape (:math:`\text{sequence length} \times batch \times in\_channels`) - weight: filter of shape (:math:`\text{kernel width} \times in\_channels \times out\_channels`) - bias: bias of shape (:math:`out\_channels`) - pad: number of timesteps to pad - """ - return torch.conv_tbc(input, weight, bias, pad) +Args: + input: input tensor of shape :math:`(\text{sequence length} \times batch \times \text{in\_channels})` + weight: filter of shape (:math:`\text{kernel width} \times \text{in\_channels} \times \text{out\_channels}`) + bias: bias of shape (:math:`\text{out\_channels}`) + pad: number of timesteps to pad. Default: 0 +""") # Pooling @@ -264,9 +262,9 @@ def conv_tbc(input, weight, bias, pad=0): See :class:`~torch.nn.AvgPool1d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iW`) + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` kernel_size: the size of the window. Can be a single number or a - tuple `(kW,)` + tuple :math:`(kW,)` stride: the stride of the window. Can be a single number or a tuple `(sW,)`. Default: :attr:`kernel_size` padding: implicit zero paddings on both sides of the input. Can be a @@ -276,11 +274,12 @@ def conv_tbc(input, weight, bias, pad=0): count_include_pad: when True, will include the zero-padding in the averaging calculation. Default: ``True`` -Example:: +Examples:: >>> # pool of square window of size=3, stride=2 >>> input = torch.tensor([[[1,2,3,4,5,6,7]]]) >>> F.avg_pool1d(input, kernel_size=3, stride=2) tensor([[[ 2., 4., 6.]]]) + """) @@ -294,9 +293,9 @@ def conv_tbc(input, weight, bias, pad=0): See :class:`~torch.nn.AvgPool2d` for details and output shape. 
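(Illustrative note, not part of the diff: the average-pooling behaviour documented here can be sanity-checked with the functional API; the output spatial size follows the floor formula given in the AvgPool2d module docs, floor((H_in + 2*padding - kernel_size)/stride) + 1.)

```
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
# H_out = floor((H_in + 2 * padding - kernel_size) / stride) + 1
print(F.avg_pool2d(x, kernel_size=3, stride=2).shape)             # torch.Size([1, 3, 3, 3])
print(F.avg_pool2d(x, kernel_size=3, stride=2, padding=1).shape)  # torch.Size([1, 3, 4, 4])
```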
Args: - input: input tensor (:math:`minibatch \times in\_channels \times iH \times iW`) + input: input tensor :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` kernel_size: size of the pooling region. Can be a single number or a - tuple (:math:`kH \times kW`) + tuple :math:`(kH \times kW)` stride: stride of the pooling operation. Can be a single number or a tuple `(sH, sW)`. Default: :attr:`kernel_size` padding: implicit zero paddings on both sides of the input. Can be a @@ -317,9 +316,9 @@ def conv_tbc(input, weight, bias, pad=0): See :class:`~torch.nn.AvgPool3d` for details and output shape. Args: - input: input tensor (:math:`minibatch \times in\_channels \times iT \times iH \times iW`) + input: input tensor :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` kernel_size: size of the pooling region. Can be a single number or a - tuple (:math:`kT \times kH \times kW`) + tuple :math:`(kT \times kH \times kW)` stride: stride of the pooling operation. Can be a single number or a tuple `(sT, sH, sW)`. Default: :attr:`kernel_size` padding: implicit zero paddings on both sides of the input. Can be a @@ -345,13 +344,13 @@ def fractional_max_pool2d(input, kernel_size, output_size=None, Args: kernel_size: the size of the window to take a max over. Can be a single number :math:`k` (for a square kernel of :math:`k \times k`) - or a tuple (:math:`kH \times kW`) + or a tuple :math:`(kH \times kW)` output_size: the target output size of the image of the form :math:`oH \times oW`. Can be a tuple `(oH, oW)` or a single number :math:`oH` for a square image :math:`oH \times oH` output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. This has to be a number or tuple in the range (0, 1) return_indices: if ``True``, will return the indices along with the outputs. - Useful to pass to `max_unpool2d`. + Useful to pass to :func:`~torch.nn.functional.max_unpool2d`. Examples:: >>> input = torch.randn(20, 16, 50, 32) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 454151afed8201..ef79b1ffb6a289 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -403,8 +403,8 @@ class ParameterDict(Module): Arguments: parameters (iterable, optional): a mapping (dictionary) of - (string : :class:`~torch.nn.Parameter``) or an iterable of key,value pairs - of type (string, :class:`~torch.nn.Parameter``) + (string : :class:`~torch.nn.Parameter`) or an iterable of key,value pairs + of type (string, :class:`~torch.nn.Parameter`) Example:: @@ -480,8 +480,8 @@ def update(self, parameters): Arguments: parameters (iterable): a mapping (dictionary) of - (string : :class:`~torch.nn.Parameter``) or an iterable of - key/value pairs of type (string, :class:`~torch.nn.Parameter``) + (string : :class:`~torch.nn.Parameter`) or an iterable of + key/value pairs of type (string, :class:`~torch.nn.Parameter`) """ if not isinstance(parameters, Iterable): raise TypeError("ParametersDict.update should be called with an " diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 03cf297e3c0aa6..7cbf5fbf604314 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -101,24 +101,24 @@ class Conv1d(_ConvNd): * At groups= :attr:`in_channels`, each input channel is convolved with its own set of filters, of size - :math:`\left\lfloor\frac{\text{out\_channels}}{\text{in\_channels}}\right\rfloor` + :math:`\left\lfloor\frac{C_\text{out}}{C_\text{in}}\right\rfloor` .. 
note:: - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid - `cross-correlation`_, and not a full `cross-correlation`_. - It is up to the user to add proper padding. + Depending of the size of your kernel, several (of the last) + columns of the input might be lost, because it is a valid + `cross-correlation`_, and not a full `cross-correlation`_. + It is up to the user to add proper padding. .. note:: - The configuration when `groups == in_channels` and `out_channels == K * in_channels` - where `K` is a positive integer is termed in literature as depthwise convolution. + When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also termed in + literature as depthwise convolution. - In other words, for an input of size :math:`(N, C_{in}, L_{in})`, if you want a - depthwise convolution with a depthwise multiplier `K`, - then you use the constructor arguments - :math:`(\text{in\_channels}=C_{in}, \text{out\_channels}=C_{in} * K, ..., \text{groups}=C_{in})` + In other words, for an input of size :math:`(N, C_{in}, L_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments + :math:`(C_\text{in}=C_{in}, C_\text{out}=C_{in} \times K, ..., \text{groups}=C_{in})`. Args: in_channels (int): Number of channels in the input image @@ -145,11 +145,11 @@ class Conv1d(_ConvNd): weight (Tensor): the learnable weights of the module of shape (out_channels, in_channels, kernel_size). The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` Examples:: @@ -218,7 +218,7 @@ class Conv2d(_ConvNd): concatenated. * At groups= :attr:`in_channels`, each input channel is convolved with its own set of filters, of size: - :math:`\left\lfloor\frac{\text{out\_channels}}{\text{in\_channels}}\right\rfloor`. + :math:`\left\lfloor\frac{C_\text{out}}{C_\text{in}}\right\rfloor`. The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: @@ -235,13 +235,13 @@ class Conv2d(_ConvNd): .. note:: - The configuration when `groups == in_channels` and `out_channels == K * in_channels` - where `K` is a positive integer is termed in literature as depthwise convolution. + When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also termed in + literature as depthwise convolution. - In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`, if you want a - depthwise convolution with a depthwise multiplier `K`, - then you use the constructor arguments - :math:`(in\_channels=C_{in}, out\_channels=C_{in} * K, ..., groups=C_{in})` + In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments + :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. 
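(Illustrative sketch, not part of the diff: the depthwise-convolution construction described in the note above, written out with the constructor arguments it names.)

```
import torch
from torch import nn

# groups == in_channels and out_channels == K * in_channels gives a depthwise convolution
C_in, K = 4, 3
conv = nn.Conv2d(in_channels=C_in, out_channels=C_in * K, kernel_size=3,
                 padding=1, groups=C_in)
x = torch.randn(1, C_in, 8, 8)
print(conv(x).shape)      # torch.Size([1, 12, 8, 8])
print(conv.weight.shape)  # torch.Size([12, 1, 3, 3]); each filter sees a single input channel
```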
Args: in_channels (int): Number of channels in the input image @@ -261,6 +261,7 @@ class Conv2d(_ConvNd): H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor @@ -269,11 +270,11 @@ class Conv2d(_ConvNd): (out_channels, in_channels, kernel_size[0], kernel_size[1]). The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` Examples:: @@ -357,13 +358,13 @@ class Conv3d(_ConvNd): .. note:: - The configuration when `groups == in_channels` and `out_channels == K * in_channels` - where `K` is a positive integer is termed in literature as depthwise convolution. + When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also termed in + literature as depthwise convolution. - In other words, for an input of size :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`, if you want a - depthwise convolution with a depthwise multiplier `K`, - then you use the constructor arguments - :math:`(in\_channels=C_{in}, out\_channels=C_{in} * K, ..., groups=C_{in})` + In other words, for an input of size :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments + :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. Args: in_channels (int): Number of channels in the input image @@ -383,9 +384,11 @@ class Conv3d(_ConvNd): D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + .. math:: H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor + .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor @@ -394,11 +397,11 @@ class Conv3d(_ConvNd): (out_channels, in_channels, kernel_size[0], kernel_size[1], kernel_size[2]) The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels). 
If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` Examples:: @@ -550,11 +553,11 @@ class ConvTranspose1d(_ConvTransposeMixin, _ConvNd): (in_channels, out_channels, kernel_size[0], kernel_size[1]). The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, @@ -663,11 +666,11 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): (in_channels, out_channels, kernel_size[0], kernel_size[1]) The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels) If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` Examples:: @@ -806,11 +809,11 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): (in_channels, out_channels, kernel_size[0], kernel_size[1], kernel_size[2]) The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels) If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` Examples:: diff --git a/torch/nn/modules/fold.py b/torch/nn/modules/fold.py index 4465b9b10f0e7e..d0035820927e32 100644 --- a/torch/nn/modules/fold.py +++ b/torch/nn/modules/fold.py @@ -15,11 +15,12 @@ class Fold(Module): :math:`L` is the total number of blocks. (This is exacly the same specification as the output shape of :class:`~torch.nn.Unfold`.) This operation combines these local blocks into the large :attr:`output` tensor - of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`. - Similar to :class:`~torch.nn.Unfold`, the arguments must satisfy + of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)` + by summing the overlapping values. Similar to :class:`~torch.nn.Unfold`, the + arguments must satisfy .. 
math:: - L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] \ + L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] % - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor, where :math:`d` is over all spatial dimensions. @@ -42,7 +43,8 @@ class Fold(Module): It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. Args: - output_size (int or tuple): the shape of the spatial dimensions [2:] of the output + output_size (int or tuple): the shape of the spatial dimensions of the + output (i.e., ``input.sizes()[2:]``) kernel_size (int or tuple): the size of the sliding blocks stride (int or tuple): the stride of the sliding blocks in the input spatial dimensions. Default: 1 @@ -59,6 +61,13 @@ class Fold(Module): * For the case of two output spatial dimensions this operation is sometimes called ``col2im``. + .. note:: + :class:`~torch.nn.Fold` calculates each combined value in the resulting + large tensor by summing all values from all containing blocks. + :class:`~torch.nn.Unfold` extracts the values in the local blocks by + copying from the large tensor. So, if the blocks overlap, they are not + inverses of each other. + .. warning:: Currently, only 4-D output tensors (batched image-like tensors) are supported. @@ -113,10 +122,10 @@ class Unfold(Module): the total number of such blocks: .. math:: - L = \prod_d \left\lfloor\frac{\text{input\_spatial\_size}[d] + 2 \times \text{padding}[d] \ + L = \prod_d \left\lfloor\frac{\text{spatial\_size}[d] + 2 \times \text{padding}[d] % - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor, - where :math:`\text{input\_spatial\_size}` is formed by the spatial dimensions + where :math:`\text{spatial\_size}` is formed by the spatial dimensions of :attr:`input` (:math:`*` above), and :math:`d` is over all spatial dimensions. @@ -152,6 +161,13 @@ class Unfold(Module): * For the case of two input spatial dimensions this operation is sometimes called ``im2col``. + .. note:: + :class:`~torch.nn.Fold` calculates each combined value in the resulting + large tensor by summing all values from all containing blocks. + :class:`~torch.nn.Unfold` extracts the values in the local blocks by + copying from the large tensor. So, if the blocks overlap, they are not + inverses of each other. + .. warning:: Currently, only 4-D input tensors (batched image-like tensors) are supported. diff --git a/torch/nn/modules/instancenorm.py b/torch/nn/modules/instancenorm.py index f79bbc61dd4e71..24932a678ec568 100644 --- a/torch/nn/modules/instancenorm.py +++ b/torch/nn/modules/instancenorm.py @@ -78,6 +78,15 @@ class InstanceNorm1d(_InstanceNorm): where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value. + .. note:: + :class:`InstanceNorm1d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm1d` is applied + on each channel of channeled data like multidimensional time series, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm1d` usually don't apply affine + transform. 
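(Illustrative sketch, not part of the diff: the InstanceNorm-vs-LayerNorm note above in code; shapes and defaults follow the public torch.nn API.)

```
import torch
from torch import nn

x = torch.randn(2, 3, 5)        # (N, C, L) channeled time-series data

inorm = nn.InstanceNorm1d(3)    # normalizes each (sample, channel) pair over L; affine=False by default
lnorm = nn.LayerNorm([3, 5])    # normalizes each sample over (C, L); elementwise affine by default

print(inorm(x).shape, lnorm(x).shape)            # torch.Size([2, 3, 5]) torch.Size([2, 3, 5])
print(inorm.weight is None, lnorm.weight.shape)  # True torch.Size([3, 5])
```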
+ Args: num_features: :math:`C` from an expected input of size :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)` @@ -143,6 +152,15 @@ class InstanceNorm2d(_InstanceNorm): where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value. + .. note:: + :class:`InstanceNorm2d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm2d` is applied + on each channel of channeled data like RGB images, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm2d` usually don't apply affine + transform. + Args: num_features: :math:`C` from an expected input of size :math:`(N, C, H, W)` @@ -208,6 +226,15 @@ class InstanceNorm3d(_InstanceNorm): where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value. + .. note:: + :class:`InstanceNorm3d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm3d` is applied + on each channel of channeled data like 3D models with RGB color, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm3d` usually don't apply affine + transform. + Args: num_features: :math:`C` from an expected input of size :math:`(N, C, D, H, W)` diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py index 573c6b20cbf035..3cd4661e5a6de2 100644 --- a/torch/nn/modules/linear.py +++ b/torch/nn/modules/linear.py @@ -17,17 +17,17 @@ class Linear(Module): Default: ``True`` Shape: - - Input: :math:`(N, *, in\_features)` where :math:`*` means any number of + - Input: :math:`(N, *, \text{in\_features})` where :math:`*` means any number of additional dimensions - - Output: :math:`(N, *, out\_features)` where all but the last dimension + - Output: :math:`(N, *, \text{out\_features})` where all but the last dimension are the same shape as the input. Attributes: weight: the learnable weights of the module of shape - `(out_features x in_features)`. The values are initialized from - :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k = \frac{1}{\text{in\_features}}` - bias: the learnable bias of the module of shape :math:`(out_features)`. + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. If :attr:`bias` is ``True``, the values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where :math:`k = \frac{1}{\text{in\_features}}` @@ -38,6 +38,7 @@ class Linear(Module): >>> input = torch.randn(128, 20) >>> output = m(input) >>> print(output.size()) + torch.Size([128, 30]) """ def __init__(self, in_features, out_features, bias=True): @@ -87,12 +88,12 @@ class Bilinear(Module): Attributes: weight: the learnable weights of the module of shape - `(out_features x in1_features x in2_features)`. The values are initialized from - :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`(\text{out\_features} x \text{in1\_features} x \text{in2\_features})`. 
+ The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k = \frac{1}{\text{in1\_features}}` - bias: the learnable bias of the module of shape `(out_features)` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})` If :attr:`bias` is ``True``, the values are initialized from - :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k = \frac{1}{\text{in1\_features}}` Examples:: @@ -102,6 +103,7 @@ class Bilinear(Module): >>> input2 = torch.randn(128, 30) >>> output = m(input1, input2) >>> print(output.size()) + torch.Size([128, 40]) """ def __init__(self, in1_features, in2_features, out_features, bias=True): diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index a85c220e66f521..06be26b8cdc146 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -42,7 +42,7 @@ def forward(self, x): return F.relu(self.conv2(x)) Submodules assigned in this way will be registered, and will have their - parameters converted too when you call `.cuda()`, etc. + parameters converted too when you call :meth:`to`, etc. """ dump_patches = False diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index 7ca95d21230cad..9243f14791b659 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -30,8 +30,8 @@ class MaxPool1d(_MaxPoolNd): and output :math:`(N, C, L_{out})` can be precisely described as: .. math:: - out(N_i, C_j, k) = \max_{m=0, \ldots, kernel\_size-1} - input(N_i, C_j, stride * k + m) + out(N_i, C_j, k) = \max_{m=0, \ldots, \text{kernel\_size} - 1} + input(N_i, C_j, stride \times k + m) If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. @@ -43,16 +43,16 @@ class MaxPool1d(_MaxPoolNd): padding: implicit zero padding to be added on both sides dilation: a parameter that controls the stride of elements in the window return_indices: if ``True``, will return the max indices along with the outputs. - Useful when Unpooling later + Useful for :class:`torch.nn.MaxUnpool1d` later ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape Shape: - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where + - Output: :math:`(N, C, L_{out})`, where .. math:: - L_{out} = \left\lfloor \frac{L_{in} + 2 * \text{padding} - \text{dilation} - * (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor + L_{out} = \left\lfloor \frac{L_{in} + 2 \times \text{padding} - \text{dilation} + \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor Examples:: @@ -84,9 +84,11 @@ class MaxPool2d(_MaxPoolNd): can be precisely described as: .. math:: - - out(N_i, C_j, h, w) = \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} - \text{input}(N_i, C_j, \text{stride[0]} * h + m, \text{stride[1]} * w + n) + \begin{aligned} + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) + \end{aligned} If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. 
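(Illustrative check, not part of the diff: the max-pooling output-size formula reformatted in this file, evaluated for a concrete input.)

```
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 10, 10)
out = F.max_pool2d(x, kernel_size=3, stride=2, padding=1, dilation=1)
# H_out = floor((10 + 2*1 - 1*(3 - 1) - 1) / 2) + 1 = 5
print(out.shape)  # torch.Size([1, 3, 5, 5])
```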
@@ -104,20 +106,20 @@ class MaxPool2d(_MaxPoolNd): padding: implicit zero padding to be added on both sides dilation: a parameter that controls the stride of elements in the window return_indices: if ``True``, will return the max indices along with the outputs. - Useful when Unpooling later + Useful for :class:`torch.nn.MaxUnpool2d` later ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding[0]} - \text{dilation[0]} - * (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor + \times (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding[1]} - \text{dilation[1]} - * (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor + \times (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor Examples:: @@ -147,14 +149,11 @@ class MaxPool3d(_MaxPoolNd): can be precisely described as: .. math:: - out(N_i, C_j, d, h, w) = - \begin{gathered} - \max_{k=0, \ldots, kD-1} - \max_{m=0, \ldots, kH-1} - \max_{n=0, \ldots, kW-1} \\ - \text{input}(N_i, C_j, \text{stride[0]} * - k + d, \text{stride[1]} * h + m, \text{stride[2]} * w + n) - \end{gathered} + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, kD-1} \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times k + d, + \text{stride[1]} \times h + m, \text{stride[2]} \times w + n) + \end{aligned} If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. @@ -172,23 +171,23 @@ class MaxPool3d(_MaxPoolNd): padding: implicit zero padding to be added on all three sides dilation: a parameter that controls the stride of elements in the window return_indices: if ``True``, will return the max indices along with the outputs. - Useful when Unpooling later + Useful for :class:`torch.nn.MaxUnpool3d` later ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape Shape: - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where .. math:: - D_{out} = \left\lfloor\frac{D_{in} + 2 * \text{padding}[0] - \text{dilation}[0] * + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[1] - \text{dilation}[1] * + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor .. math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[2] - \text{dilation}[2] * + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor Examples:: @@ -202,7 +201,7 @@ class MaxPool3d(_MaxPoolNd): .. 
_link: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ + """ # noqa: E501 def forward(self, input): return F.max_pool3d(input, self.kernel_size, self.stride, @@ -227,29 +226,29 @@ class MaxUnpool1d(_MaxUnpoolNd): including the indices of the maximal values and computes a partial inverse in which all non-maximal values are set to zero. - .. note:: `MaxPool1d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. + .. note:: :class:`MaxPool1d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. + as an additional argument :attr:`output_size` in the forward call. See the Inputs and Example below. Args: kernel_size (int or tuple): Size of the max pooling window. stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. + It is set to :attr:`kernel_size` by default. padding (int or tuple): Padding that was added to the input Inputs: - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool1d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size + - `indices`: the indices given out by :class:`~torch.nn.MaxPool1d` + - `output_size` (optional): the targeted output size Shape: - Input: :math:`(N, C, H_{in})` - - Output: :math:`(N, C, H_{out})` where + - Output: :math:`(N, C, H_{out})`, where .. math:: - H_{out} = (H_{in} - 1) * \text{stride}[0] - 2 * \text{padding}[0] + \text{kernel\_size}[0] + H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] or as given by :attr:`output_size` in the call operator @@ -292,32 +291,32 @@ class MaxUnpool2d(_MaxUnpoolNd): including the indices of the maximal values and computes a partial inverse in which all non-maximal values are set to zero. - .. note:: `MaxPool2d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. + .. note:: :class:`MaxPool2d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. + as an additional argument :attr:`output_size` in the forward call. See the Inputs and Example below. Args: kernel_size (int or tuple): Size of the max pooling window. stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. + It is set to :attr:`kernel_size` by default. padding (int or tuple): Padding that was added to the input Inputs: - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool2d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size + - `indices`: the indices given out by :class:`~torch.nn.MaxPool2d` + - `output_size` (optional): the targeted output size Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: - H_{out} = (H_{in} - 1) * \text{stride[0]} - 2 * \text{padding[0]} + \text{kernel\_size[0]} + H_{out} = (H_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} .. 
math:: - W_{out} = (W_{in} - 1) * \text{stride[1]} - 2 * \text{padding[1]} + \text{kernel\_size[1]} + W_{out} = (W_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} or as given by :attr:`output_size` in the call operator @@ -364,35 +363,35 @@ class MaxUnpool3d(_MaxUnpoolNd): including the indices of the maximal values and computes a partial inverse in which all non-maximal values are set to zero. - .. note:: `MaxPool3d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. + .. note:: :class:`MaxPool3d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. + as an additional argument :attr:`output_size` in the forward call. See the Inputs section below. Args: kernel_size (int or tuple): Size of the max pooling window. stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. + It is set to :attr:`kernel_size` by default. padding (int or tuple): Padding that was added to the input Inputs: - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool3d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size + - `indices`: the indices given out by :class:`~torch.nn.MaxPool3d` + - `output_size` (optional): the targeted output size Shape: - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where .. math:: - D_{out} = (D_{in} - 1) * \text{stride[0]} - 2 * \text{padding[0]} + \text{kernel\_size[0]} + D_{out} = (D_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} .. math:: - H_{out} = (H_{in} - 1) * \text{stride[1]} - 2 * \text{padding[1]} + \text{kernel\_size[1]} + H_{out} = (H_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} .. math:: - W_{out} = (W_{in} - 1) * \text{stride[2]} - 2 * \text{padding[2]} + \text{kernel\_size[2]} + W_{out} = (W_{in} - 1) \times \text{stride[2]} - 2 \times \text{padding[2]} + \text{kernel\_size[2]} or as given by :attr:`output_size` in the call operator @@ -436,8 +435,8 @@ class AvgPool1d(_AvgPoolNd): .. math:: - \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k} - \text{input}(N_i, C_j, \text{stride} * l + m) + \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k} + \text{input}(N_i, C_j, \text{stride} \times l + m) If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. @@ -454,11 +453,11 @@ class AvgPool1d(_AvgPoolNd): Shape: - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where + - Output: :math:`(N, C, L_{out})`, where .. math:: L_{out} = \left\lfloor \frac{L_{in} + - 2 * \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + 2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor Examples:: @@ -494,7 +493,7 @@ class AvgPool2d(_AvgPoolNd): .. math:: out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} - input(N_i, C_j, stride[0] * h + m, stride[1] * w + n) + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. 
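(Illustrative round trip, not part of the diff: the MaxUnpool output-size formulas reformatted above invert the corresponding MaxPool sizes when the pooling indices are passed along.)

```
import torch
from torch import nn

pool = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)
unpool = nn.MaxUnpool2d(kernel_size=2, stride=2)

x = torch.arange(16.).reshape(1, 1, 4, 4)
out, indices = pool(x)      # out: torch.Size([1, 1, 2, 2])
y = unpool(out, indices)    # H_out = (2 - 1) * 2 - 2 * 0 + 2 = 4
print(y.shape)              # torch.Size([1, 1, 4, 4])
```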
@@ -514,14 +513,14 @@ class AvgPool2d(_AvgPoolNd): Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[0] - + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor .. math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[1] - + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor Examples:: @@ -557,11 +556,12 @@ class AvgPool3d(_AvgPoolNd): can be precisely described as: .. math:: - - \text{out}(N_i, C_j, d, h, w) = \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} - \frac{\text{input}(N_i, C_j, \text{stride}[0] * d + k, \text{stride}[1] * h + m, - \text{stride}[2] * w + n)} - {kD * kH * kW} + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} \\ + & \frac{\text{input}(N_i, C_j, \text{stride}[0] \times d + k, + \text{stride}[1] \times h + m, \text{stride}[2] \times w + n)} + {kD \times kH \times kW} + \end{aligned} If :attr:`padding` is non-zero, then the input is implicitly zero-padded on all three sides for :attr:`padding` number of points. @@ -581,18 +581,18 @@ class AvgPool3d(_AvgPoolNd): Shape: - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where .. math:: - D_{out} = \left\lfloor\frac{D_{in} + 2 * \text{padding}[0] - + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[1] - + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor .. math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[2] - + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor Examples:: @@ -718,11 +718,11 @@ class LPPool1d(_LPPoolNd): Shape: - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where + - Output: :math:`(N, C, L_{out})`, where .. math:: L_{out} = \left\lfloor\frac{L_{in} + - 2 * \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + 2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor Examples:: >>> # power-2 pool of window of length 3, with stride 2. @@ -746,7 +746,7 @@ class LPPool2d(_LPPoolNd): f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} - At p = :math:`\infty`, one gets Max Pooling - - At p = 1, one gets Sum Pooling (which is proportional to Average Pooling) + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) The parameters :attr:`kernel_size`, :attr:`stride` can either be: @@ -764,14 +764,14 @@ class LPPool2d(_LPPoolNd): Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[0] - \text{dilation}[0] * + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor .. 
math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[1] - \text{dilation}[1] * + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor Examples:: @@ -800,6 +800,9 @@ def __init__(self, output_size, return_indices=False): def extra_repr(self): return 'output_size={}'.format(self.output_size) +# FIXME (by @ssnl): Improve adaptive pooling docs: specify what the input and +# output shapes are, and how the operation computes output. + class AdaptiveMaxPool1d(_AdaptiveMaxPoolNd): r"""Applies a 1D adaptive max pooling over an input signal composed of several input planes. diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py index 3fe381a59b5108..94b60bda836d12 100644 --- a/torch/nn/parallel/data_parallel.py +++ b/torch/nn/parallel/data_parallel.py @@ -36,9 +36,10 @@ class DataParallel(Module): This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch - dimension. In the forward pass, the module is replicated on each device, - and each replica handles a portion of the input. During the backwards - pass, gradients from each replica are summed into the original module. + dimension (other objects will be copied once per device). In the forward + pass, the module is replicated on each device, and each replica handles a + portion of the input. During the backwards pass, gradients from each replica + are summed into the original module. The batch size should be larger than the number of GPUs used. diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py index b61fdabd3e828d..c5ad556e510a89 100644 --- a/torch/nn/utils/rnn.py +++ b/torch/nn/utils/rnn.py @@ -215,10 +215,9 @@ def pad_sequence(sequences, batch_first=False, padding_value=0): torch.Size([25, 3, 300]) Note: - This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` where `T` is the - length of the longest sequence. - Function assumes trailing dimensions and type of all the Tensors - in sequences are same. + This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` + where `T` is the length of the longest sequence. This function assumes + trailing dimensions and type of all the Tensors in sequences are same. Arguments: sequences (list[Tensor]): list of variable length sequences. @@ -227,7 +226,7 @@ def pad_sequence(sequences, batch_first=False, padding_value=0): padding_value (float, optional): value for padded elements. Default: 0. Returns: - Tensor of size ``T x B x *`` if batch_first is False + Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. Tensor of size ``B x T x *`` otherwise """ From 31850163acaa8fff1fbb1d1d48d36fdfb97acae8 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Tue, 11 Sep 2018 08:51:02 -0700 Subject: [PATCH 032/237] Remove separate ATen build target (#11488) Summary: ATen has had a separate build target in the past, but with our move to a root-level CMakeLists.txt file this makes less sense and is harder to maintain. Also, as we blend code between Caffe2 and ATen this will become even less maintainable. Talked to ezyang about this, but also cc zdevito, Yangqing, and soumith. If this is too difficult, I will revert, but want to see if we can simplify for now. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11488 Differential Revision: D9770266 Pulled By: orionr fbshipit-source-id: c7ba52a1676d84e2d052dad4c042b666f49451cd --- .jenkins/pytorch/build.sh | 6 ---- aten/CMakeLists.txt | 68 +++++++----------------------------- aten/src/ATen/CMakeLists.txt | 67 ----------------------------------- tools/test_aten_install.sh | 11 ------ 4 files changed, 13 insertions(+), 139 deletions(-) delete mode 100755 tools/test_aten_install.sh diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index f108c37cdd6ed1..d9e58d7e8b1604 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -102,12 +102,6 @@ fi # Add the test binaries so that they won't be git clean'ed away git add -f build/bin -# Testing ATen install -if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then - echo "Testing ATen install" - time tools/test_aten_install.sh -fi - # Test C FFI plugins # cffi install doesn't work for Python 3.7 if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index ee025265a982e7..0f0019d57b11e9 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -1,19 +1,5 @@ -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(ATen CXX C) - include(CMakeDependentOption) - option(USE_CUDA "Use CUDA" ON) - option(USE_ROCM "Use ROCm" OFF) - option(USE_CUDNN "Use cuDNN" ON) - option(USE_MKLDNN "Use MKLDNN" ON) - cmake_dependent_option( - USE_CUDNN "Use cuDNN" ON - "USE_CUDA" OFF) - option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) -else() - if (BUILD_ATEN_MOBILE) - return() - endif() +if (BUILD_ATEN_MOBILE) + return() endif() # Find modules @@ -42,32 +28,6 @@ SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory") SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory") SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory") -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Build variables set within the cmake tree - include(../cmake/BuildVariables.cmake) - set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.") - - # ---[ Misc checks to cope with various compiler modes - include(../cmake/MiscCheck.cmake) - - # External projects - include(ExternalProject) - - # ---[ Utils - # TODO: merge the following 3 files into cmake/public/utils.cmake. 
- include(../cmake/Utils.cmake) - include(../cmake/public/utils.cmake) - - # ---[ Dependencies - include(../cmake/Dependencies.cmake) - list(APPEND ATen_CPU_INCLUDE ${Caffe2_CPU_INCLUDE}) - list(APPEND ATen_CUDA_INCLUDE ${Caffe2_GPU_INCLUDE}) - list(APPEND ATen_CPU_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS}) - list(APPEND ATen_PUBLIC_CUDA_DEPENDENCY_LIBS - ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) -endif() - if(USE_CUDA) list(APPEND ATen_CUDA_INCLUDE ${CUDA_INCLUDE_DIRS}) endif() @@ -132,16 +92,14 @@ list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) add_subdirectory(src/ATen) -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Pass source, includes, and libs to parent - set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) - set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) - set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) - set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) - set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) - set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) - set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) - set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) -endif() +# Pass source, includes, and libs to parent +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) +set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) +set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index d65211e2269d98..994756fa18c995 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,11 +1,6 @@ cmake_minimum_required(VERSION 3.0 FATAL_ERROR) SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Generate and install header and cpp files - include(../../../cmake/Codegen.cmake) -endif() - IF(NOT MSVC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-qualifiers") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-ignored-qualifiers") @@ -352,34 +347,6 @@ if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") endif() endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Eventually replace this use of LOCATION with use of - # $, but generators only work in some cases - cmake_policy(SET CMP0026 OLD) - get_target_property(ATEN_CPU_OUTPUT_NAME ATen_cpu LOCATION) - get_filename_component(ATEN_CPU_OUTPUT_NAME ${ATEN_CPU_OUTPUT_NAME} NAME) - set(ATEN_LIBRARIES - "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CPU_OUTPUT_NAME}") - if(USE_CUDA OR USE_ROCM) - get_target_property(ATEN_CUDA_OUTPUT_NAME ATen_cuda LOCATION) - get_filename_component(ATEN_CUDA_OUTPUT_NAME ${ATEN_CUDA_OUTPUT_NAME} NAME) - list(APPEND ATEN_LIBRARIES - "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CUDA_OUTPUT_NAME}") - endif() - - install(TARGETS ATen_cpu - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION 
"${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - - if(USE_CUDA OR USE_ROCM) - install(TARGETS ATen_cuda - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - endif() -endif() - SET(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") CONFIGURE_FILE(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" @@ -404,40 +371,6 @@ else() add_subdirectory(test) endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if (BUILD_TEST) - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} ATen_cpu) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - endif() - endif() - - # Make sure these don't get built by parent - set(ATen_CPU_TEST_SRCS) - set(ATen_CUDA_TEST_SRCS) -endif() - # Pass source, includes, and libs to parent set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) diff --git a/tools/test_aten_install.sh b/tools/test_aten_install.sh deleted file mode 100755 index d2d57235505778..00000000000000 --- a/tools/test_aten_install.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh -set -xe -rm -rf aten_build -rm -rf aten_install -mkdir aten_build aten_install -cd aten_build -cmake ../aten -DUSE_CUDA=OFF -DCMAKE_INSTALL_PREFIX=../aten_install -NUM_JOBS="$(getconf _NPROCESSORS_ONLN)" -make -j"$NUM_JOBS" install -cd .. -aten/tools/test_install.sh $(pwd)/aten_install $(pwd)/aten From 4e8d9a4a5838e99082c6866032a39efd78d341ac Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 11 Sep 2018 08:51:19 -0700 Subject: [PATCH 033/237] Introducing python setup.py rebuild develop (#11487) Summary: This speeds up incremental builds by doing the following changes: - Uses `rsync` instead of `cp` (when `rsync` is found) which is a bit smarter in doing "maybe copy" - Introduces a `rebuild` mode which does not rerun `cmake` in `build_pytorch_libs.sh`. *Note: `rebuild` should only be used if you dont add / remove files to the build, as `cmake` is not rerun* Current no-op rebuild speedup: - 1m 15s -> 20s There are some lingering bugs. No-op rebuilds rerun `cmake` for two rebuilds (likely that cmake logic is dependent on the install folder, hence kicking off rebuild). 
So what you see ``` python setup.py rebuild develop # first time - ~5 mins python setup.py rebuild develop # second time - ~3 mins python setup.py rebuild develop # third time - ~2 mins python setup.py rebuild develop # fourth time - ~20 seconds python setup.py rebuild develop # fifth time - ~20 seconds ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11487 Differential Revision: D9769087 Pulled By: soumith fbshipit-source-id: 20fbecde33af6426149c13767e8734fb3be783c5 --- setup.py | 38 ++++++- tools/build_pytorch_libs.sh | 206 ++++++++++++++++++++---------------- 2 files changed, 148 insertions(+), 96 deletions(-) diff --git a/setup.py b/setup.py index 7edd89cbd7fcb6..10ef480c73ceeb 100644 --- a/setup.py +++ b/setup.py @@ -187,6 +187,7 @@ def hotpatch_var(var, prefix='USE_'): BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK') +RERUN_CMAKE = True NUM_JOBS = multiprocessing.cpu_count() max_jobs = os.getenv("MAX_JOBS") @@ -382,6 +383,8 @@ def build_libs(libs): build_libs_cmd += ['--use-mkldnn'] if USE_GLOO_IBVERBS: build_libs_cmd += ['--use-gloo-ibverbs'] + if not RERUN_CMAKE: + build_libs_cmd += ['--dont-rerun-cmake'] my_env["BUILD_TORCH"] = "ON" my_env["BUILD_PYTHON"] = "ON" @@ -464,6 +467,7 @@ def check_file(f): build_dep_cmds = {} +rebuild_dep_cmds = {} for lib in dep_libs: # wrap in function to capture lib @@ -475,6 +479,16 @@ def run(self): build_dep.lib = lib build_dep_cmds['build_' + lib.lower()] = build_dep + class rebuild_dep(build_deps): + description = 'Rebuild {} external library'.format(lib) + + def run(self): + global RERUN_CMAKE + RERUN_CMAKE = False + build_libs([self.lib]) + rebuild_dep.lib = lib + rebuild_dep_cmds['rebuild_' + lib.lower()] = rebuild_dep + class build_module(PytorchCommand): def run(self): @@ -514,8 +528,15 @@ def load(filename): all_commands = [entry for f in ninja_files + cmake_files for entry in load(f)] - with open('compile_commands.json', 'w') as f: - json.dump(all_commands, f, indent=2) + + new_contents = json.dumps(all_commands, indent=2) + contents = '' + if os.path.exists('compile_commands.json'): + with open('compile_commands.json', 'r') as f: + contents = f.read() + if contents != new_contents: + with open('compile_commands.json', 'w') as f: + f.write(new_contents) if not USE_NINJA: print("WARNING: 'develop' is not building C++ code incrementally") print("because ninja is not installed. 
Run this to enable it:") @@ -676,6 +697,17 @@ class build(distutils.command.build.build): ] + distutils.command.build.build.sub_commands +class rebuild(distutils.command.build.build): + sub_commands = [ + ('build_deps', lambda self: True), + ] + distutils.command.build.build.sub_commands + + def run(self): + global RERUN_CMAKE + RERUN_CMAKE = False + distutils.command.build.build.run(self) + + class install(setuptools.command.install.install): def run(self): @@ -1116,11 +1148,13 @@ def make_relative_rpath(path): 'build_deps': build_deps, 'build_module': build_module, 'rebuild_libtorch': rebuild_libtorch, + 'rebuild': rebuild, 'develop': develop, 'install': install, 'clean': clean, } cmdclass.update(build_dep_cmds) +cmdclass.update(rebuild_dep_cmds) entry_points = { 'console_scripts': [ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index d09dc756d82d8e..3134bcafa6f968 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -10,6 +10,11 @@ set -ex +SYNC_COMMAND="cp" +if [ -x "$(command -v rsync)" ]; then + SYNC_COMMAND="time rsync -lptgoD" +fi + # Options for building only a subset of the libraries USE_CUDA=0 USE_ROCM=0 @@ -17,8 +22,12 @@ USE_NNPACK=0 USE_MKLDNN=0 USE_GLOO_IBVERBS=0 CAFFE2_STATIC_LINK_CUDA=0 +RERUN_CMAKE=1 while [[ $# -gt 0 ]]; do case "$1" in + --dont-rerun-cmake) + RERUN_CMAKE=0 + ;; --use-cuda) USE_CUDA=1 ;; @@ -147,46 +156,49 @@ function build() { THCS | THCUNN ) BUILD_C_FLAGS=$C_FLAGS;; *) BUILD_C_FLAGS=$C_FLAGS" -fexceptions";; esac - # TODO: The *_LIBRARIES cmake variables should eventually be - # deprecated because we are using .cmake files to handle finding - # installed libraries instead - ${CMAKE_VERSION} ../../$1 -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ - ${CMAKE_GENERATOR} \ - -DTorch_FOUND="1" \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DCMAKE_C_FLAGS="$BUILD_C_FLAGS $USER_CFLAGS" \ - -DCMAKE_CXX_FLAGS="$BUILD_C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ - -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ - -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ - -DCMAKE_INSTALL_LIBDIR="$INSTALL_DIR/lib" \ - -DCUDA_NVCC_FLAGS="$CUDA_NVCC_FLAGS" \ - -DCUDA_DEVICE_DEBUG=$CUDA_DEVICE_DEBUG \ - -DCMAKE_PREFIX_PATH="$INSTALL_DIR" \ - -Dcwrap_files="$CWRAP_FILES" \ - -DTH_INCLUDE_PATH="$INSTALL_DIR/include" \ - -DTH_LIB_PATH="$INSTALL_DIR/lib" \ - -DTH_LIBRARIES="$INSTALL_DIR/lib/libTH$LD_POSTFIX" \ - -DCAFFE2_LIBRARIES="$INSTALL_DIR/lib/libcaffe2$LD_POSTFIX" \ - -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ - -DTHNN_LIBRARIES="$INSTALL_DIR/lib/libTHNN$LD_POSTFIX" \ - -DTHCUNN_LIBRARIES="$INSTALL_DIR/lib/libTHCUNN$LD_POSTFIX" \ - -DTHS_LIBRARIES="$INSTALL_DIR/lib/libTHS$LD_POSTFIX" \ - -DTHC_LIBRARIES="$INSTALL_DIR/lib/libTHC$LD_POSTFIX" \ - -DTHCS_LIBRARIES="$INSTALL_DIR/lib/libTHCS$LD_POSTFIX" \ - -DTH_SO_VERSION=1 \ - -DTHC_SO_VERSION=1 \ - -DTHNN_SO_VERSION=1 \ - -DTHCUNN_SO_VERSION=1 \ - -DTHD_SO_VERSION=1 \ - -DUSE_CUDA=$USE_CUDA \ - -DBUILD_EXAMPLES=OFF \ - -DBUILD_TEST=$BUILD_TEST \ - -DNO_NNPACK=$((1-$USE_NNPACK)) \ - -DNCCL_EXTERNAL=1 \ - -DCMAKE_DEBUG_POSTFIX="" \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - ${@:2} \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ${CMAKE_ARGS[@]} + if [[ $RERUN_CMAKE -eq 1 ]] || [ ! 
-f CMakeCache.txt ]; then + # TODO: The *_LIBRARIES cmake variables should eventually be + # deprecated because we are using .cmake files to handle finding + # installed libraries instead + ${CMAKE_VERSION} ../../$1 -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ + ${CMAKE_GENERATOR} \ + -DCMAKE_INSTALL_MESSAGE="LAZY" \ + -DTorch_FOUND="1" \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DCMAKE_C_FLAGS="$BUILD_C_FLAGS $USER_CFLAGS" \ + -DCMAKE_CXX_FLAGS="$BUILD_C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ + -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ + -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ + -DCMAKE_INSTALL_LIBDIR="$INSTALL_DIR/lib" \ + -DCUDA_NVCC_FLAGS="$CUDA_NVCC_FLAGS" \ + -DCUDA_DEVICE_DEBUG=$CUDA_DEVICE_DEBUG \ + -DCMAKE_PREFIX_PATH="$INSTALL_DIR" \ + -Dcwrap_files="$CWRAP_FILES" \ + -DTH_INCLUDE_PATH="$INSTALL_DIR/include" \ + -DTH_LIB_PATH="$INSTALL_DIR/lib" \ + -DTH_LIBRARIES="$INSTALL_DIR/lib/libTH$LD_POSTFIX" \ + -DCAFFE2_LIBRARIES="$INSTALL_DIR/lib/libcaffe2$LD_POSTFIX" \ + -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ + -DTHNN_LIBRARIES="$INSTALL_DIR/lib/libTHNN$LD_POSTFIX" \ + -DTHCUNN_LIBRARIES="$INSTALL_DIR/lib/libTHCUNN$LD_POSTFIX" \ + -DTHS_LIBRARIES="$INSTALL_DIR/lib/libTHS$LD_POSTFIX" \ + -DTHC_LIBRARIES="$INSTALL_DIR/lib/libTHC$LD_POSTFIX" \ + -DTHCS_LIBRARIES="$INSTALL_DIR/lib/libTHCS$LD_POSTFIX" \ + -DTH_SO_VERSION=1 \ + -DTHC_SO_VERSION=1 \ + -DTHNN_SO_VERSION=1 \ + -DTHCUNN_SO_VERSION=1 \ + -DTHD_SO_VERSION=1 \ + -DUSE_CUDA=$USE_CUDA \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_TEST=$BUILD_TEST \ + -DNO_NNPACK=$((1-$USE_NNPACK)) \ + -DNCCL_EXTERNAL=1 \ + -DCMAKE_DEBUG_POSTFIX="" \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + ${@:2} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ${CMAKE_ARGS[@]} + fi ${CMAKE_INSTALL} -j"$MAX_JOBS" popd @@ -211,18 +223,21 @@ function path_remove { function build_nccl() { mkdir -p build/nccl pushd build/nccl - ${CMAKE_VERSION} ../../nccl -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ - ${CMAKE_GENERATOR} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DCMAKE_C_FLAGS="$C_FLAGS $USER_CFLAGS" \ - -DCMAKE_CXX_FLAGS="$C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ - -DCMAKE_SHARED_LINKER_FLAGS="$USER_LDFLAGS" \ - -DCMAKE_UTILS_PATH="$BASE_DIR/cmake/public/utils.cmake" \ - -DNUM_JOBS="$MAX_JOBS" + if [[ $RERUN_CMAKE -eq 1 ]] || [ ! -f CMakeCache.txt ]; then + ${CMAKE_VERSION} ../../nccl -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ + ${CMAKE_GENERATOR} \ + -DCMAKE_INSTALL_MESSAGE="LAZY" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DCMAKE_C_FLAGS="$C_FLAGS $USER_CFLAGS" \ + -DCMAKE_CXX_FLAGS="$C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ + -DCMAKE_SHARED_LINKER_FLAGS="$USER_LDFLAGS" \ + -DCMAKE_UTILS_PATH="$BASE_DIR/cmake/public/utils.cmake" \ + -DNUM_JOBS="$MAX_JOBS" + fi ${CMAKE_INSTALL} -j"$MAX_JOBS" mkdir -p ${INSTALL_DIR}/lib - cp "lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so.1" + $SYNC_COMMAND "lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so.1" if [ ! 
-f "${INSTALL_DIR}/lib/libnccl.so" ]; then ln -s "${INSTALL_DIR}/lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so" fi @@ -252,50 +267,53 @@ function build_caffe2() { EXTRA_CAFFE2_CMAKE_FLAGS+=("-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH") fi - ${CMAKE_VERSION} $BASE_DIR \ - ${CMAKE_GENERATOR} \ - -DPYTHON_EXECUTABLE=$PYTORCH_PYTHON \ - -DBUILDING_WITH_TORCH_LIBS=ON \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DBUILD_TORCH=$BUILD_TORCH \ - -DBUILD_PYTHON=$BUILD_PYTHON \ - -DBUILD_SHARED_LIBS=$BUILD_SHARED_LIBS \ - -DBUILD_BINARY=$BUILD_BINARY \ - -DBUILD_TEST=$BUILD_TEST \ - -DINSTALL_TEST=$INSTALL_TEST \ - -DBUILD_CAFFE2_OPS=$BUILD_CAFFE2_OPS \ - -DONNX_NAMESPACE=$ONNX_NAMESPACE \ - -DUSE_CUDA=$USE_CUDA \ - -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ - -DUSE_ROCM=$USE_ROCM \ - -DUSE_NNPACK=$USE_NNPACK \ - -DUSE_LEVELDB=$USE_LEVELDB \ - -DUSE_LMDB=$USE_LMDB \ - -DUSE_OPENCV=$USE_OPENCV \ - -DUSE_GLOG=OFF \ - -DUSE_GFLAGS=OFF \ - -DUSE_SYSTEM_EIGEN_INSTALL=OFF \ - -DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \ - -DCUDNN_LIB_DIR=$CUDNN_LIB_DIR \ - -DCUDNN_LIBRARY=$CUDNN_LIBRARY \ - -DUSE_MKLDNN=$USE_MKLDNN \ - -DMKLDNN_INCLUDE_DIR=$MKLDNN_INCLUDE_DIR \ - -DMKLDNN_LIB_DIR=$MKLDNN_LIB_DIR \ - -DMKLDNN_LIBRARY=$MKLDNN_LIBRARY \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ - -DCMAKE_C_FLAGS="$USER_CFLAGS" \ - -DCMAKE_CXX_FLAGS="$USER_CFLAGS" \ - -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ - -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" ${EXTRA_CAFFE2_CMAKE_FLAGS[@]} + if [[ $RERUN_CMAKE -eq 1 ]] || [ ! -f CMakeCache.txt ]; then + ${CMAKE_VERSION} $BASE_DIR \ + ${CMAKE_GENERATOR} \ + -DCMAKE_INSTALL_MESSAGE="LAZY" \ + -DPYTHON_EXECUTABLE=$PYTORCH_PYTHON \ + -DBUILDING_WITH_TORCH_LIBS=ON \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DBUILD_TORCH=$BUILD_TORCH \ + -DBUILD_PYTHON=$BUILD_PYTHON \ + -DBUILD_SHARED_LIBS=$BUILD_SHARED_LIBS \ + -DBUILD_BINARY=$BUILD_BINARY \ + -DBUILD_TEST=$BUILD_TEST \ + -DINSTALL_TEST=$INSTALL_TEST \ + -DBUILD_CAFFE2_OPS=$BUILD_CAFFE2_OPS \ + -DONNX_NAMESPACE=$ONNX_NAMESPACE \ + -DUSE_CUDA=$USE_CUDA \ + -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ + -DUSE_ROCM=$USE_ROCM \ + -DUSE_NNPACK=$USE_NNPACK \ + -DUSE_LEVELDB=$USE_LEVELDB \ + -DUSE_LMDB=$USE_LMDB \ + -DUSE_OPENCV=$USE_OPENCV \ + -DUSE_GLOG=OFF \ + -DUSE_GFLAGS=OFF \ + -DUSE_SYSTEM_EIGEN_INSTALL=OFF \ + -DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \ + -DCUDNN_LIB_DIR=$CUDNN_LIB_DIR \ + -DCUDNN_LIBRARY=$CUDNN_LIBRARY \ + -DUSE_MKLDNN=$USE_MKLDNN \ + -DMKLDNN_INCLUDE_DIR=$MKLDNN_INCLUDE_DIR \ + -DMKLDNN_LIB_DIR=$MKLDNN_LIB_DIR \ + -DMKLDNN_LIBRARY=$MKLDNN_LIBRARY \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ + -DCMAKE_C_FLAGS="$USER_CFLAGS" \ + -DCMAKE_CXX_FLAGS="$USER_CFLAGS" \ + -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ + -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" ${EXTRA_CAFFE2_CMAKE_FLAGS[@]} # STOP!!! Are you trying to add a C or CXX flag? Add it # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. + fi # This is needed by the aten tests built with caffe2 if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! 
-f "lib/libnccl.so.1" ]; then - # cp root/torch/lib/tmp_install/libnccl root/build/lib/libnccl - cp "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl + $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" fi ${CMAKE_INSTALL} -j"$MAX_JOBS" @@ -352,15 +370,15 @@ pushd $TORCH_LIB_DIR # binaries to torch/lib rm -rf "$INSTALL_DIR/lib/cmake" rm -rf "$INSTALL_DIR/lib/python" -cp -r "$INSTALL_DIR/lib"/* . +$SYNC_COMMAND -r "$INSTALL_DIR/lib"/* . if [ -d "$INSTALL_DIR/lib64/" ]; then - cp -r "$INSTALL_DIR/lib64"/* . + $SYNC_COMMAND -r "$INSTALL_DIR/lib64"/* . fi -cp ../../aten/src/THNN/generic/THNN.h . -cp ../../aten/src/THCUNN/generic/THCUNN.h . -cp -r "$INSTALL_DIR/include" . +$SYNC_COMMAND ../../aten/src/THNN/generic/THNN.h . +$SYNC_COMMAND ../../aten/src/THCUNN/generic/THCUNN.h . +$SYNC_COMMAND -r "$INSTALL_DIR/include" . if [ -d "$INSTALL_DIR/bin/" ]; then - cp -r "$INSTALL_DIR/bin/"/* . + $SYNC_COMMAND -r "$INSTALL_DIR/bin/"/* . fi popd From deac304b6b4e515dd606738c01ec441e9baa1987 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 11 Sep 2018 09:26:27 -0700 Subject: [PATCH 034/237] Bugfix for basic slicing Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11428 Differential Revision: D9753999 Pulled By: jamesr66a fbshipit-source-id: cfc4163a5a06b41beb808a4e24650d71f5d91f4f --- test/test_jit.py | 13 +++++++++++++ torch/csrc/jit/script/compiler.cpp | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test/test_jit.py b/test/test_jit.py index 1b880fe3b0fdec..23b76d815d4f07 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1700,6 +1700,19 @@ def random_foo(x): x = torch.rand(5, 6) self.assertEqual(random_foo(x), random_foo_traced(x)) + def test_trace_slice_expr_complete_type(self): + def random_foo(x): + return x + 1.0 + + random_foo_traced = torch.jit.trace(random_foo, (torch.rand(3, 4),)) + + @torch.jit.script + def random_bar(x): + return random_foo_traced(x)[0:1] + + x = torch.rand(3, 4) + self.assertEqual(random_bar(x), (x + 1)[0:1]) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 8aae072a02a1fb..988ac6e1139fb0 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1727,7 +1727,7 @@ struct to_ir { auto slice_exp = SliceExpr(subscript.subscript_exprs()[0]); auto * sliceable = emitExpr(subscript.value()); at::optional maybe_dim; - if (sliceable->type()->kind() == TypeKind::DynamicType) { + if (sliceable->type()->isSubtypeOf(DynamicType::get())) { // If the sliceable object is a tensor, specify a default dimension maybe_dim = 0; } From d32b41003aa5d0ec10aa72be126ad9c136b50111 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Tue, 11 Sep 2018 10:03:36 -0700 Subject: [PATCH 035/237] Copy protos on install same as develop (#11517) Summary: This is a potential fix for https://github.com/pytorch/pytorch/issues/11453 and https://github.com/pytorch/pytorch/issues/11074 worked through with pjh5 . Turns out we had some protos copy code that was in the .sh file that was removed. Better to have it in setup.py, though, same as for develop. 
cc ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11517 Differential Revision: D9771911 Pulled By: orionr fbshipit-source-id: 76975d8f71f38d951eaaed0b50dd3ec36dd177a9 --- setup.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 10ef480c73ceeb..f99a2bf945c162 100644 --- a/setup.py +++ b/setup.py @@ -408,6 +408,17 @@ def build_libs(libs): sys.exit(1) +# Copy Caffe2's Python proto files (generated during the build with the +# protobuf python compiler) from the build folder to the root folder +# cp root/build/caffe2/proto/proto.py root/caffe2/proto/proto.py +def copy_protos(): + for src in glob.glob( + os.path.join(caffe2_build_dir, 'caffe2', 'proto', '*.py')): + dst = os.path.join( + cwd, os.path.relpath(src, caffe2_build_dir)) + shutil.copyfile(src, dst) + + # Build all dependent libraries class build_deps(PytorchCommand): def run(self): @@ -509,15 +520,7 @@ def run(self): self.run_command('create_version_file') setuptools.command.develop.develop.run(self) self.create_compile_commands() - - # Copy Caffe2's Python proto files (generated during the build with the - # protobuf python compiler) from the build folder to the root folder - # cp root/build/caffe2/proto/proto.py root/caffe2/proto/proto.py - for src in glob.glob( - os.path.join(caffe2_build_dir, 'caffe2', 'proto', '*.py')): - dst = os.path.join( - cwd, os.path.relpath(src, caffe2_build_dir)) - self.copy_file(src, dst) + copy_protos() def create_compile_commands(self): def load(filename): @@ -713,6 +716,7 @@ class install(setuptools.command.install.install): def run(self): if not self.skip_build: self.run_command('build_deps') + copy_protos() setuptools.command.install.install.run(self) From 01c7542f43a0c27215bfadf3cde43797f700982d Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 10:29:54 -0700 Subject: [PATCH 036/237] Use -isystem for system includes in C++ extensions (#11459) Summary: I noticed warnings from within pybind11 being shown when building C++ extensions. This can be avoided by including non-user-supplied headers with `-isystem` instead of `-I` I hope this works on Windows. soumith ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11459 Differential Revision: D9764444 Pulled By: goldsborough fbshipit-source-id: b288572106078f347f0342f158f9e2b63a58c235 --- torch/utils/cpp_extension.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 43edbd2f8bd055..22940aaa5f7ea5 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -817,15 +817,21 @@ def _write_ninja_file(path, # Turn into absolute paths so we can emit them into the ninja build # file wherever it is. sources = [os.path.abspath(file) for file in sources] - includes = [os.path.abspath(file) for file in extra_include_paths] + user_includes = [os.path.abspath(file) for file in extra_include_paths] # include_paths() gives us the location of torch/torch.h - includes += include_paths(with_cuda) + system_includes = include_paths(with_cuda) # sysconfig.get_paths()['include'] gives us the location of Python.h - includes.append(sysconfig.get_paths()['include']) + system_includes.append(sysconfig.get_paths()['include']) + + # Windoze does not understand `-isystem`. 
+ if sys.platform == 'win32': + user_includes += system_includes + system_includes.clear() common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)] - common_cflags += ['-I{}'.format(include) for include in includes] + common_cflags += ['-I{}'.format(include) for include in user_includes] + common_cflags += ['-isystem {}'.format(include) for include in system_includes] if is_binary_build(): common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=0'] From f80f15866b28ca148c797f5998f61bffb6e4b364 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 11 Sep 2018 10:37:39 -0700 Subject: [PATCH 037/237] Get rid of manual dispatch on Type. (#11486) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11486 I discovered these by narrowing the interface on Type, and then fixing call sites outside of core plumbing code which depended on these methods being provided. Reviewed By: cpuhrsch Differential Revision: D9757935 fbshipit-source-id: 3abda0c98919a448a326a757671d438964f6909f --- aten/src/ATen/native/Embedding.cpp | 9 ++++----- aten/src/ATen/native/LinearAlgebra.cpp | 6 +++--- aten/src/ATen/native/TensorShape.cpp | 2 +- aten/src/ATen/native/cudnn/RNN.cpp | 6 +++--- aten/src/ATen/native/sparse/SparseTensorMath.cpp | 4 ++-- torch/csrc/utils/tensor_flatten.cpp | 3 +-- 6 files changed, 14 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 67c0877f9fa072..99fa4c701d4bbf 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -67,18 +67,17 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; auto& dense_type = grad.type(); - auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? 
Backend::SparseCUDA : Backend::SparseCPU); // check if all our grad come from padding_idx if (grad.numel() == 0) { - return sparse_type._sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), - dense_type.tensor({0, num_features}), - weight_size); + return at::_sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), + dense_type.tensor({0, num_features}), + weight_size); } auto index = indices.reshape({1, -1}); auto values = grad.reshape({-1, num_features}); - return sparse_type._sparse_coo_tensor_unsafe(index, values, weight_size); + return at::_sparse_coo_tensor_unsafe(index, values, weight_size); } Tensor embedding_dense_backward_cpu( diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2371d82efc6cfb..d596a7cb99a265 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -169,14 +169,14 @@ Tensor mm(const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { return mat2.type().addmm(at::zeros({}, mat2.type()), self, mat2, 0, 1); } - return self.type()._mm(self, mat2); + return at::_mm(self, mat2); } Tensor& mm_out(Tensor& result, const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { - return mat2.type().addmm_out(result, at::zeros({}, mat2.type()), self, mat2, 0, 1); + return at::addmm_out(result, at::zeros({}, mat2.options()), self, mat2, 0, 1); } - return self.type()._mm_out(result, self, mat2); + return at::_mm_out(result, self, mat2); } Tensor mv(const Tensor& self, const Tensor& vec) { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index cf6a9ece5c0d9d..822bea31fee942 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -206,7 +206,7 @@ Tensor repeat(const Tensor& self, IntList repeats) { Tensor xtensor = self.expand(padded_size); Tensor result = self.type().tensor(target_size); - Tensor urtensor = result.type().alias(result); + Tensor urtensor = at::alias(result); for (int64_t i = 0; i < xtensor.dim(); ++i) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 8fc896afe23a12..e3b6aaadf0f0b6 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1059,9 +1059,9 @@ std::tuple> _cudnn_rnn_backward( std::array output_mask ) { - auto grad_output = grad_output_r.defined() ? grad_output_r : output.type().zeros_like(output); - auto grad_hy = grad_hy_r.defined() ? grad_hy_r : hx.type().zeros_like(hx); - auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : cx.type().zeros_like(cx)) : grad_cy_r; + auto grad_output = grad_output_r.defined() ? grad_output_r : at::zeros_like(output); + auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx); + auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? 
grad_cy_r : at::zeros_like(cx)) : grad_cy_r; Tensor dx, dhx, dcx; // NB: unconditionally compute this gradient, because it mutates reserve diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 2e37ad41a3b96e..afd8001734a9a8 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -784,7 +784,7 @@ Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, // sparse, dense -> sparse Tensor smm(const Tensor& self, const Tensor& mat2) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, result, self, mat2, 0.0, 1.0); + at::sspaddmm_out(result, result, self, mat2, 0.0, 1.0); return result; } @@ -792,7 +792,7 @@ Tensor smm(const Tensor& self, const Tensor& mat2) { Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, self, mat1, mat2, beta, alpha); + at::sspaddmm_out(result, self, mat1, mat2, beta, alpha); return result; } diff --git a/torch/csrc/utils/tensor_flatten.cpp b/torch/csrc/utils/tensor_flatten.cpp index e1bbd48316d6c6..29414368c7bd50 100644 --- a/torch/csrc/utils/tensor_flatten.cpp +++ b/torch/csrc/utils/tensor_flatten.cpp @@ -85,9 +85,8 @@ std::vector unflatten_sparse_tensors( std::vector outputs; outputs.reserve(tensors.size()); - auto & type = tensors[0].type(); for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; ++i) - outputs.emplace_back(type._sparse_coo_tensor_unsafe(indices[i], values[i], tensors[i].sizes())); + outputs.emplace_back(at::_sparse_coo_tensor_unsafe(indices[i], values[i], tensors[i].sizes())); return outputs; } From 727a4453aa742db4dc59079fe10d3b7a67e62086 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Tue, 11 Sep 2018 10:44:00 -0700 Subject: [PATCH 038/237] New Serialization Proto Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11166 Reviewed By: mingzhe09088 Differential Revision: D9623522 Pulled By: houseroad fbshipit-source-id: f21153034a398de7959404321d8534234cd58a40 --- caffe2/proto/torch.proto | 550 +++++++++++++++++++++++++++++++++++++ caffe2/utils/proto_wrap.cc | 15 + cmake/ProtoBuf.cmake | 2 +- 3 files changed, 566 insertions(+), 1 deletion(-) create mode 100644 caffe2/proto/torch.proto diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto new file mode 100644 index 00000000000000..9e626d8d845260 --- /dev/null +++ b/caffe2/proto/torch.proto @@ -0,0 +1,550 @@ +syntax = "proto2"; + +import "caffe2/proto/caffe2.proto"; + +package torch; + +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. 
While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of +// protobuf that is compatible with both protobuf v2 and v3. This means that we +// do not use any protobuf features that are only available in one of the two +// versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. + _START_VERSION = 0; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_NEWEST_ONNX = 0x0000000000000003; + + // PYTORCH IR VERSION + IR_VERSION_NEWEST = 0x0000000000000103; +} + +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute + + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. + // In this case, this AttributeProto does not contain data, and it's a reference of attribute + // in parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; + + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; + + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field hueristics to determine + // which value field was in use. 
For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accomodate proto3 implementations. + optional AttributeType type = 20; // discriminator that indicates which field below is in use + + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph + + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph +} + +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value + + // An optional identifier for this node in a graph. + // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node + + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain + + // Additional named attributes. + repeated AttributeProto attribute = 5; + + // A human-readable documentation for this node. Markdown is allowed. + optional string doc_string = 6; + + // Additional annotations, attributes are defined in Schema + // To be added as annotations: + // string engine + // string list control_input + // int64 is_gradient_op + // string debug_info + repeated AttributeProto annotations = 8; + + // Besides the node type, PyTorhc also serialize ATen function signature + optional caffe2.DeviceOption device_option = 51; + optional string aten_function = 52; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +// +// Model ==> Caffe2 MetaNetDef +// ==> PyTorch Module +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. 
+ // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. + // The main graph, in single graph case, it is ONNX compatible. + optional GraphProto graph = 7; + + // The remaining nets in MetaNetDef. + // Submodules and methods in PyTorch. + repeated GraphProto methods = 15; + + // Named metadata values; keys should be distinct. + // Many meta data in MetaNetDef and preditor are piggy backed here. + // 1) project + // 2) model_class + // 3) internal_version + // 4) predictor_type + // 5) predictor_id + // 6) execute_plan + // 7) applicationSpecificInfo (another string map, need to verify it has no duplicate.) + // 8) engine + // 9) publish time + repeated StringStringEntryProto metadata_props = 14; + + // Model name + optional string name = 16; + + // Model name + repeated AttributeProto annotations = 17; + + // Mapping from list name to blob name list, must be string list type. + // Equivalent to blobs in MetaNetDef. + repeated AttributeProto blob_lists = 51; + + // Mapping from plan name to serialized plan, must be string list type. + // Equivalent to plans in MetaNetDef. + repeated AttributeProto plans = 52; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. +// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +// Graph ==> NetDef in Caffe2 +// ==> Submodule/Method in PyTorch +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // also appears in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. 
+ repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // Additional annotations. + repeated AttributeProto annotations = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. + FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // Advanced types + FLOAT16 = 10; + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + // Future extensions go here. + + // Special data type, real type information is stored in ValueInfoProto. + // If data_type is SPECIAL, raw_data should be used. + SPECIAL = 51; + } + + // The shape of the tensor. + repeated int64 dims = 1; + repeated int64 strides = 14; + + // The data type of the tensor. + optional DataType data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + optional int64 chuck_num = 51; + optional int64 chuck_id = 52; + } + // Used as offset in the external shared data. + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and float16 values + // float16 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, INT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. 
+ optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. + // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // For double + // Complex64 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; + + // External data by file name + optional string external_data = 13; + + // If two tensors represent the same weights/content, use alias. + // Must exist a TensorProto named alias in the initializer list. + // To avoid the duplicate tensor in attribute, such as value in Constant node. + // This is useful, if everything is stored just in the proto. + optional string alias = 16; + + // Additional annotations. + repeated AttributeProto annotations = 17; + + // Device info + optional caffe2.DeviceOption device_option = 51; + + // For PyTorch serialized tensor. + optional int64 require_gradient = 52; + optional int64 is_buffer = 53; +} + +// Defines a tensor shape. A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + // To represent a scalar, using no dim to represent 0-d tensor. + repeated Dimension dim = 1; + + repeated Dimension stride = 51; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST be present for this version of the IR. 
+ optional TensorProto.DataType elem_type = 1; + optional TensorShapeProto shape = 2; + } + + // Sequence type: List, Tuple + message Sequence { + // elem_type and elem_type_list cannot appear together. + // If all the element types are the same, we use elem_type, + // otherwise, we specify the type of each element in elem_type_list. + optional TypeProto elem_type = 1; + repeated TypeProto elem_type_list = 51; + enum SequenceType { + UNDEFINED = 0; + LIST = 1; + TUPLE = 2; + } + optional SequenceType sequence_type = 52; + } + + // Map, (not necessary at this moment) + message Map { + optional TensorProto.DataType key_type = 1; + optional TypeProto value_type = 2; + } + + // Special type of blobs, based on the type_name, we can choose the right + // serializer and deserialzier. + message SpecialBlob { + optional string type_name = 1; + } + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + Sequence sequence_type = 4; + Map map_type = 5; + SpecialBlob special_type = 51; + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. + optional int64 version = 2; +} diff --git a/caffe2/utils/proto_wrap.cc b/caffe2/utils/proto_wrap.cc index b573968d9095ed..eb06524cae8417 100644 --- a/caffe2/utils/proto_wrap.cc +++ b/caffe2/utils/proto_wrap.cc @@ -29,3 +29,18 @@ void ShutdownProtobufLibrary() { } } // namespace caffe2 + +namespace torch { + +// Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function +// used to avoid duplicated global variable in the case when protobuf +// is built with hidden visibility. +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { + return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); +} + +void ShutdownProtobufLibrary() { + ::google::protobuf::ShutdownProtobufLibrary(); +} + +} // namespace torch diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 9adc2a2be8347a..ed12b3b90e5480 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -179,7 +179,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) # If we remove all reference to these pb.h files from external # libraries and binaries this rewrite can be removed. 
- COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) From d09041bd81f4264b82f3cb6c1b05e486b234efcc Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Tue, 11 Sep 2018 10:59:03 -0700 Subject: [PATCH 039/237] Add an option to statically link cuda (#10596) Summary: Need to link CUDA statically for benchmarking purpose. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10596 Reviewed By: llyfacebook Differential Revision: D9370738 Pulled By: sf-wind fbshipit-source-id: 4464d62473e95fe8db65b0bd3b301f262bf269bf --- scripts/build_windows.bat | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/build_windows.bat b/scripts/build_windows.bat index ebb76ccd5a3e30..281d20a8d6926a 100644 --- a/scripts/build_windows.bat +++ b/scripts/build_windows.bat @@ -10,6 +10,10 @@ setlocal SET ORIGINAL_DIR=%cd% SET CAFFE2_ROOT=%~dp0%.. +if NOT DEFINED CAFFE2_STATIC_LINK_CUDA ( + set CAFFE2_STATIC_LINK_CUDA=OFF +) + if NOT DEFINED CMAKE_BUILD_TYPE ( set CMAKE_BUILD_TYPE=Release ) @@ -57,6 +61,7 @@ cmake .. ^ -G%CMAKE_GENERATOR% ^ -DBUILD_TEST=OFF ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ + -DCAFFE2_STATIC_LINK_CUDA=%CAFFE2_STATIC_LINK_CUDA% ^ -DUSE_CUDA=%USE_CUDA% ^ -DTORCH_CUDA_ARCH_LIST=5.0 ^ -DUSE_NNPACK=OFF ^ From a566bc2f1140f9a51343656f8b1b47a6b72a5fc9 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 11 Sep 2018 11:20:07 -0700 Subject: [PATCH 040/237] Disable all CircleCI jobs (#11523) Summary: Disable all CircleCI jobs until we are ready to move forward with them. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11523 Differential Revision: D9774462 Pulled By: yf225 fbshipit-source-id: c5724e71eb68bac4df958b4f7bcc380050668b3c --- .circleci/config.yml | 144 +++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bcd2a5527b9835..2390d6ac5c0a87 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -845,36 +845,36 @@ workflows: version: 2 build: jobs: - - pytorch_linux_trusty_py2_7_9_build_test - - pytorch_linux_trusty_py2_7_build_test - - pytorch_linux_trusty_py3_5_build_test - - pytorch_linux_trusty_py3_6_gcc4_8_build_test - - pytorch_linux_trusty_py3_6_gcc5_4_build_test - - pytorch_linux_trusty_py3_6_gcc7_build_test - - pytorch_linux_trusty_pynightly_build_test - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_py3_clang5_asan_test: - requires: - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: - requires: - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_trusty_py2_7_9_build_test + # - pytorch_linux_trusty_py2_7_build_test + # - pytorch_linux_trusty_py3_5_build_test + # - pytorch_linux_trusty_py3_6_gcc4_8_build_test + # - pytorch_linux_trusty_py3_6_gcc5_4_build_test + # - pytorch_linux_trusty_py3_6_gcc7_build_test + # - pytorch_linux_trusty_pynightly_build_test + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_py3_clang5_asan_test: + # requires: + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: + # requires: + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_py3_test: @@ -882,48 +882,48 @@ workflows: # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: - requires: - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - 
caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_test: - requires: - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_test: - requires: - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_test: - requires: - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_test: - requires: - - caffe2_conda2_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_9_ubuntu14_04_build - - caffe2_py2_clang3_8_ubuntu16_04_build - - caffe2_py2_clang3_9_ubuntu16_04_build - - caffe2_py2_gcc6_ubuntu16_04_build - - caffe2_py2_gcc7_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_android_ubuntu16_04_build - - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_centos7_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_test: + # requires: + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_test: + # requires: + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_test: + # requires: + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_test: + # requires: + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_9_ubuntu14_04_build + # - caffe2_py2_clang3_8_ubuntu16_04_build + # - caffe2_py2_clang3_9_ubuntu16_04_build + # - caffe2_py2_gcc6_ubuntu16_04_build + # - caffe2_py2_gcc7_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_android_ubuntu16_04_build + # - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_centos7_build # - caffe2_py2_ios_macos10_13_build # - caffe2_py2_system_macos10_13_build From 781737f84c5f6e067e8e02648c8bfc3a2df2dbaa Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 12:07:01 -0700 Subject: [PATCH 041/237] Remove time prefix from rsync (#11525) Summary: This fails with zsh saying "time: command not found". 
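Most likely the root cause is that `time` is a shell keyword in bash/zsh, so it is only recognized when it appears literally at the start of a command. When it comes out of a variable expansion such as `$SYNC_COMMAND`, the shell instead looks up an external `time` binary, and zsh errors out if none is installed. A sketch of the failure mode (whether it actually fails depends on whether something like /usr/bin/time exists on the machine):

```sh
SYNC_COMMAND="time rsync -lptgoD"
$SYNC_COMMAND src/ dst/       # keyword not recognized after expansion -> "time: command not found"
time rsync -lptgoD src/ dst/  # fine: `time` is in keyword position here
```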
cc soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11525 Differential Revision: D9772522 Pulled By: apaszke fbshipit-source-id: b80d108fa6b174d68ada08a9fdbf7260ee37e08f --- tools/build_pytorch_libs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 3134bcafa6f968..579cde711eae43 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -12,7 +12,7 @@ set -ex SYNC_COMMAND="cp" if [ -x "$(command -v rsync)" ]; then - SYNC_COMMAND="time rsync -lptgoD" + SYNC_COMMAND="rsync -lptgoD" fi # Options for building only a subset of the libraries From fbc17321fdd96ce4dcc275ff61264d04f2fbd6e2 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 11 Sep 2018 12:24:52 -0700 Subject: [PATCH 042/237] Update pybind11 to fix Python 3.7 support for script (#11473) Summary: Fixes https://github.com/pytorch/pytorch/issues/11419 In particular pulling in https://github.com/pybind/pybind11/pull/1454 as well as pending bugfix in https://github.com/pybind/pybind11/pull/1517 (documenting in comment) Pull Request resolved: https://github.com/pytorch/pytorch/pull/11473 Differential Revision: D9776003 Pulled By: jamesr66a fbshipit-source-id: a225dcfb66c06bcae98fd2508d9e690c24be551a --- third_party/pybind11 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/pybind11 b/third_party/pybind11 index add56ccdcac23a..5c8746ff135abb 160000 --- a/third_party/pybind11 +++ b/third_party/pybind11 @@ -1 +1 @@ -Subproject commit add56ccdcac23a6c522a2c1174a866e293c61dab +Subproject commit 5c8746ff135abb390bf95944be593e895a586a50 From 5952acc041db96b99dc0d463a28d74da9d3b28e5 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 11 Sep 2018 12:29:47 -0700 Subject: [PATCH 043/237] Add "merge to master" step before build in CircleCI (#11443) Summary: This PR adds the "merge to master" step before the build step in CircleCI, so that all PR commits are built against master instead of against the PR's branch. Note that all PRs still need to rebase to master to pick up this new config, so it won't apply to old PR branches retroactively. To check in CI: make sure it's performing the git merge to master appropriately in "Merge Onto Master" step. 
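For anyone who wants to reproduce the new step locally, it boils down to roughly the following (using the same CircleCI variables the config uses; details such as fetch depth are omitted):

```sh
git fetch origin +refs/heads/master:refs/remotes/origin/master
git checkout -f "$CIRCLE_SHA1"             # the PR commit under test
git merge --no-edit --no-ff origin/master  # build and test the merge result
```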
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11443 Differential Revision: D9775628 Pulled By: yf225 fbshipit-source-id: 8083db6b098d234a44ae4481f40a486e9906f6f8 --- .circleci/config.yml | 89 +++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 22 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2390d6ac5c0a87..476e9867cf2a4a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,11 +5,37 @@ docker_config_defaults: &docker_config_defaults aws_access_key_id: AKIAJ2J6FIG5OSZTQ3IA aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} +# NOTE: We only perform the merge in build step and not in test step, because +# all source files will be shared from build to test +merge_pull_request_onto_master: &merge_pull_request_onto_master + name: Merge Onto Master + no_output_timeout: "10h" + command: | + if [[ "${CIRCLE_BRANCH}" != "master" ]]; then + git config --global user.email "circleci.ossci@gmail.com" + git config --global user.name "CircleCI" + + git config remote.origin.url https://github.com/pytorch/pytorch.git + git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master + git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=50 --quiet + + export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master` + echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET} + export GIT_COMMIT=${CIRCLE_SHA1} + echo "GIT_COMMIT: " ${GIT_COMMIT} + + git checkout -f ${GIT_COMMIT} + git reset --hard ${GIT_COMMIT} + git merge --no-edit --no-ff ${GIT_MERGE_TARGET} + fi + pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults resource_class: large working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -22,7 +48,7 @@ pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh .jenkins/pytorch/test.sh @@ -31,6 +57,8 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -46,16 +74,18 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init || git submodule update --init || git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh - mkdir -p pytorch-ci-env/ - cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch pytorch-ci-env/torch - cp -r build/bin pytorch-ci-env/cpp_test_bin + export PYTORCH_CI_ENV_DIR=/var/lib/jenkins/pytorch-ci-env + mkdir -p ${PYTORCH_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${PYTORCH_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch ${PYTORCH_CI_ENV_DIR}/torch + cp -r build/bin ${PYTORCH_CI_ENV_DIR}/cpp_test_bin if [ -d 
"../cpp-build" ]; then - cp -r ../cpp-build pytorch-ci-env/cpp-build + cp -r ../cpp-build ${PYTORCH_CI_ENV_DIR}/cpp-build fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/pytorch-ci-env + root: /var/lib/jenkins/pytorch-ci-env paths: - "*" @@ -63,7 +93,6 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -107,12 +136,16 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step + echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env + mkdir -p /home/circleci/project/build cp -r /opt/workspace/cpp_test_bin /home/circleci/project/build/bin docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" @@ -122,9 +155,9 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults docker cp "/opt/workspace/cpp-build" "$id:/var/lib/jenkins/cpp-build" fi if [ -n "${MULTI_GPU}" ]; then - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash else - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash fi caffe2_linux_build_defaults: &caffe2_linux_build_defaults @@ -132,6 +165,8 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -152,7 +187,7 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults cd third_party/onnx && git fetch --tags --progress origin +refs/pull/*:refs/remotes/origin/pr/* && cd - # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Ensure jenkins can write to the ccache root dir. 
sudo chown jenkins:jenkins "${HOME}/.ccache" @@ -189,16 +224,18 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults fi # Copy all necessary binaries to shared workspace - mkdir -p caffe2-ci-env - cp -r third_party/onnx caffe2-ci-env/onnx + export CAFFE2_CI_ENV_DIR=/var/lib/jenkins/caffe2-ci-env + mkdir -p ${CAFFE2_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${CAFFE2_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r third_party/onnx ${CAFFE2_CI_ENV_DIR}/onnx if [ -d "/usr/local/caffe2" ]; then - cp -r /usr/local/caffe2 caffe2-ci-env/caffe2 + cp -r /usr/local/caffe2 ${CAFFE2_CI_ENV_DIR}/caffe2 fi if [ -d "/opt/conda" ]; then - cp -r /opt/conda caffe2-ci-env/conda_env + cp -r /opt/conda ${CAFFE2_CI_ENV_DIR}/conda_env fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/caffe2-ci-env + root: /var/lib/jenkins/caffe2-ci-env paths: - "*" @@ -206,7 +243,6 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -250,6 +286,7 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache @@ -315,6 +352,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -326,7 +365,7 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults brew install cmake # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Reinitialize path (see man page for path_helper(8)) eval `/usr/libexec/path_helper -s` @@ -525,6 +564,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -544,9 +585,12 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh .jenkins/pytorch/macos-build.sh + + # TODO: need to share source files from build to test, when macOS builds are enabled + - persist_to_workspace: root: /Users/distiller/pytorch-ci-env paths: @@ -556,7 +600,6 @@ jobs: macos: xcode: "9.0" steps: - - checkout - run: name: Prepare workspace command: | @@ -570,9 +613,9 @@ jobs: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3 no_output_timeout: "10h" command: | + # TODO: need to share source files from build to test, when macOS builds are enabled set -ex export IN_CIRCLECI=1 - git submodule update --init chmod a+x .jenkins/pytorch/macos-test.sh .jenkins/pytorch/macos-test.sh @@ -581,6 +624,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -616,7 +661,7 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh 
.jenkins/pytorch/macos-build.sh From c56a7cfc37aae8b5e1515da37b314ed03bff8641 Mon Sep 17 00:00:00 2001 From: vishwakftw Date: Tue, 11 Sep 2018 12:40:56 -0700 Subject: [PATCH 044/237] More use of AT_CHECK and AT_ERROR (#11457) Summary: Considering these increase the size of the message stack, I didn't touch the code outside `ATen/native` Differential Revision: D9754283 Pulled By: soumith fbshipit-source-id: 04198ec4fd0c4abae09eeba92c493a783408537a --- aten/src/ATen/native/Convolution.cpp | 147 +++++++----------- aten/src/ATen/native/Indexing.cpp | 52 +++---- aten/src/ATen/native/Normalization.cpp | 83 ++++------ aten/src/ATen/native/RoiPooling.cpp | 2 +- aten/src/ATen/native/SpectralOpsUtils.h | 2 +- aten/src/ATen/native/TensorShape.cpp | 108 +++++-------- aten/src/ATen/native/cuda/CuFFTPlanCache.h | 30 ++-- aten/src/ATen/native/cuda/SpectralOps.cu | 6 +- .../ATen/native/cudnn/AffineGridGenerator.cpp | 4 +- aten/src/ATen/native/cudnn/BatchNorm.cpp | 4 +- aten/src/ATen/native/cudnn/Conv.cpp | 38 ++--- aten/src/ATen/native/cudnn/GridSampler.cpp | 4 +- aten/src/ATen/native/cudnn/LossCTC.cpp | 2 +- aten/src/ATen/native/cudnn/RNN.cpp | 147 +++++++----------- .../ATen/native/miopen/BatchNorm_miopen.cpp | 4 +- aten/src/ATen/native/miopen/Conv_miopen.cpp | 36 ++--- aten/src/ATen/native/mkl/SpectralOps.cpp | 20 +-- aten/src/ATen/native/mkldnn/Conv.cpp | 8 +- 18 files changed, 274 insertions(+), 423 deletions(-) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index f359d67c72e786..b3fcff1b2ccd29 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -155,70 +155,51 @@ static void check_input_shape_forward(const at::Tensor& input, int64_t k = input.ndimension(); int64_t weight_dim = weight.ndimension(); - if (weight_dim != k) { - std::stringstream ss; - ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim - << "-dimensional weight " << weight.sizes() << ", but got input of size " - << input.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) < groups) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be at least " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) % groups != 0) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be divisible by " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(weight_dim == k, + "Expected ", weight_dim, "-dimensional input for ", weight_dim, + "-dimensional weight ", weight.sizes(), ", but got input of size ", + input.sizes(), " instead"); + AT_CHECK(weight.size(0) >= groups, + "Given groups=", groups, ", expected weight to be at least ", groups, + " at dimension 0, but got weight of size ", weight.sizes(), " instead"); + AT_CHECK(weight.size(0) % groups == 0, + "Given groups=", groups, ", expected weight to be divisible by ", + groups, " at dimension 0, but got weight of size ", weight.sizes(), + " instead"); if (!transposed) { - if (input.size(1) != (weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given groups=" << groups << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << (weight.size(1) * groups) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); 
- } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(0))) { - std::stringstream ss; - ss << "Given weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(0) << " elements" - << ", but got bias of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == (weight.size(1) * groups), + "Given groups=", groups, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", + (weight.size(1) * groups), " channels, but got ", input.size(1), + " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(0)), + "Given weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(0), " elements", + ", but got bias of size ", bias.sizes(), " instead"); } else { // transposed - if (input.size(1) != weight.size(0)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << weight.size(0) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(1) * groups << " elements" - << ", but got bias of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == weight.size(0), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", weight.size(0), + " channels, but got ", input.size(1), " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(1) * groups), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(1) * groups, " elements", + ", but got bias of size ", bias.sizes(), " instead"); } } static auto view4d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 3) throw std::runtime_error("expected 3D tensor"); + AT_CHECK(tensor.ndimension() == 3, + "expected 3D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.unsqueeze(2); } static auto view3d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 4) throw std::runtime_error("expected 4D tensor"); + AT_CHECK(tensor.ndimension() == 4, + "expected 4D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.squeeze(2); } @@ -293,7 +274,7 @@ static inline std::vector convolution_expand_param_if_needed( ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " << "dimensions, but got " << param_name << "=" << list_param; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } else { return list_param.vec(); } @@ -311,9 +292,7 @@ at::Tensor _convolution( auto k = weight.ndimension(); int64_t dim = k - 2; - if (dim <= 0) { - throw std::runtime_error("weight should have at least two dimensions"); - } + AT_CHECK(dim > 0, "weight should at least have at least two dimensions"); ConvParams params; params.stride = convolution_expand_param_if_needed(stride_, "stride", dim); @@ -326,8 +305,8 @@ 
at::Tensor _convolution( params.deterministic = deterministic; params.cudnn_enabled = cudnn_enabled; - if (params.is_padding_neg()) throw std::runtime_error("negative padding is not supported"); - if (params.is_output_padding_neg()) throw std::runtime_error("negative output_padding is not supported"); + AT_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + AT_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); check_input_shape_forward(input, weight, bias, params.groups, params.transposed); @@ -349,16 +328,12 @@ at::Tensor _convolution( output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); } else if (params.use_cudnn(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::cudnn_convolution_transpose( @@ -370,16 +345,12 @@ at::Tensor _convolution( params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); } } else if (params.use_miopen(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::miopen_convolution_transpose( @@ -392,16 +363,12 @@ at::Tensor _convolution( } } else if (params.use_mkldnn(input)) { #if AT_MKLDNN_ENABLED() - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", 
bias.type().toString(), + ") should be the same"); output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups); #endif @@ -487,7 +454,7 @@ at::Tensor _convolution_nogroup( } } - throw std::runtime_error("unsupported ConvNd parameters"); + AT_ERROR("unsupported ConvNd parameters"); } static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 288fa283abe660..5566fd397320aa 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -39,7 +39,7 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, ss << "The shape of the mask " << mask.sizes() << " at index " << maskIdx; ss << " does not match the shape of the indexed tensor " << self.sizes(); ss << " at index " << idx; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } static void checkIndexTensorTypes(TensorList indices) { @@ -47,9 +47,8 @@ static void checkIndexTensorTypes(TensorList indices) { if (tensor.defined()) { auto& type = tensor.type(); auto scalarType = type.scalarType(); - if (scalarType != kLong && scalarType != kByte) { - throw std::runtime_error("tensors used as indices must be long or byte tensors"); - } + AT_CHECK(scalarType == kLong || scalarType == kByte, + "tensors used as indices must be long or byte tensors"); } } } @@ -146,12 +145,10 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) if (index.numel() != 0) { auto max_idx = index.max().toCLong(); auto min_idx = index.min().toCLong(); - if (max_idx >= dim_size) { - AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } - if (min_idx < -dim_size) { - AT_ERROR("index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } + AT_CHECK(max_idx < dim_size, + "index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + AT_CHECK(min_idx >= -dim_size, + "index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); } return index.remainder(dim_size); } @@ -230,9 +227,8 @@ static std::tuple makeLinearIndex(Tensor self, TensorList orig) } Tensor index(const Tensor & self, TensorList indices) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -240,9 +236,8 @@ Tensor index(const Tensor & self, TensorList indices) { } Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -252,9 +247,8 @@ Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) } Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", 
indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -265,18 +259,14 @@ Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { dim = maybe_wrap_dim(dim, self.dim()); - if (index.dim() >= 2) { - AT_ERROR( - "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); - } + AT_CHECK(index.dim() < 2, + "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + int64_t numIndices = index.numel(); - if (source.dim() == 0 && numIndices != 1) { - AT_ERROR( - "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); - } - if (index.type().scalarType() != ScalarType::Long) { - AT_ERROR("index_copy_(): Expected LongTensor for index"); - } + AT_CHECK(source.dim() != 0 || numIndices == 1, + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + AT_CHECK(index.type().scalarType() == ScalarType::Long, + "index_copy_(): Expected LongTensor for index"); // Check that source and destination slices have the same size auto selfSlicedSizes = self.sizes().vec(); @@ -294,7 +284,7 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << dim; ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } if (source.dim() > 0 && numIndices != source.size(dim)) { AT_ERROR( diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 24d8a41fb50271..750e42568715d7 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -11,11 +11,8 @@ namespace at { namespace native { namespace { void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ - if (actual != expected){ - std::stringstream ss; - ss << arg_name << " should contain " << expected << " elements not " << actual ; - throw std::runtime_error(ss.str()); - } + AT_CHECK(actual == expected, + arg_name, " should contain ", expected, " elements not ", actual); } } @@ -28,12 +25,12 @@ Tensor batch_norm( if (running_mean.defined()) { check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); } else if (!training) { - throw std::runtime_error("running_mean must be defined in evaluation mode"); + AT_ERROR("running_mean must be defined in evaluation mode"); } if (running_var.defined()) { check_dims_match_num_input_features("running_var", num_features, running_var.numel()); } else if (!training) { - throw std::runtime_error("running_var must be defined in evaluation mode"); + AT_ERROR("running_var must be defined in evaluation mode"); } if (weight.defined()) { check_dims_match_num_input_features("weight", num_features, weight.numel()); @@ -89,29 +86,19 @@ Tensor layer_norm(const Tensor& input, IntList normalized_shape, int64_t normalized_ndim = normalized_shape.size(); - if (normalized_ndim < 1) { - std::stringstream ss; - ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " - << "containing at least one 
element, but got normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape=", + normalized_shape); - if (weight.defined() && !weight.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected weight to be of same shape as normalized_shape, but got " - << "weight of shape " << weight.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && !bias.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected bias to be of same shape as normalized_shape, but got " - << "bias of shape " << bias.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", weight.sizes(), " and normalized_shape=", + normalized_shape); + AT_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", bias.sizes(), " and normalized_shape=", + normalized_shape); auto input_shape = input.sizes(); auto input_ndim = input.dim(); @@ -125,7 +112,7 @@ Tensor layer_norm(const Tensor& input, IntList normalized_shape, ss << ", " << size; } ss << "], but got input of size" << input_shape; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } int64_t n = 1; @@ -159,29 +146,19 @@ Tensor group_norm(const Tensor& input, int64_t num_groups, int64_t b = input.size(0); int64_t c = input.size(1); - if (c % num_groups != 0) { - std::stringstream ss; - ss << "Expected number of channels in input to be divisible by " - << "num_groups, but got input of shape " << input.sizes() << " and " - << "num_groups=" << num_groups; - throw std::runtime_error(ss.str()); - } - - if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) { - std::stringstream ss; - ss << "Expected weight to be a vector of size equal to the number of " - << "channels in input, but got weight of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) { - std::stringstream ss; - ss << "Expected bias to be a vector of size equal to the number of " - << "channels in input, but got bias of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } + AT_CHECK(c % num_groups == 0, + "Expected number of channels in input to be divisible by ", + "num_groups, but got input of shape ", input.sizes(), " and " + "num_groups=", num_groups); + + AT_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), + "Expected weight to be a vector of size equal to the number of ", + "channels in input, but got weight of shape ", weight.sizes(), + " and input of shape ", input.sizes()); + AT_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), + "Expected bias to be a vector of size equal to the number of ", + "channels in input, but got bias of shape ", weight.sizes(), + " and input of shape ", input.sizes()); // Apply group norm auto input_reshaped = input.contiguous().view({1, b * num_groups, -1}); diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp index 
5995e43ef1e536..1a089a9f473c17 100644 --- a/aten/src/ATen/native/RoiPooling.cpp +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -134,7 +134,7 @@ Tensor RoiPooling2d_backward_cpu( double spatialScale, const Tensor& gradOutput, const Tensor& argmaxes) { - throw std::runtime_error("not implemented"); + AT_ERROR("not implemented"); } } diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index 7518d1f945a5fd..875c7aa12b68cf 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -51,7 +51,7 @@ inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, std::ostringstream ss; ss << "expected real signal size " << expected_size << " is incompatible " << "with onesided complex frequency size " << complex_size; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 822bea31fee942..c22374d0964e81 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -20,9 +20,8 @@ std::vector broadcast_tensors(TensorList tensors) { static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; - if (t.dim() == 0) { - AT_ERROR("zero-dimensional tensor (at position ", i, ") cannot be concatenated"); - } + AT_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); } } @@ -39,12 +38,11 @@ Tensor cat(TensorList tensors, int64_t dim) { } std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { - if (self.dim() == 0) { - AT_ERROR("chunk expects at least a 1-dimensional tensor"); - } - if (chunks <= 0) { - AT_ERROR("chunk expects `chunks` to be greater than 0, got: ", chunks); - } + AT_CHECK(self.dim() > 0, + "chunk expects at least a 1-dimensional tensor"); + AT_CHECK(chunks > 0, + "chunk expects `chunks` to be greater than 0, got: ", chunks); + int64_t split_size = (self.size(dim) + chunks - 1) / chunks; // We need to call split_with_sizes in the case where split_size and dimension size are 0, because @@ -117,14 +115,11 @@ Tensor expand(const Tensor& self, IntList size, bool implicit) { // distinguish between expands inserted by broadcasts and those explicitly // requested by the user, because it is legal to remove implicit expands // from the graph, but not legal to remove the explicit ones. - if (size.size() < (size_t)self.dim()) { - std::ostringstream ss; - ss << "expand(" << self.type() << "{" << self.sizes() << "}, size=" << size - << "): the number of sizes provided (" << size.size() << ") " - << "must be greater or equal to the number of dimensions in the tensor (" - << self.dim() << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(size.size() >= (size_t)self.dim(), + "expand(", self.type(), "{", self.sizes(), "}, size=", size, + "): the number of sizes provided (", size.size(), ") ", + "must be greater or equal to the number of dimensions in the tensor (", + self.dim(), ")"); std::vector expandedSizes; std::vector expandedStrides; @@ -159,17 +154,15 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. 
start = maybe_wrap_dim(start, cur_size); } - if (length < 0 || start > cur_size - length) { - AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); - } + AT_CHECK(length >= 0 && start <= cur_size - length, + "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); return at::slice(self, dim, start, start + length, 1); } Tensor permute(const Tensor& self, IntList dims) { auto nDims = self.dim(); - if (dims.size() != (size_t)nDims) { - AT_ERROR("number of dims don't match in permute"); - } + AT_CHECK(dims.size() == (size_t)nDims, + "number of dims don't match in permute"); auto oldSizes = self.sizes(); auto oldStrides = self.strides(); std::vector newSizes(nDims); @@ -177,9 +170,8 @@ Tensor permute(const Tensor& self, IntList dims) { std::vector seen(nDims); for (int64_t i = 0; i < nDims; i++) { auto dim = maybe_wrap_dim(dims[i], nDims); - if (seen[dim]) { - AT_ERROR("repeated dim in permute"); - } + AT_CHECK(!seen[dim], + "repeated dim in permute"); seen[dim] = true; newSizes[i] = oldSizes[dim]; newStrides[i] = oldStrides[dim]; @@ -188,9 +180,8 @@ Tensor permute(const Tensor& self, IntList dims) { } Tensor repeat(const Tensor& self, IntList repeats) { - if (repeats.size() < (size_t)self.dim()) { - AT_ERROR("Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); - } + AT_CHECK(repeats.size() >= (size_t)self.dim(), + "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Add new leading dimensions to the tensor if the // number of target dimensions is larger than the @@ -238,12 +229,9 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); auto size = self.size(dim); - if (index < -size || index >= size) { - std::stringstream ss; - ss << "select(): index " << index << " out of range for tensor of size "; - ss << self.sizes() << " at dimension " << dim; - throw std::runtime_error(ss.str()); - } + AT_CHECK(index >= -size && index < size, + "select(): index ", index, " out of range for tensor of size ", + self.sizes(), " at dimension ", dim); if (index < 0) { index += size; } @@ -261,10 +249,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ dim = maybe_wrap_dim(dim, ndim); auto sizes = self.sizes().vec(); auto strides = self.strides().vec(); - if (step <= 0) { - // TODO: support negative strides - throw std::runtime_error("slice step must be positive"); - } + // TODO: support negative strides + AT_CHECK(step > 0, "slice step must be positive"); if (start < 0) { start += sizes[dim]; } @@ -322,22 +308,15 @@ std::vector split_with_sizes(const Tensor& self, IntList split_sizes, in for (i = 0; i < num_splits; ++i) { auto length = split_sizes[i]; - if (length < 0) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes have only non-negative " - << "entries, but got split_sizes=" << split_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(length >= 0, + "split_with_sizes expects split_sizes have only non-negative ", + "entries, but got split_sizes=", split_sizes); splits[i] = self.narrow(dim, start_idx, length); start_idx += length; } - if (start_idx != dim_size) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes to sum exactly to " - << dim_size << " (input tensor's size at dimension " << dim << "), " - << "but got split_sizes=" << split_sizes; - throw 
std::runtime_error(ss.str()); - } + AT_CHECK(start_idx == dim_size, + "split_with_sizes expects split_sizes to sum exactly to ", dim_size, + " (input tensor's size at dimension ", dim, "), ", "but got split_sizes=", split_sizes); return splits; } @@ -350,28 +329,24 @@ static inline std::vector get_stack_inputs(TensorList tensors, int64_t d } Tensor stack(TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat(get_stack_inputs(tensors, dim), dim); } Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat_out(result, get_stack_inputs(tensors, dim), dim); } static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { int64_t nsparseDims = self._sparseDims(); - if (dim0 >= nsparseDims || dim1 >= nsparseDims) { - AT_ERROR( - "sparse transpose: transposed dimensions must be sparse ", - "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); - } + AT_CHECK(dim0 < nsparseDims && dim1 < nsparseDims, + "sparse transpose: transposed dimensions must be sparse ", + "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); if (self._indices().numel() == 0 && self._values().numel() == 0) { auto sizes = self.sizes().vec(); @@ -442,10 +417,9 @@ static void check_t(const Tensor& self, const char *fn) { if (self.is_sparse()) { int64_t sparseDims = self._sparseDims(); int64_t denseDims = self._denseDims(); - if (!(sparseDims == 2 && denseDims == 0)) { - AT_ERROR(fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", - sparseDims, " sparse and ", denseDims, " dense dimensions"); - } + AT_CHECK(sparseDims == 2 && denseDims == 0, + fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", + sparseDims, " sparse and ", denseDims, " dense dimensions"); } else if (self.dim() != 2) { AT_ERROR(fn, " expects a 2D tensor, but self is ", self.dim(), "D"); } diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 37c8f3a364f75a..8715a9ef460ee6 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -112,22 +112,16 @@ class CuFFTConfig { if (input.type().scalarType() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 auto dev_prop = at::cuda::getCurrentDeviceProperties(); - if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { - std::ostringstream ss; - ss << "cuFFT doesn't support signals of half type with compute " - << "capability less than SM_53, but the device containing input half " - << "tensor only has SM_" << dev_prop->major << dev_prop->minor; - throw std::runtime_error(ss.str()); - } + AT_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); for (int64_t i = 0; i < signal_ndim; i++) { auto signal_size = checked_signal_sizes[i]; - if (!is_pow_of_two(signal_size)) { - std::ostringstream ss; - ss << "cuFFT 
doesn't support signals of half type with size at any " - << "dimension that is not a power of two, but got a signal size of " - << checked_signal_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(is_pow_of_two(signal_size), + "cuFFT doesn't support signals of half type with size at any ", + "dimension that is not a power of two, but got a signal size of ", + checked_signal_sizes); } clone_input |= input.stride(signal_ndim) != 1; } @@ -212,7 +206,7 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_R2C; } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -222,13 +216,13 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_D2Z; } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else @@ -249,7 +243,7 @@ class CuFFTConfig { std::ostringstream ss; ss << "cuFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #endif diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index c82f4e7afb87de..38b1dddb496276 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -206,7 +206,7 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecR2C(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -220,13 +220,13 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(), diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index 6856c465e9e8ef..463d4ffea3cf04 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -12,13 +12,13 @@ namespace at { namespace native { Tensor cudnn_affine_grid_generator_forward( const Tensor& theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); } }} diff 
--git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 7cd7466a285035..d54fe256b29152 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -13,7 +13,7 @@ std::tuple cudnn_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("cudnn_batch_norm: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm: ATen not compiled with cuDNN support"); } std::tuple cudnn_batch_norm_backward( @@ -21,7 +21,7 @@ std::tuple cudnn_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index a94916532a3f02..afbd7653aefa67 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -14,61 +14,61 @@ at::Tensor cudnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); + 
AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } }} @@ -194,16 +194,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -211,7 +207,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } @@ -449,7 +445,7 @@ perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { return perfResults[i]; } } - throw std::runtime_error("no deterministic convolution algorithms available in CuDNN"); + AT_ERROR("no deterministic convolution algorithms available in CuDNN"); } else { return perfResults[0]; } diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 1ce92cf7e18d2a..e859344bcc3691 100644 --- 
a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -11,13 +11,13 @@ namespace at { namespace native { Tensor cudnn_grid_sampler_forward( const Tensor& input_t, const Tensor& grid_t) { - throw std::runtime_error("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); } std::tuple cudnn_grid_sampler_backward( const Tensor& input_t, const Tensor& grid_t, const Tensor& grad_output_t) { - throw std::runtime_error("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); } }} diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index 966aa20e0a128d..98c0cb7918f02f 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -14,7 +14,7 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { - throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); + AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); } }} diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index e3b6aaadf0f0b6..876590409c43c4 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -22,7 +22,7 @@ Tensor _cudnn_rnn_flatten_weight( int64_t fn_num_layers, bool batch_first, bool fn_bidirectional ) { - throw std::runtime_error("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); } std::tuple _cudnn_rnn( @@ -34,7 +34,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, const Tensor& fn_dropout_state ) { - throw std::runtime_error("_cudnn_rnn: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } std::tuple> _cudnn_rnn_backward( @@ -47,11 +47,11 @@ std::tuple> _cudnn_rnn_backward( const Tensor& dropout_state, const Tensor& reserve, std::array output_mask ) { - throw std::runtime_error("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); } Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) { - throw std::runtime_error("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); } }} // namespace at::native @@ -123,7 +123,7 @@ namespace { { std::ostringstream oss; oss << "unrecognized cuDNN RNN mode " << fn_mode; - throw std::runtime_error(oss.str()); + AT_ERROR(oss.str()); } } } @@ -131,7 +131,7 @@ namespace { void set_bidirectional(bool fn_bidirectional) { bidirectional = fn_bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } - + void set_algo(cudnnRNNAlgo_t algo){ this->algo = algo; } @@ -570,7 +570,7 @@ namespace { if (prop->major == 7 && rnn.datatype == CUDNN_DATA_HALF && !tensors.is_input_packed()) { if (rnn.num_layers == 1 && rnn.hidden_size <= 1024 && tensors.input_size <=1024 && rnn.num_directions() == 1 && rnn.hidden_size % 128 == 0 && tensors.input_size % 128 == 0){ - //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, + //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, //weed them out if ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8){ if ((tensors.seq_length >=40 && bsize <=128) || @@ -599,9 +599,8 @@ Tensor _cudnn_rnn_flatten_weight( bool fn_bidirectional ) { - if (weight_arr.size() == 0) { - throw std::runtime_error("_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); - } + AT_CHECK(weight_arr.size() > 0, + "_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); auto any_param = weight_arr[0]; @@ -671,9 +670,8 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } // TODO: can batch_first be a wrapper around this function? @@ -685,12 +683,10 @@ std::tuple _cudnn_rnn( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto output = input.type().tensor(output_size); @@ -723,11 +719,8 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); size_t workspace_size; auto x_descs_arr = descs.get_x_descs(); @@ -817,9 +810,8 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -833,12 +825,10 @@ std::tuple _cudnn_rnn_backward_input( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto dy = grad_output.contiguous(); @@ -851,42 +841,25 @@ std::tuple _cudnn_rnn_backward_input( AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx 
for non-LSTM RNN"); auto dcx = cx.defined() ? cx.type().tensor(hidden_size) : Tensor(); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (!output.sizes().equals(output_size)) { - std::ostringstream oss; - oss << "Expected output size " << IntList{output_size} << ", got " << output.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); - throw std::runtime_error(oss.str()); - } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } - if (dhy.defined() && !dhy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_hidden size " << IntList{hidden_size} << ", got " << dhy.sizes(); - throw std::runtime_error(oss.str()); - } - if (dcy.defined() && !dcy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_cell size " << IntList{hidden_size} << ", got " << dcy.sizes(); - throw std::runtime_error(oss.str()); - } - if (!dhy.is_cuda() || !dy.is_cuda() || (dcy.defined() && !dcy.is_cuda())) { - throw std::runtime_error("Gradients aren't CUDA tensors"); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(output.sizes().equals(output_size), + "Expected output size ", IntList{output_size}, ", got ", output.sizes()); + + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); + AT_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), + "Expected d_hidden size ", IntList{hidden_size}, ", got ", dhy.sizes()); + AT_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), + "Expected d_cell size ", IntList{hidden_size}, ", got ", dcy.sizes()); + + AT_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors); fn.rnn.set_algo(algo); @@ -959,9 +932,8 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -973,28 +945,21 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected 
hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); const auto& y = output; diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 997431b7a86170..c9d25780bd65d3 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -14,7 +14,7 @@ std::tuple miopen_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("miopen_batch_norm: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm: ATen not compiled with MIOpen support"); } std::tuple miopen_batch_norm_backward( @@ -22,7 +22,7 @@ std::tuple miopen_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 1ae36edd5c7b76..9aeaad73558617 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -13,61 +13,61 @@ at::Tensor miopen_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); } at::Tensor 
miopen_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } }} @@ -180,16 +180,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, 
expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -197,7 +193,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index c3451824c05113..2c81d69d3b8435 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -12,7 +12,7 @@ Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) { - throw std::runtime_error("fft: ATen not compiled with MKL support"); + AT_ERROR("fft: ATen not compiled with MKL support"); } }} @@ -191,12 +191,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, osize = output_sizes[i]; istride = complex_input ? input.stride(i) >> 1 : input.stride(i); ostride = onumel; - if (isize > MKL_LONG_MAX || osize > MKL_LONG_MAX || ostride > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); if (!need_contiguous && istride > MKL_LONG_MAX) { // If we didn't plan to contiguous-fy but the `istride` exceeds bound, // check if we can stride (equal to `inumel`) get back within bound if @@ -205,12 +201,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, // fine as `inumel` is non-decreasing. 
need_contiguous = true; } - if (need_contiguous && inumel > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); inumel *= isize; onumel *= osize; } @@ -227,7 +219,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, std::ostringstream ss; ss << "MKL FFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } // signal type DFTI_CONFIG_VALUE signal_type; diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index 00f4e8f95b92d4..ddbd6977645e74 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -9,25 +9,25 @@ namespace at { namespace native { at::Tensor mkldnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, IntList padding, IntList stride, IntList dilation, int64_t groups) { - throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); } at::Tensor mkldnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward_weights( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) { - throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); } }} From c1dce21fd5505085ef148aa9d70a312d8d14257f Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Tue, 11 Sep 2018 13:06:47 -0700 Subject: [PATCH 045/237] Cuda TensorAccessor (#11373) Summary: Provide a TensorAccessor-Like interface for CUDA as discussed in #8366. Compared to TensorAccessor - the CUDATensorAccessor copies the sizes and strides while on the host (I didn't implement a host indexing function, though) to enable transfer to the device, on the device, `[]` works like for TensorAccessors, - instantiation is from TensorAccessors in order to allow using `.accessor<..>`. The drawback is that it you cannot use `auto` for the variable declaration, but the alternative would be a cuda-specific `.accessor`-like function, - there is a PtrTraits argument to enable `__restrict__`, Example for the intended use: ``` ... template __global__ void apply_homography_2d_kernel(cuda::CUDATensorAccessor dest_a, cuda::CUDATensorAccessor src_a, cuda::CUDATensorAccessor transform) { ... 
} template Tensor apply_homography_2d_template(Tensor& res, const Tensor& image, const Tensor& transform) { ... cuda::CUDATensorAccessor image_a(image.accessor()); cuda::CUDATensorAccessor res_a(res.accessor()); cuda::CUDATensorAccessor transform_a(transform.accessor()); auto stream = at::cuda::getCurrentCUDAStream(); apply_homography_2d_kernel <<>>(res_a, image_a, transform_a); return res; } ... ``` I could use a hint where to put a test for this (e.g. doing a plain vanilla matrix multiplication with a custom kernel) and comparing with the aten mm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11373 Differential Revision: D9735573 Pulled By: ezyang fbshipit-source-id: 482b218a0d514e19a8b692bbc77c0e37082cfded --- aten/src/ATen/core/TensorAccessor.h | 131 +++++++++++++++--- aten/src/ATen/templates/Tensor.h | 16 +++ aten/src/ATen/test/CMakeLists.txt | 3 +- .../test/cuda_packedtensoraccessor_test.cu | 46 ++++++ 4 files changed, 179 insertions(+), 17 deletions(-) create mode 100644 aten/src/ATen/test/cuda_packedtensoraccessor_test.cu diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index e3a73a3cbea2e5..cbd130ed775d09 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -5,45 +5,144 @@ namespace at { +// The PtrTraits argument to the TensorAccessor/PackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#ifdef __CUDACC__ +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +#ifndef AT_HOSTDEVICE +#ifdef __CUDACC__ +#define AT_HOSTDEVICE __host__ __device__ +#define AT_HOST __host__ +#define AT_DEVICE __device__ +#else +#define AT_HOSTDEVICE +#define AT_HOST +#define AT_DEVICE +#endif +#endif -template +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we restrict ourselves +// to functions and types available there (e.g. IntList isn't). + +// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers. +template class PtrTraits = DefaultPtrTraits> class TensorAccessorBase { public: - TensorAccessorBase(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOSTDEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} - IntList sizes() { + AT_HOST IntList sizes() { return IntList(sizes_,N); } - IntList strides() { + AT_HOST IntList strides() { return IntList(strides_,N); } - int64_t stride(int64_t i) { return strides()[i]; } - int64_t size(int64_t i) { return sizes()[i]; } + AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } protected: - T * data_; + PtrType data_; const int64_t* sizes_; const int64_t* strides_; }; -template -class TensorAccessor : public TensorAccessorBase { +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `PackedTensorAccessor` is used on the host and only +// indexing on the device uses `TensorAccessor`s. 
+template class PtrTraits = DefaultPtrTraits> +class TensorAccessor : public TensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - TensorAccessor operator[](int64_t i) { + AT_HOSTDEVICE TensorAccessor operator[](int64_t i) { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } }; -template -class TensorAccessor : public TensorAccessorBase { +template class PtrTraits> +class TensorAccessor : public TensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - T & operator[](int64_t i) { + typedef typename PtrTraits::PtrType PtrType; + + AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + AT_HOSTDEVICE T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } +}; + + +// PackedTensorAccessorBase and PackedTensorAccessor are used on for CUDA `Tensor`s on the host +// and as +// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) +// in order to transfer them on the device when calling kernels. +// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s. +// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available +// on the device, so those functions are host only. 
+template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : data_(data_) + { + std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); + std::copy(strides_, strides_ + N, std::begin(this->strides_)); + } + AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } +protected: + PtrType data_; + int64_t sizes_[N]; + int64_t strides_[N]; +}; + +template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE TensorAccessor operator[](int64_t i) { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } +}; + +template class PtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE T & operator[](int64_t i) { return this->data_[this->strides_[0]*i]; } }; } + +#undef AT_HOSTDEVICE +#undef AT_HOST +#undef AT_DEVICE diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index b80e354b60b347..50e280be8f2fe9 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -194,6 +194,8 @@ struct AT_API Tensor { AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) #undef TO_C_TYPE + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); @@ -203,6 +205,20 @@ struct AT_API Tensor { template TensorAccessor accessor() && = delete; + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. 
+ template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + Tensor operator-() const; Tensor& operator+=(const Tensor & other); Tensor& operator+=(Scalar other); diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 4fcbeaa137ae78..8103f025988aab 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,7 +27,8 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/apply_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/stream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu - ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu) + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu) if (CUDNN_FOUND) list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_test.cpp) diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu new file mode 100644 index 00000000000000..f1eb5cbbe4b1f3 --- /dev/null +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -0,0 +1,46 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/cuda/CUDAContext.h" + +#include + +using namespace at; + +__global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor resa, + PackedTensorAccessor t1a, + PackedTensorAccessor t2a){ + for (int64_t i = 0; i < resa.size(0); i++) { + float val = 0.0f; + for (int64_t j = 0; j < t1a.size(1); j++) { + val += t1a[i][j] * t2a[j]; + } + resa[i] = val; + } +} + +TEST_CASE( "test PackedTensorAccessor and Tensor.packed_accessor", "[cuda]" ) { + manual_seed(123, at::kCPU); + manual_seed(123, at::kCUDA); + + Tensor t1 = rand({4, 4}, CUDA(kFloat)); + Tensor t2 = rand({4}, CUDA(kFloat)); + Tensor res = empty({4}, CUDA(kFloat)); + + auto t1a = t1.packed_accessor(); + auto t2a = t2.packed_accessor(); + auto resa = res.packed_accessor(); + + auto stream = at::cuda::getCurrentCUDAStream(); + + test_tensor_packed_accessor_kernel<<<1, 1, 0, stream>>>(resa, t1a, t2a); + cudaError_t err = cudaDeviceSynchronize(); + REQUIRE(err == cudaSuccess); + + auto expected = mv(t1, t2); + + REQUIRE(res.allclose(expected)); +} From 4db21a1d8e97dd087293988e46c7d43a7ac59ebe Mon Sep 17 00:00:00 2001 From: Lukasz Wesolowski Date: Tue, 11 Sep 2018 13:18:43 -0700 Subject: [PATCH 046/237] Optimize LengthsTileOp on GPU to run a kernel instead of a sequence of memcopies (#11413) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11413 LengthsTileOp was implemented using a sequence of device memcopies initiated on the CPU. This was very slow. I changed it to use a kernel. TUM benchmark QPS improved from 13k QPS to 20k QPS as a result. 
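In case the operator is unfamiliar: row i of DATA is simply repeated LENGTHS[i] times in the output, and the new implementation precomputes a per-output-row input offset on the host so that a single kernel can gather rows, instead of issuing one device memcopy per repeated row. A minimal CPU reference sketch of those semantics (the helper name and signature below are hypothetical, not the Caffe2 operator itself):

```cpp
// Reference semantics of LengthsTile, assuming row-major DATA of shape
// (N, row_size) and LENGTHS with N repeat counts. Illustrative only.
#include <cstdint>
#include <vector>

std::vector<float> lengths_tile_reference(
    const std::vector<float>& data,      // N * row_size values, row-major
    const std::vector<int32_t>& lengths, // N repeat counts, one per row
    int64_t row_size) {
  std::vector<float> out;
  for (size_t i = 0; i < lengths.size(); ++i) {
    for (int32_t j = 0; j < lengths[i]; ++j) {
      // repeat row i of DATA lengths[i] times in the output
      out.insert(out.end(),
                 data.begin() + i * row_size,
                 data.begin() + (i + 1) * row_size);
    }
  }
  return out;
}
```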
Reviewed By: manojkris, xianjiec Differential Revision: D9724988 fbshipit-source-id: 2f98c697730982734d7c6a26d0b6967310d49900 --- caffe2/operators/lengths_tile_op.cc | 44 +++++++ caffe2/operators/lengths_tile_op.cu | 110 ++++++++++++++++++ caffe2/operators/lengths_tile_op.h | 40 +------ caffe2/operators/lengths_tile_op_gpu.cc | 6 - .../operator_test/lengths_tile_op_test.py | 4 +- 5 files changed, 158 insertions(+), 46 deletions(-) create mode 100644 caffe2/operators/lengths_tile_op.cu delete mode 100644 caffe2/operators/lengths_tile_op_gpu.cc diff --git a/caffe2/operators/lengths_tile_op.cc b/caffe2/operators/lengths_tile_op.cc index e832fe8723a0ce..d5af0a91bd65c0 100644 --- a/caffe2/operators/lengths_tile_op.cc +++ b/caffe2/operators/lengths_tile_op.cc @@ -1,6 +1,50 @@ #include "caffe2/operators/lengths_tile_op.h" namespace caffe2 { + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + // Context::CopyFrom and math::Sum need the same context to avoid race + // conditions + // why? CPUContext is not used in Sum + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, &total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); + auto src = static_cast(data.raw_data()); + auto out = static_cast(output->raw_mutable_data(data.meta())); + + for (TIndex i = 0; i < lengths_size; ++i) { + auto length = lengths_data[i]; + CAFFE_ENFORCE_GE(length, 0); + for (int32_t j = 0; j < length; ++j) { + context_.CopyBytesSameDevice(block_bytesize, src, out); + out += block_bytesize; + } + src += block_bytesize; + } + return true; +} + REGISTER_CPU_OPERATOR(LengthsTile, LengthsTileOp); OPERATOR_SCHEMA(LengthsTile) diff --git a/caffe2/operators/lengths_tile_op.cu b/caffe2/operators/lengths_tile_op.cu new file mode 100644 index 00000000000000..aebb33c1460a56 --- /dev/null +++ b/caffe2/operators/lengths_tile_op.cu @@ -0,0 +1,110 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/lengths_tile_op.h" + +namespace caffe2 { + +template +__global__ void lengthsTileKernel( + int numElements, + int rowSize, + const T* input, + T* output, + const int32_t* inputRowOffsets) { + CUDA_1D_KERNEL_LOOP(i, numElements) { + auto outputRowIndex = i / rowSize; + auto inputBlockOffset = inputRowOffsets[outputRowIndex]; + auto indexInRow = i - outputRowIndex * rowSize; + output[i] = input[inputBlockOffset + indexInRow]; + } +} + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, 
&total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto numElementsPerRow = data.size_from_dim(1); + auto numElements = total_length * numElementsPerRow; + auto numBlocks = CAFFE_GET_BLOCKS(numElements); + + rowMappingHost_.Resize(total_length); + rowMappingDevice_.Resize(total_length); + auto* rowOffsets = rowMappingHost_.mutable_data(); + int32_t outputRow = 0; + for (TIndex i = 0; i < lengths_size; i++) { + auto length = lengths_data[i]; + for (int32_t j = 0; j < length; j++) { + rowOffsets[outputRow++] = i * numElementsPerRow; + } + } + + context_.CopyFromCPU( + total_length, + rowMappingHost_.data(), + rowMappingDevice_.mutable_data()); + context_.FinishDeviceComputation(); + + if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else { + CAFFE_THROW( + "LengthsTile operator only supports 32-bit float, int and int64_t" + " types but input was of type ", + data.meta().name()); + } + return true; +} + +REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); + +} // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op.h b/caffe2/operators/lengths_tile_op.h index c92adcb956d916..c9bd5ef500d996 100644 --- a/caffe2/operators/lengths_tile_op.h +++ b/caffe2/operators/lengths_tile_op.h @@ -13,44 +13,6 @@ class LengthsTileOp : public Operator { USE_SIMPLE_CTOR_DTOR(LengthsTileOp); bool RunOnDevice() override { - auto& data = Input(DATA); - auto& lengths = Input(LENGTHS); - auto* output = Output(0); - - CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); - CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); - CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); - - // Context::CopyFrom and math::Sum need the same context to avoid race - // conditions - // why? 
CPUContext is not used in Sum - lengths_host_.CopyFrom(lengths, &context_); - context_.FinishDeviceComputation(); - auto lengths_size = lengths_host_.size(); - auto* lengths_data = lengths_host_.data(); - - int32_t total_length = 0; - CPUContext cpuContext; - math::Sum( - lengths_size, lengths_data, &total_length, &cpuContext); - - auto shape = data.dims(); - shape[0] = total_length; - output->Resize(shape); - - auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); - auto src = static_cast(data.raw_data()); - auto out = static_cast(output->raw_mutable_data(data.meta())); - - for (TIndex i = 0; i < lengths_size; ++i) { - auto length = lengths_data[i]; - CAFFE_ENFORCE_GE(length, 0); - for (int32_t j = 0; j < length; ++j) { - context_.CopyBytesSameDevice(block_bytesize, src, out); - out += block_bytesize; - } - src += block_bytesize; - } return true; } @@ -58,6 +20,8 @@ class LengthsTileOp : public Operator { private: Tensor lengths_host_{CPU}; + Tensor rowMappingHost_{CPU}; + Tensor rowMappingDevice_{Context::GetDeviceType()}; }; } // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op_gpu.cc b/caffe2/operators/lengths_tile_op_gpu.cc deleted file mode 100644 index 65ed44b735de43..00000000000000 --- a/caffe2/operators/lengths_tile_op_gpu.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/lengths_tile_op.h" - -namespace caffe2 { -REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); -} // namespace caffe2 diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index d37904c08ec08b..42ef87ac364f70 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -15,7 +15,7 @@ class TestLengthsTileOp(hu.HypothesisTestCase): @given( inputs=st.integers(min_value=1, max_value=20).flatmap( lambda size: st.tuples( - hu.arrays([size]), + hu.arrays([size], dtype=np.float32), hu.arrays([size], dtype=np.int32, elements=st.integers(min_value=0, max_value=20)), ) @@ -32,7 +32,7 @@ def lengths_tile_op(data, lengths): op = core.CreateOperator( "LengthsTile", ["data", "lengths"], - ["output"] + ["output"], ) self.assertReferenceChecks( From 17776db2ee8cb9248233c22f33810345859352b8 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Tue, 11 Sep 2018 13:29:34 -0700 Subject: [PATCH 047/237] Add gtest dependency on aten tests. (#11429) Summary: ezyang delivering my promise to you :) Basically, now aten tests can use gtest as part of our test harness unification effort. I also converted one test (atest.cpp) to show how one can do this. 
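The conversion itself is a mostly mechanical macro mapping; a small self-contained sketch of the pattern applied to atest.cpp (illustrative example only, not PyTorch code; link against gtest and gtest_main):

```cpp
// Catch2 -> gtest translation pattern, shown on hypothetical helpers.
#include <stdexcept>
#include "gtest/gtest.h"

static int answer() { return 42; }
static void boom() { throw std::runtime_error("boom"); }

// was: TEST_CASE("atest", "[]")
TEST(atest, basics) {
  EXPECT_EQ(answer(), 42);                       // was: REQUIRE(answer() == 42);
  EXPECT_FLOAT_EQ(0.5f, 1.0f / 2.0f);            // was: REQUIRE(0.5f == Approx(1.0f / 2.0f));
  ASSERT_THROW(boom(), std::exception);          // was: REQUIRE_THROWS(boom());
}
```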
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11429 Reviewed By: ezyang Differential Revision: D9762934 Pulled By: Yangqing fbshipit-source-id: 68ec3a748403c6bd88399b1e756200985a4e07e3 --- .jenkins/caffe2/test.sh | 15 ++++++------ aten/src/ATen/test/atest.cpp | 46 ++++++++++++++++++------------------ caffe2/CMakeLists.txt | 31 ++---------------------- 3 files changed, 33 insertions(+), 59 deletions(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index b0f9c41382601f..6ca0850f940968 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -62,19 +62,20 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do */mkl_utils_test|*/aten/integer_divider_test) continue ;; - */aten/*) - # ATen uses test framework Catch2 - # NB: We do NOT use the xml test reporter, because - # Catch doesn't support multiple reporters + *) + # Currently, we use a mixture of gtest (caffe2) and Catch2 (ATen). While + # planning to migrate to gtest as the common PyTorch c++ test suite, we + # currently do NOT use the xml test reporter, because Catch doesn't + # support multiple reporters # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 # which means that enabling XML output means you lose useful stdout # output for Jenkins. It's more important to have useful console # output than it is to have XML output for Jenkins. + # Note: in the future, if we want to use xml test reporter once we switch + # to all gtest, one can simply do: + # "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" "$test" ;; - *) - "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" - ;; esac done diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index dd37fa86af3a36..8dffa3d7c02c75 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -19,26 +18,26 @@ void trace() { trace += foo_a[i][i]; } - REQUIRE(foo.trace().toCFloat() == Approx(trace)); + EXPECT_FLOAT_EQ(foo.trace().toCFloat(), trace); } -TEST_CASE( "atest", "[]" ) { - +// TEST_CASE( "atest", "[]" ) { +TEST(atest, atest) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); auto foo = rand({12,6}); - REQUIRE(foo.data() == foo.toFloatData()); + EXPECT_EQ(foo.data(), foo.toFloatData()); - REQUIRE(foo.size(0) == 12); - REQUIRE(foo.size(1) == 6); + EXPECT_EQ(foo.size(0), 12); + EXPECT_EQ(foo.size(1), 6); foo = foo+foo*3; foo -= 4; Scalar a = 4; float b = a.to(); - REQUIRE(b == 4); + EXPECT_EQ(b, 4); foo = (foo*foo) == (foo.pow(3)); foo = 2 + (foo+1); @@ -51,7 +50,7 @@ TEST_CASE( "atest", "[]" ) { } } - REQUIRE(foo.equal(4 * ones({12, 6}, kByte))); + EXPECT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); trace(); @@ -61,17 +60,18 @@ TEST_CASE( "atest", "[]" ) { auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); auto f_a = f.accessor(); - REQUIRE(f_a[0][0][0] == 1.0); - REQUIRE(f_a[0][1][1] == 5.0); + EXPECT_EQ(f_a[0][0][0], 1.0); + EXPECT_EQ(f_a[0][1][1], 5.0); - REQUIRE(f.strides()[0] == 6); - REQUIRE(f.strides()[1] == 3); - REQUIRE(f.strides()[2] == 1); - REQUIRE(f.sizes()[0] == 1); - REQUIRE(f.sizes()[1] == 2); - REQUIRE(f.sizes()[2] == 3); + EXPECT_EQ(f.strides()[0], 6); + EXPECT_EQ(f.strides()[1], 3); + EXPECT_EQ(f.strides()[2], 1); + EXPECT_EQ(f.sizes()[0], 1); + EXPECT_EQ(f.sizes()[1], 2); + EXPECT_EQ(f.sizes()[2], 3); - REQUIRE_THROWS(f.resize_({3,4,5})); + // TODO(ezyang): maybe do 
a more precise exception type. + ASSERT_THROW(f.resize_({3,4,5}), std::exception); { int isgone = 0; { @@ -79,7 +79,7 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } { int isgone = 0; @@ -90,9 +90,9 @@ TEST_CASE( "atest", "[]" ) { }); a_view = f2.view({3,2,1}); } - REQUIRE(isgone == 0); + EXPECT_EQ(isgone, 0); a_view.reset(); - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } if(at::hasCUDA()) { @@ -103,6 +103,6 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone==1); + EXPECT_EQ(isgone, 1); } } diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 8672f836d89c69..99aca34a860d79 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -32,9 +32,8 @@ if (NOT BUILD_ATEN_MOBILE) # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) - # ATen tests use catch instead of gtest so keep separate for now - # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) - # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) @@ -428,32 +427,6 @@ if (BUILD_TEST) endforeach() endif() - if (NOT USE_ROCM) - set(__aten_test_dir "test/aten") - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() - endif() - endif() endif() if (BUILD_PYTHON) From 289a8c9b7def723b64b77dc32786252a9d35865b Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 11 Sep 2018 15:01:48 -0700 Subject: [PATCH 048/237] Allow train/eval, and non-Tensor arguments to python functions (#11505) Summary: This whitelists train/eval functions in script modules, and tests that nested nn.Modules still work. This also changes the code for calling python functions from script to allow non-tensor inputs/outputs. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11505 Differential Revision: D9765466 Pulled By: zdevito fbshipit-source-id: 1177bff931324422b69e18fa0bbaa82e3c98ec69 --- setup.py | 9 ++ test/test_jit.py | 94 ++++++++++++++++++++- torch/csrc/autograd/python_function.cpp | 29 +++++-- torch/csrc/jit/ir.cpp | 4 +- torch/csrc/jit/ir.h | 4 +- torch/csrc/jit/passes/onnx.cpp | 4 +- torch/csrc/jit/pybind_utils.h | 105 ++++++++++++------------ torch/csrc/jit/python_interpreter.cpp | 43 ++-------- torch/csrc/jit/python_ir.cpp | 23 ------ torch/csrc/jit/script/compiler.cpp | 1 - torch/csrc/jit/script/init.cpp | 37 +++------ torch/jit/__init__.py | 2 +- torch/jit/annotations.py | 18 +--- 13 files changed, 205 insertions(+), 168 deletions(-) diff --git a/setup.py b/setup.py index f99a2bf945c162..1ed87cc7819cf7 100644 --- a/setup.py +++ b/setup.py @@ -532,6 +532,14 @@ def load(filename): for f in ninja_files + cmake_files for entry in load(f)] + # cquery does not like c++ compiles that start with gcc. + # It forgets to include the c++ header directories. + # We can work around this by replacing the gcc calls that python + # setup.py generates with g++ calls instead + for command in all_commands: + if command['command'].startswith("gcc "): + command['command'] = "g++ " + command['command'][4:] + new_contents = json.dumps(all_commands, indent=2) contents = '' if os.path.exists('compile_commands.json'): @@ -540,6 +548,7 @@ def load(filename): if contents != new_contents: with open('compile_commands.json', 'w') as f: f.write(new_contents) + if not USE_NINJA: print("WARNING: 'develop' is not building C++ code incrementally") print("because ninja is not installed. Run this to enable it:") diff --git a/test/test_jit.py b/test/test_jit.py index 23b76d815d4f07..0c64a48f0bdcaa 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -460,6 +460,52 @@ def method(self, x): finally: torch.jit._enabled = True + def test_train_eval(self): + class Sub(nn.Module): + def forward(self, input): + if self.training: + return input + else: + return -input + + class MyModule(torch.jit.ScriptModule): + def __init__(self): + super(MyModule, self).__init__() + self.sub = Sub() + + @torch.jit.script_method + def forward(self, input): + return self.sub(input) + 1 + + m = MyModule() + input = torch.rand(3, 4) + self.assertEqual(input + 1, m(input)) + m.eval() + self.assertEqual(-input + 1, m(input)) + + def test_train_eval_const(self): + class MyModule(torch.jit.ScriptModule): + __constants__ = ['training'] + + def __init__(self): + super(MyModule, self).__init__() + # TODO: it is illegal to try to call + # eval/train because training has already + # been set. 
Consider allowing + # constants to be mutable until the end of __init__ + + @torch.jit.script_method + def forward(self, input): + if self.training: + x = 2 * input + else: + x = -input + return x + 1 + + m = MyModule() + input = torch.rand(3, 4) + self.assertEqual(2 * input + 1, m(input)) + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not @@ -1418,6 +1464,25 @@ def fn(x): fn(x) fn(y) + def test_python_function_tup(self): + class MyFn(Function): + @staticmethod + def forward(ctx, x): + return x + 1, x - 1 + + @staticmethod + def backward(ctx, grad_output): + return grad_output, grad_output + + @_trace(torch.zeros(2)) + def fn(x): + a, b = MyFn.apply(x + 2) + return a + b + 3 + x = torch.tensor([1., 2., 3.]) + y = torch.randn(2, 2, requires_grad=True) + fn(x) + fn(y) + def test_decompose_addmm(self): @torch.jit.script def addmm(mat, mat1, mat2, alpha, beta): @@ -4438,6 +4503,32 @@ def forward(self, x, seq_lens): f = io.BytesIO() torch.onnx._export(m, (x, seq_lens), f, verbose=False) + def test_python_call_non_tensor(self): + def foo(a, b, c): + # type: (Tensor, int, Tuple[Tensor, int]) -> Tuple[int, Tensor] + d, e = c + return b + e, a + d + + @torch.jit.script + def bar(): + x = torch.ones(3, 4) + a, b = foo(x, 3, (x, 3)) + return a, b + + self.assertEqual((6, torch.ones(3, 4) + 1), bar()) + + def test_python_call_non_tensor_wrong(self): + with self.assertRaisesRegex(RuntimeError, r"but instead got value of type tuple"): + def foo(): + # type: () -> Tensor + return ((3, 4),) + + @torch.jit.script + def bar(): + return foo() + + bar() + def test_tuples(self): @torch.jit.script def foo(i): @@ -5770,7 +5861,7 @@ def test_fn(): return foo(torch.full([1], 1), torch.full([1], 2), torch.full([1], 3)) def test_wrong_return_type(self): - with self.assertRaisesRegex(RuntimeError, 'Python functions can currently only return Tensors'): + with self.assertRaisesRegex(RuntimeError, 'but instead got value of type tuple'): def somefunc(): # type: () -> Tuple[Tuple[Tensor, Tensor]] return torch.zeros(3, 4), torch.zeros(4, 5) @@ -5778,6 +5869,7 @@ def somefunc(): @torch.jit.script def wrong_return_type(): return somefunc() + wrong_return_type() # Tests for calling between different front-end modes def test_call_python_fn_from_tracing_fn(self): diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 2301df22e1a661..ace53fdfb0aed9 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -575,9 +575,9 @@ static Node* _trace_pre_record( for (int i = 0; i < num_args; i++) { PyObject *arg_object = PyTuple_GET_ITEM(input_objects, i); if (THPVariable_Check(arg_object)) { - arg_types.push_back('t'); + arg_types.push_back('d'); } else { - arg_types.push_back('s'); + arg_types.push_back('c'); Py_INCREF(arg_object); scalar_args.emplace_back(arg_object); } @@ -597,21 +597,34 @@ static void _trace_post_record( PyObject* op_obj, const variable_list& input_vars, PyObject *output_objects, - bool is_inplace) { + bool is_inplace, + bool unpack_output) { if (!jit::tracer::isTracing()) { return; } + node->i_(attr::inplace, is_inplace); + // Isolate C variable ptrs in a vector int num_outputs = PyTuple_GET_SIZE(output_objects); variable_list output_vars(num_outputs); + auto graph = node->owningGraph(); + node->addOutput(); + if (!unpack_output) { + std::vector tuple_values(num_outputs, DynamicType::get()); + TypePtr 
tuple_type = TupleType::create(std::move(tuple_values)); + node->output()->setType(tuple_type); + auto unpacked = graph->createTupleUnpack(node->output())->insertAfter(node); + node = unpacked; + } for (int i = 0; i < num_outputs; ++i) { auto var = (THPVariable*)PyTuple_GET_ITEM(output_objects, i); - jit::tracer::addOutput(node, var->cdata); + Value* value = node->outputs()[i]; + if (var->cdata.defined()) { + value->inferTypeFrom(var->cdata); + jit::tracer::setValueTrace(autograd::as_variable_ref(var->cdata), value); + } } - - node->i_(attr::inplace, is_inplace); - } PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const UnpackedInput& unpacked, @@ -637,7 +650,7 @@ PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const Unpacked bool is_inplace = static_cast(grad_fn->dirty_tensors); _wrap_outputs(grad_fn, inputs, raw_output, outputs, is_executable); - _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace); + _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace, unpack_output); if (is_executable) { _save_variables(grad_fn); } else { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 48da3ec9cea913..4cc59e8b9bb8e1 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -331,9 +331,9 @@ void Node::lint() const { IR_ELSEIFM_CONST(PythonOp) size_t n_scalars = 0, n_tensors = 0; for (auto c : value->cconv) { - if (c == 's') { + if (c == 'c') { n_scalars++; - } else if (c == 't') { + } else if (c == 'd') { n_tensors++; } else { JIT_ASSERT(0); diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 9ac5059a2d801e..ae3c8191922c8c 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -1391,8 +1391,8 @@ struct PythonOp : public Node { // TraceInterpreterState for execution semantics. THPObjectPtr pyobj; // The calling convention for the Python function. - // 's' -- python scalar argument - // 't' -- tensor argument + // 'c' -- constant argument + // 'd' -- dynamic argument std::string cconv; // Scalar arguments to the Python function. Not necessarily passed to // the function in this order; see cconv for the correct order. diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index 75fb063c761a31..6efcd11bfaadf3 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -156,10 +156,10 @@ void BlockToONNX(Block* old_block, Block* new_block, ::torch::onnx::OperatorExpo auto scalar_it = op->scalar_args.begin(); for (auto arg_type : op->cconv) { py::object obj; - if (arg_type == 's') { + if (arg_type == 'c') { JIT_ASSERTM(scalar_it != op->scalar_args.end(), "expected too many scalar args"); obj = py::reinterpret_borrow(py::handle((scalar_it++)->get())); - } else if (arg_type == 't') { + } else if (arg_type == 'd') { JIT_ASSERTM(node_it != inputs.end(), "expected too many inputs"); obj = py::cast(envFn(*node_it++)); } else { diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 4b76872cb3f967..b9f9e6bb85fd4e 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -18,6 +18,12 @@ namespace torch { namespace jit { namespace detail { + +// error reporting: when reporting user-caused errors, these functions should +// not use AT_ERROR macros, since these macros add stack trace information +// that is confusing to display to the end user since it always reports +// locations in libtorch code rather than user code. 
+ inline void findErrorInKwargs( const FunctionSchema& schema, py::kwargs kwargs) { @@ -26,22 +32,22 @@ inline void findErrorInKwargs( // any argument in the schema. for (const auto& kwarg : kwargs) { const auto key = py::cast(kwarg.first); - AT_CHECK( - std::count_if( + if(!std::count_if( arguments.begin(), arguments.end(), - [&key](const Argument& argument) { return argument.name == key; }), - "Unknown keyword argument '", key, "' for operator '", - schema.name, "'. Schema: ", schema); + [&key](const Argument& argument) { return argument.name == key; })) { + throw std::runtime_error(at::str("Unknown keyword argument '", key, "' for operator '", + schema.name, "'. Schema: ", schema)); + } } // If there are unconsumed kwargs but none of them were unknown, the first // positional argument present in the kwargs is duplicated. for (const auto& argument : arguments) { if (kwargs.contains(argument.name.c_str())) { AT_ASSERT(!argument.default_value); - AT_ERROR( + throw std::runtime_error(at::str( "Argument '", argument.name, "' specified both as positional and ", - "keyword argument. Schema: ", schema); + "keyword argument. Schema: ", schema)); } } } @@ -78,19 +84,6 @@ inline IValue createGenericList(py::handle obj, const TypePtr& elem_type) { return ConstantList::create(std::move(elems)); } -struct ConvertError : public std::exception { - ConvertError(std::string msg) - : msg_(std::move(msg)) {} - const char* what() const noexcept override { - return msg_.c_str(); - } -private: - std::string msg_; -}; - -#define TORCH_CONVERT_ERROR(...) \ - throw ConvertError(at::str(__VA_ARGS__)) - inline IValue toIValue(py::handle obj, const TypePtr& type) { switch (type->kind()) { case TypeKind::DynamicType: @@ -104,11 +97,14 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { case TypeKind::NoneType: return {}; case TypeKind::TupleType: { + if(!PyTuple_Check(obj.ptr())) + throw py::cast_error(); // note: the py::cast does not throw cast_error + // because it attempts to iterate a non-tuple py::tuple tuple = py::cast(obj); size_t tuple_size = tuple.size(); const auto & elem_types = type->cast()->elements(); if (elem_types.size() != tuple_size) { - TORCH_CONVERT_ERROR("Expected ", elem_types.size(), " tuple elements for argument, but got ", tuple_size); + throw py::cast_error(); } std::vector values; values.reserve(tuple_size); @@ -134,11 +130,10 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { } } case TypeKind::NumberType: - TORCH_CONVERT_ERROR("Insufficient type information to convert input"); case TypeKind::GeneratorType: - TORCH_CONVERT_ERROR("Generators are not supported yet."); + break; } - AT_ERROR("Missing cases in toIValue! File a bug report."); + AT_ERROR("Missing cases in toIValue for type: ", type->str(), "! 
File a bug report."); } inline IValue argumentToIValue( @@ -149,21 +144,28 @@ inline IValue argumentToIValue( try { return toIValue(object, argument.type); } catch (const py::cast_error& error) { - AT_ERROR( + throw std::runtime_error(at::str( schema.name, "() expected value of type ", argument.type->str(), " for argument '", argument.name, "' in position ", argumentPosition, ", but instead got value of type ", - py::str(object.get_type().attr("__name__")), - ".\nDeclaration: ", schema); - } catch (const ConvertError& error) { - AT_ERROR( - schema.name, "(): ", error.what(), - "\n for argument '", argument.name, - "' in position ", argumentPosition, - ", but instead got value of type ", - py::str(object.get_type().attr("__name__")), - ".\nDeclaration: ", schema); + py::str(object.get_type().attr("__name__")), ".", + "\nValue: ", py::repr(object), + "\nDeclaration: ", schema)); + } +} + +inline IValue returnToIValue( + const TypePtr& type, + py::handle object) { + try { + return toIValue(object, type); + } catch (const py::cast_error& error) { + throw std::runtime_error(at::str( + " expected value of type ", type->str(), + " for return value but instead got value of type ", + py::str(object.get_type().attr("__name__")), ".", + "\nValue: ", py::repr(object))); } } @@ -199,12 +201,12 @@ inline Stack createStackForSchema( const FunctionSchema& schema, py::args args, py::kwargs kwargs = py::kwargs()) { - AT_CHECK( - args.size() + kwargs.size() <= schema.arguments.size(), - schema.name, "() expected at most ", schema.arguments.size(), - " argument(s) but received ", - args.size() + kwargs.size(), " argument(s). Declaration: ", schema); - + if(args.size() + kwargs.size() > schema.arguments.size()) { + throw std::runtime_error(at::str( + schema.name, "() expected at most ", schema.arguments.size(), + " argument(s) but received ", + args.size() + kwargs.size(), " argument(s). Declaration: ", schema)); + } Stack stack; stack.reserve(schema.arguments.size()); @@ -226,9 +228,9 @@ inline Stack createStackForSchema( } else if (arg.default_value) { push(stack, *arg.default_value); } else { - AT_ERROR( + throw std::runtime_error(at::str( schema.name, "() is missing value for argument '", arg.name, - "'. Declaration: ", schema); + "'. Declaration: ", schema)); } } @@ -285,18 +287,13 @@ inline py::object invokeOperatorFromPython( const Operator& op, py::args args, py::kwargs kwargs) { - try { - // Create a stack full of the arguments and keyword arguments. - auto stack = - createStackForSchema(op.schema(), std::move(args), std::move(kwargs)); + // Create a stack full of the arguments and keyword arguments. + auto stack = + createStackForSchema(op.schema(), std::move(args), std::move(kwargs)); - // Invoke the operation, which puts the return values onto the stack. - op.getOperation()(stack); + // Invoke the operation, which puts the return values onto the stack. + op.getOperation()(stack); - return createPyObjectForStack(std::move(stack)); - } catch (const at::Error& error) { - // We don't want to show the backtrace in the error message in Python. 
- throw std::runtime_error(error.what_without_backtrace()); - } + return createPyObjectForStack(std::move(stack)); } }} // namespace torch::jit diff --git a/torch/csrc/jit/python_interpreter.cpp b/torch/csrc/jit/python_interpreter.cpp index 86bd4dfaf96733..5cc4b70b2e9637 100644 --- a/torch/csrc/jit/python_interpreter.cpp +++ b/torch/csrc/jit/python_interpreter.cpp @@ -10,6 +10,7 @@ #include "torch/csrc/jit/custom_operator.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/pybind_utils.h" #include "torch/csrc/variable_tensor_functions.h" @@ -31,9 +32,10 @@ Operation createPythonOperation(Node* op_) { py::function func = py::reinterpret_borrow(py::handle(op->pyobj.get())); size_t num_inputs = 0; for(auto arg_type : op->cconv) { - if(arg_type == 't') + if(arg_type == 'd') num_inputs++; } + JIT_ASSERT(op->outputs().size() == 1); return [=](Stack & stack) { AutoGIL gil; py::tuple py_inputs(op->cconv.size()); @@ -41,47 +43,18 @@ Operation createPythonOperation(Node* op_) { size_t next_scalar = 0; size_t next_tensor = 0; for (auto arg_type : op->cconv) { - if (arg_type == 's') { + if (arg_type == 'c') { py_inputs[i] = py::reinterpret_borrow( op->scalar_args[next_scalar++].get()); - } else if (arg_type == 't') { - auto var = std::move(peek(stack, next_tensor, num_inputs)).toTensor(); - py_inputs[i] = - py::reinterpret_steal(THPVariable_Wrap(var)); + } else if (arg_type == 'd') { + py_inputs[i] = toPyObject(std::move(peek(stack, next_tensor, num_inputs))); next_tensor++; } i++; } drop(stack, num_inputs); - py::object py_outputs(func(*py_inputs)); - - auto num_outputs = op->outputs().size(); - auto addOutput = [&](py::handle entry) { - if (!THPVariable_Check(entry.ptr())) { - throw std::runtime_error( - "Function application returned a non-Variable output"); - } - THPVariable* var = (THPVariable*)entry.ptr(); - auto cdata = var->cdata; - stack.push_back(std::move(cdata)); - }; - - if (!PyTuple_Check(py_outputs.ptr())) { - if (num_outputs != 1) { - throw std::runtime_error( - "Function.apply returned the wrong number of outputs."); - } - addOutput(py_outputs); - } else { - auto output_tuple = py::tuple(py_outputs); - if (output_tuple.size() != num_outputs) { - throw std::runtime_error( - "Function application returned the wrong number of outputs."); - } - for (py::handle entry : py::tuple(py_outputs)) { - addOutput(entry); - } - } + py::object py_output(func(*py_inputs)); + stack.push_back(returnToIValue(op->output()->type(), py_output)); return 0; }; } diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 11ec332251b6c5..c745f9f6d8f122 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -180,29 +180,6 @@ void initPythonIRBindings(PyObject * module_) { py::arg("defer_weight_export")=false, py::arg("operator_export_type")=::torch::onnx::OperatorExportTypes::ONNX, py::arg("google_printer")=false) - .def("wrapPyFuncWithSymbolic", [](Graph &g, py::function func, std::vector inputs, size_t n_outputs, py::function symbolic) { - // This function should be used for situations where we have a Python function - // that should have different behavior when exporting for JIT interpreter - // execution v.s. for ONNX export. For example, nn.utils.rnn.pack_padded_sequence - // emits a placeholder under ONNX export, but we want to keep the ability to - // run this in the interpreter, thus we emit a PythonOp for that use case. 
- - // Concretely, this function emits a PythonOp wrapping the passed-in - // parameter `func`, while storing the function `symbolic` for use by the - // ONNX export - std::string cconv(inputs.size(), 't'); - func.attr("symbolic") = symbolic; - Node* new_node = g.insertNode(g.createPythonOp( - THPObjectPtr(func.release().ptr()), cconv, {})); - for (auto i : inputs) - new_node->addInput(i); - std::vector outputs; - for (size_t i = 0; i < n_outputs; ++i) - new_node->addOutput(); - auto sl = std::make_shared(tracer::getPythonInterpreterStackTrace()); - new_node->setSourceLocation(sl); - return py::make_iterator(new_node->outputs().begin(), new_node->outputs().end()); - }, py::return_value_policy::reference_internal) .def("inputs",[](Graph &g) { return py::make_iterator(g.inputs().begin(), g.inputs().end()); }) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 988ac6e1139fb0..a04e4ae0fc3f03 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -12,7 +12,6 @@ #include "ATen/core/optional.h" - #include #include diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 2afc56df8c2c4e..9f8185621a1c0e 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -69,18 +69,15 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { // introspection. size_t actual_n_args = n_args; if (!signature.is_none()) { - std::vector arg_types, ret_types; - std::tie(arg_types, ret_types) = py::cast, std::vector>>(signature); + std::vector arg_types; + TypePtr ret_type; + std::tie(arg_types, ret_type) = py::cast, TypePtr>>(signature); args.reserve(arg_types.size()); size_t idx = 0; // Fake argument names by putting in the index for (auto &arg_type : arg_types) { args.push_back(Argument(std::to_string(idx++), std::move(arg_type), {}, {}, false)); } - rets.reserve(ret_types.size()); - idx = 0; - for (auto &ret_type : ret_types) { - rets.push_back(Argument(std::to_string(idx++), std::move(ret_type), {}, {}, false)); - } + rets.push_back(Argument("0", std::move(ret_type), {}, {}, false)); } else { // Create a default signature using what information we have @@ -99,10 +96,12 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { for (size_t i=0; i < actual_n_args; ++i) { args.push_back(Argument(std::to_string(i), DynamicType::get(), {}, {}, false)); } - rets.reserve(n_binders); - for (size_t i = 0; i < n_binders; ++i) { - rets.push_back(Argument(std::to_string(i), DynamicType::get(), {}, {}, false)); + TypePtr ret_type = DynamicType::get(); + if(n_binders != 1) { + std::vector tuple_values(n_binders, ret_type); + ret_type = TupleType::create(std::move(tuple_values)); } + rets.push_back(Argument("0", ret_type, {}, {}, false)); } return FunctionSchema("", std::move(args), std::move(rets)); } @@ -120,27 +119,17 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { // Release the function object so we can wrap it in a PythonOp py::object func = self; - std::string cconv(inputs.size(), 't'); + std::string cconv(inputs.size(), 'd'); Node* new_node = m.graph()->insertNode(m.graph()->createPythonOp( THPObjectPtr(func.release().ptr()), cconv, {})); new_node->setSourceLocation(std::make_shared(loc)); for(auto &i : *all_inputs) new_node->addInput(i); - // This is really dumb, but relaxing the constraints on return types would - // require us to change the implementation of PythonOps in the interpreter. 
- // Note that this effectively makes the return type of Tuple[Tensor] and Tensor - // equivalent, but the PythonOp impl ends with an optional tuple unpack, so we need - // to do it. - for (auto & ret_arg : schema.returns) { - if (!ret_arg.type->isSubtypeOf(DynamicType::get())) { - throw ErrorReport(loc) << "Python functions can currently only return Tensors"; - } - } - std::vector outputs; - for(size_t i = 0; i < schema.returns.size(); ++i) - outputs.push_back(new_node->addOutput()); + for(auto & ret_arg : schema.returns) { + outputs.push_back(new_node->addOutput()->setType(ret_arg.type)); + } return std::make_shared(packOutputs(*m.graph(), outputs)); } diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index dd497d5c50da90..c31504ad83ab22 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -998,7 +998,7 @@ def _get_methods(cls): '_named_members', 'parameters', 'named_parameters', 'buffers', 'named_buffers', 'children', 'named_children', 'modules', 'named_modules', 'zero_grad', 'share_memory', '_get_name', 'extra_repr', - '_slow_forward', '_tracing_name' + '_slow_forward', '_tracing_name', 'eval', 'train', } diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index f5d824130f4167..6930d61ed5a64e 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -105,16 +105,6 @@ def get_num_params(fn): return num_params -def flatten_return_type(type): - if isinstance(type, TupleType): - return_types = [] - for elem_type in type.elements(): - return_types.append(elem_type) - return return_types - else: - return [type] - - def parse_type_line(type_line): """Parses a type annotation specified as a comment. @@ -138,9 +128,7 @@ def parse_type_line(type_line): raise RuntimeError("Failed to parse the return type of a type annotation") arg_types = [ann_to_type(ann) for ann in arg_ann] - ret_types = flatten_return_type(ann_to_type(ret_ann)) - - return arg_types, ret_types + return arg_types, ann_to_type(ret_ann) def get_type_line(source): @@ -191,8 +179,8 @@ def as_ann(ann): arg_types = [ann_to_type(as_ann(p.annotation)) for p in sig.parameters.values()] - return_types = flatten_return_type(ann_to_type(as_ann(sig.return_annotation))) - return arg_types, return_types + return_type = ann_to_type(as_ann(sig.return_annotation)) + return arg_types, return_type def ann_to_type(ann): From 3a8e39b21506752f77c24229008a48a0e33a66c9 Mon Sep 17 00:00:00 2001 From: Roger-luo Date: Tue, 11 Sep 2018 15:54:18 -0700 Subject: [PATCH 049/237] Support load and store between Py_complex and std::complex (#11493) Summary: Printing for complex numbers requires loading and storing between `Py_complex` and `std::complex`. This patch aims to support this for the plugin. 
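For reference, a minimal, self-contained sketch (not taken from this patch; the helper names are hypothetical) of the round trip the new utilities rely on, assuming only the CPython C API and `<complex>`. `Py_complex` and `std::complex<double>` are each a pair of adjacent doubles, which is what makes the cheap conversion below possible:
```
#include <Python.h>
#include <complex>
#include <stdexcept>

// Python complex -> C++ complex (the "load" direction).
std::complex<double> unpack_complex(PyObject* obj) {
  Py_complex c = PyComplex_AsCComplex(obj);
  // On failure CPython returns real == -1.0 and sets a Python error.
  if (c.real == -1.0 && PyErr_Occurred()) {
    throw std::runtime_error("expected a Python complex number");
  }
  return std::complex<double>(c.real, c.imag);
}

// C++ complex -> Python complex (the "store"/printing direction).
PyObject* pack_complex(const std::complex<double>& z) {
  Py_complex c;
  c.real = z.real();
  c.imag = z.imag();
  return PyComplex_FromCComplex(c);  // returns a new reference
}
```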
Differential Revision: D9771808 Pulled By: ezyang fbshipit-source-id: 024865f1945d63ddb5efc775a35438c8ea06408e --- torch/csrc/utils/python_numbers.h | 9 +++++++++ torch/csrc/utils/python_scalars.h | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/torch/csrc/utils/python_numbers.h b/torch/csrc/utils/python_numbers.h index 6a292e215108bb..0d3fa2f5cc37d6 100644 --- a/torch/csrc/utils/python_numbers.h +++ b/torch/csrc/utils/python_numbers.h @@ -126,3 +126,12 @@ inline double THPUtils_unpackDouble(PyObject* obj) { } return value; } + +inline std::complex THPUtils_unpackComplexDouble(PyObject *obj) { + Py_complex value = PyComplex_AsCComplex(obj); + if (value.real == -1.0 && PyErr_Occurred()) { + throw python_error(); + } + + return std::complex(value.real, value.imag); +} diff --git a/torch/csrc/utils/python_scalars.h b/torch/csrc/utils/python_scalars.h index 820e6d2776596f..85ed0a0a1bc88f 100644 --- a/torch/csrc/utils/python_scalars.h +++ b/torch/csrc/utils/python_scalars.h @@ -20,6 +20,8 @@ inline void store_scalar(void* data, at::ScalarType scalarType, PyObject* obj) { break; case at::kFloat: *(float*)data = (float)THPUtils_unpackDouble(obj); break; case at::kDouble: *(double*)data = THPUtils_unpackDouble(obj); break; + case at::kComplexFloat: *(std::complex*)data = (std::complex)THPUtils_unpackComplexDouble(obj); break; + case at::kComplexDouble: *(std::complex*)data = THPUtils_unpackComplexDouble(obj); break; default: throw std::runtime_error("invalid type"); } } @@ -34,6 +36,8 @@ inline PyObject* load_scalar(void* data, at::ScalarType scalarType) { case at::kHalf: return PyFloat_FromDouble(at::convert(*(at::Half*)data)); case at::kFloat: return PyFloat_FromDouble(*(float*)data); case at::kDouble: return PyFloat_FromDouble(*(double*)data); + case at::kComplexFloat: return PyComplex_FromCComplex(*reinterpret_cast((std::complex*)data)); + case at::kComplexDouble: return PyComplex_FromCComplex(*reinterpret_cast((std::complex*)data)); default: throw std::runtime_error("invalid type"); } } From 3a39006d38e797f0a9aea2edb1b2846ec81c7c30 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Tue, 11 Sep 2018 16:25:15 -0700 Subject: [PATCH 050/237] Fix some more doc Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11531 Differential Revision: D9776541 Pulled By: SsnL fbshipit-source-id: 8725485639ea6e9479b6ea95a49f5b75a9457db7 --- torch/_tensor_docs.py | 39 +++++++++++++++++++++++---------------- torch/_torch_docs.py | 6 ++++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index c3039583b04fb7..151b0dee218876 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -482,7 +482,7 @@ def add_docstr_all(method, docstr): .. math:: - f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - median)^2 + \sigma^2} + f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - \text{median})^2 + \sigma^2} """) add_docstr_all('ceil', @@ -1229,17 +1229,18 @@ def add_docstr_all(method, docstr): In-place version of :meth:`~Tensor.log2` """) -add_docstr_all('log_normal_', u""" +add_docstr_all('log_normal_', r""" log_normal_(mean=1, std=2, *, generator=None) Fills :attr:`self` tensor with numbers samples from the log-normal distribution -parameterized by the given mean (\u00B5) and standard deviation (\u03C3). -Note that :attr:`mean` and :attr:`stdv` are the mean and standard deviation of -the underlying normal distribution, and not of the returned distribution: +parameterized by the given mean :math:`\mu` and standard deviation +:math:`\sigma`. 
Note that :attr:`mean` and :attr:`std` are the mean and +standard deviation of the underlying normal distribution, and not of the +returned distribution: .. math:: - f(x) = \\dfrac{1}{x \\sigma \\sqrt{2\\pi}}\ e^{-\\dfrac{(\\ln x - \\mu)^2}{2\\sigma^2}} + f(x) = \dfrac{1}{x \sigma \sqrt{2\pi}}\ e^{-\frac{(\ln x - \mu)^2}{2\sigma^2}} """) add_docstr_all('logsumexp', @@ -1745,13 +1746,16 @@ def callable(a, b) -> number r""" reshape(*shape) -> Tensor -Returns a tensor with the same data and number of elements as :attr:`self`, -but with the specified shape. +Returns a tensor with the same data and number of elements as :attr:`self` +but with the specified shape. This method returns a view if :attr:`shape` is +compatible with the current shape. See :meth:`torch.Tensor.view` on when it is +possible to return a view. + +See :func:`torch.reshape` Args: shape (tuple of ints or int...): the desired shape -See :func:`torch.reshape` """) add_docstr_all('reshape_as', @@ -1760,8 +1764,10 @@ def callable(a, b) -> number Returns this tensor as the same shape as :attr:`other`. ``self.reshape_as(other)`` is equivalent to ``self.reshape(other.sizes())``. +This method returns a view if ``other.sizes()`` is compatible with the current +shape. See :meth:`torch.Tensor.view` on when it is possible to return a view. -Please see :meth:`~Tensor.reshape` for more information about ``reshape``. +Please see :meth:`reshape` for more information about ``reshape``. Args: other (:class:`torch.Tensor`): The result tensor has the same shape @@ -2524,10 +2530,10 @@ def callable(a, b) -> number add_docstr_all('view', r""" -view(*args) -> Tensor +view(*shape) -> Tensor Returns a new tensor with the same data as the :attr:`self` tensor but of a -different size. +different :attr:`shape`. The returned tensor shares the same data and must have the same number of elements, but may have a different size. For a tensor to be viewed, the new @@ -2538,13 +2544,14 @@ def callable(a, b) -> number .. math:: - stride[i] = stride[i+1] \times size[i+1] + \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1] -Otherwise, :func:`contiguous` needs to be called before the tensor can be -viewed. +Otherwise, :meth:`contiguous` needs to be called before the tensor can be +viewed. See also: :meth:`reshape`, which returns a view if the shapes are +compatible, and copies (equivalent to calling :meth:`contiguous`) otherwise. Args: - args (torch.Size or int...): the desired size + shape (torch.Size or int...): the desired size Example:: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index f66a4e1c17d5bc..19a1fcbc625003 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3346,7 +3346,7 @@ def parse_kwargs(desc): inv = (u^T u)^{-1} If :attr:`upper` is ``False``, :attr:`u` is lower triangular -such that the tensor value is +such that the returned tensor is .. math:: inv = (uu^{T})^{-1} @@ -3993,6 +3993,8 @@ def parse_kwargs(desc): with compatible strides can be reshaped without copying, but you should not depend on the copying vs. viewing behavior. +See :meth:`torch.Tensor.view` on when it is possible to return a view. + A single dimension may be -1, in which case it's inferred from the remaining dimensions and the number of elements in :attr:`input`. @@ -5308,7 +5310,7 @@ def parse_kwargs(desc): .. 
math:: out_i = \begin{cases} - x_i & \text{if } condition_i \\ + x_i & \text{if } \text{condition}_i \\ y_i & \text{otherwise} \\ \end{cases} From b6b0b5222db91309fbdb4e4b2ff40548216f06b6 Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 11 Sep 2018 17:24:44 -0700 Subject: [PATCH 051/237] fix missing libnccl.so.1 error (#11553) Summary: what it says on the tin. I broke the build in https://github.com/pytorch/pytorch/pull/11487 but contbuild didn't end up catching it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11553 Differential Revision: D9781557 Pulled By: soumith fbshipit-source-id: 2a1fa314af4b85b5491d74110bfee3d80599aa95 --- tools/build_pytorch_libs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 579cde711eae43..e0593badd97614 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -237,7 +237,7 @@ function build_nccl() { fi ${CMAKE_INSTALL} -j"$MAX_JOBS" mkdir -p ${INSTALL_DIR}/lib - $SYNC_COMMAND "lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so.1" + find lib -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "${INSTALL_DIR}/lib/" if [ ! -f "${INSTALL_DIR}/lib/libnccl.so" ]; then ln -s "${INSTALL_DIR}/lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so" fi @@ -312,8 +312,8 @@ function build_caffe2() { # This is needed by the aten tests built with caffe2 if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.1" ]; then - # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl - $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl + $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" fi ${CMAKE_INSTALL} -j"$MAX_JOBS" From 8b196d671b1a2be26fd6fe73f55aa45fb8a91e1e Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 17:41:28 -0700 Subject: [PATCH 052/237] Allow tracing random functions (only when using default generators) (#11539) Summary: Fixes #11504. 
zdevito, neerajprad, fritzo Pull Request resolved: https://github.com/pytorch/pytorch/pull/11539 Differential Revision: D9777897 Pulled By: apaszke fbshipit-source-id: 56983260f5b93da7d5540a6242769ea7bd50eb06 --- test/test_jit.py | 23 +++++++++++++++++++ tools/jit/gen_jit_dispatch.py | 10 ++++---- torch/csrc/jit/constants.cpp | 4 ++++ torch/csrc/jit/ir.h | 4 +++- .../csrc/jit/passes/constant_propagation.cpp | 2 +- torch/csrc/jit/register_prim_ops.cpp | 2 +- torch/csrc/jit/tracer.cpp | 9 +++++++- 7 files changed, 44 insertions(+), 10 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 0c64a48f0bdcaa..68aaf4df5d3afd 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1118,6 +1118,17 @@ def fn(x, y): self.assertExpectedGraph(traced_fn.graph) self.assertExportImport(traced_fn.graph, (x, y)) + def test_trace_random(self): + def f(mean, std): + return torch.normal(mean, std) + + traced = torch.jit.trace(f, (torch.zeros(2, 3), torch.ones(2, 3)), check_trace=False) + mean, std = torch.zeros(5, 5), torch.ones(5, 5) + with torch.random.fork_rng(devices=[]): + output = f(mean, std) + traced_output = traced(mean, std) + self.assertEqual(output, traced_output) + def test_trace_tensor_factory(self): def run(**kwargs): inputs_require_grads = kwargs.pop('inputs_require_grads', True) @@ -2563,6 +2574,18 @@ def func(x): x = torch.rand(10, dtype=torch.float, requires_grad=True) self.checkScript(func, [x], optimize=True) + def test_random(self): + @torch.jit.script + def f(mean, std): + return torch.normal(mean, std) + + mean, std = torch.zeros(5, 5), torch.ones(5, 5) + with torch.random.fork_rng(devices=[]): + output = torch.normal(mean, std) + with torch.random.fork_rng(devices=[]): + script_output = f(mean, std) + self.assertEqual(output, script_output) + def _check_code(self, code_str, fn_name, inputs): scope = {} exec(code_str, globals(), scope) diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 2f24986c39cdc3..67bba43913ef16 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -68,6 +68,7 @@ def jit_type_of(arg): 'double': '{}.toDouble()', 'int64_t': '{}.toInt()', 'std::string': '{}.toString()->string()', + 'Generator': 'nullptr', 'std::array': 'as_bool_array<2>({}.toIntList()->elements())', 'std::array': 'as_bool_array<3>({}.toIntList()->elements())', 'std::array': 'as_bool_array<4>({}.toIntList()->elements())', @@ -197,12 +198,9 @@ def emit_decl_variant(decl): real_inputs = 0 for arg in decl['arguments']: - if arg['simple_type'] in default_only_types: - arguments.append(arg['default']) - else: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) - arguments.append(from_ivalue(arg, value)) - real_inputs += 1 + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) + arguments.append(from_ivalue(arg, value)) + real_inputs += 1 call = get_invocation(decl, arguments, num_inputs) diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 8bf1ab566b31a9..4cdb193d8434d8 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -35,6 +35,10 @@ Value* insertConstant( } else if(val.isString()) { n->s_(attr::value, val.toString()->string()); n->output()->setType(StringType::get()); + } else if(val.isNone()) { + n->destroy(); + n = g.create(prim::None); + n->output()->setType(NoneType::get()); } else { throw constant_not_supported_error("Unsupported value kind: " + val.tagKind()); } diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 
ae3c8191922c8c..5e70adfe400916 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -993,7 +993,9 @@ friend struct Block; return create(prim::Undefined); } Node * createNoneGenerator() { - return create(prim::NoneGenerator); + auto n = create(prim::NoneGenerator); + n->output()->setType(GeneratorType::get()); + return n; } Node * createFusionGroup(int device) { auto n = create(prim::FusionGroup, 0); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index bfd8ec9b9f1764..e83971bfaf9dfb 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -19,7 +19,6 @@ std::unordered_set skip_list = { //FIXME treated as having side effects but ONNX depends on them being removed prim::Print, //all the rand functions from native_functions.yaml - aten::permute, aten::rand, aten::rand_out, aten::rand_like, @@ -33,6 +32,7 @@ std::unordered_set skip_list = { aten::randperm_out, prim::Constant, prim::Undefined, + prim::NoneGenerator, // TODO (zach): we should consider skipping tensor factories in the cases // where the constant tensor would be large but cheap to create. }; diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 7e70550a38094b..67274f8a9192d6 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -146,7 +146,7 @@ RegisterOperators reg({ prim::NoneGenerator, [](Node* node) { return [](Stack& stack) { - stack.push_back(at::Tensor()); + stack.emplace_back(); return 0; }; }), diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 4f496420956b0b..0f8d65dcc5873a 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -42,7 +42,14 @@ void addInputs(Node *n, const char * name, const at::Scalar& value) { detail::g void addInputs(Node *n, const char * name, const std::string& value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Tensor& value) { n->addInput(getValueTrace(value)); } void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(value); } -void addInputs(Node *n, const char * name, at::Generator * value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::Generator * value) { + if (value) { + detail::badArgType(value); + } + Graph * g = n->owningGraph(); + Value * undef_gen = g->insertNode(g->createNoneGenerator())->output(); + n->addInput(undef_gen); +} void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::TensorList value) { From cda74ac476471e753ca06bd3491c84dc0bcaa813 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 11 Sep 2018 17:42:03 -0700 Subject: [PATCH 053/237] fix nested no_grad decorator and with-statement (#11479) Summary: - fixes https://github.com/pytorch/pytorch/issues/10858 - allow `no_grad` decorator to apply `with torch.no_grad()` at the correct context - current behavior: ``` import torch torch.no_grad() def nothing(x): return x testin = torch.Tensor([0]) with torch.no_grad(): print(torch.is_grad_enabled()) # False testout = nothing(testin) print(torch.is_grad_enabled()) # False ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11479 Differential Revision: D9758691 Pulled By: weiyangfb fbshipit-source-id: 87de2219c6c45f65a2c0406ae152c3ad760be8f2 --- test/test_autograd.py | 6 ++++++ torch/autograd/grad_mode.py | 10 
++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index faba5efb1847d5..47ff4c66be0d33 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -724,6 +724,12 @@ def adder(x, y): self.assertRaises(RuntimeError, lambda: z.backward(torch.ones(5, 5))) self.assertIsNone(z.grad_fn) + # test nested decorator and with-statement on no_grad + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + w = adder(x, y) + self.assertFalse(torch.is_grad_enabled()) + def test_no_grad_python_function(self): """Python Functions should respect grad mode.""" x = torch.ones(5, 5, requires_grad=True) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 35dada34265d12..a4a768b633e331 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -28,11 +28,8 @@ class no_grad(object): >>> z.requires_grad False """ - - def __init__(self): - self.prev = torch.is_grad_enabled() - def __enter__(self): + self.prev = torch.is_grad_enabled() torch._C.set_grad_enabled(False) def __exit__(self, *args): @@ -75,11 +72,8 @@ class enable_grad(object): True """ - - def __init__(self): - self.prev = torch.is_grad_enabled() - def __enter__(self): + self.prev = torch.is_grad_enabled() torch._C.set_grad_enabled(True) def __exit__(self, *args): From bbf54ea37cc063503d6e30ffe2124031f3c87f59 Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Tue, 11 Sep 2018 18:24:02 -0700 Subject: [PATCH 054/237] Ensure .enumerate_support() methods are jittable (#11542) Summary: This works around #11535 by avoiding `arange(n, out=x)` and `eye(n, out=x)` in `torch.distributions`. I've confirmed that the `.enumerate_support()` methods are now jittable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11542 Differential Revision: D9777805 Pulled By: apaszke fbshipit-source-id: fa38f2f1acfc0a289f725fd8c92478573cfdbefb --- torch/distributions/bernoulli.py | 3 +-- torch/distributions/binomial.py | 5 ++--- torch/distributions/categorical.py | 4 +--- torch/distributions/constraints.py | 2 +- torch/distributions/one_hot_categorical.py | 7 +++++-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/torch/distributions/bernoulli.py b/torch/distributions/bernoulli.py index 067a6eba9a43c4..ddbbcde7bed300 100644 --- a/torch/distributions/bernoulli.py +++ b/torch/distributions/bernoulli.py @@ -97,8 +97,7 @@ def entropy(self): return binary_cross_entropy_with_logits(self.logits, self.probs, reduction='none') def enumerate_support(self, expand=True): - values = self._new((2,)) - torch.arange(2, out=values) + values = torch.arange(2, dtype=self._param.dtype, device=self._param.device) values = values.view((-1,) + (1,) * len(self._batch_shape)) if expand: values = values.expand((-1,) + self._batch_shape) diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py index ad56856c90aef1..5e168ffe1d7a2f 100644 --- a/torch/distributions/binomial.py +++ b/torch/distributions/binomial.py @@ -98,7 +98,7 @@ def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) + (max_count,) bernoullis = torch.bernoulli(self.probs.unsqueeze(-1).expand(shape)) if self.total_count.min() != max_count: - arange = torch.arange(max_count, out=self.total_count.new_empty(max_count)) + arange = torch.arange(max_count, dtype=self._param.dtype, device=self._param.device) mask = arange >= self.total_count.unsqueeze(-1) bernoullis.masked_fill_(mask, 0.) 
return bernoullis.sum(dim=-1) @@ -119,8 +119,7 @@ def enumerate_support(self, expand=True): total_count = int(self.total_count.max()) if not self.total_count.min() == total_count: raise NotImplementedError("Inhomogeneous total count not supported by `enumerate_support`.") - values = self._new(1 + total_count,) - torch.arange(1 + total_count, out=values) + values = torch.arange(1 + total_count, dtype=self._param.dtype, device=self._param.device) values = values.view((-1,) + (1,) * len(self._batch_shape)) if expand: values = values.expand((-1,) + self._batch_shape) diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 267bed8826e7de..2ad69cb1bacec1 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -120,10 +120,8 @@ def entropy(self): def enumerate_support(self, expand=True): num_events = self._num_events - values = torch.arange(num_events).long() + values = torch.arange(num_events, dtype=torch.long, device=self._param.device) values = values.view((-1,) + (1,) * len(self._batch_shape)) if expand: values = values.expand((-1,) + self._batch_shape) - if self._param.is_cuda: - values = values.cuda(self._param.get_device()) return values diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index f214cf1caab1ea..83205353761fd5 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -269,7 +269,7 @@ def check(self, value): lower_triangular = (value_tril == value).view(value.shape[:-2] + (-1,)).min(-1)[0] n = value.size(-1) - diag_mask = torch.eye(n, n, out=value.new(n, n)) + diag_mask = torch.eye(n, n, dtype=value.dtype, device=value.device) positive_diagonal = (value * diag_mask > (diag_mask - 1)).min(-1)[0].min(-1)[0] return lower_triangular & positive_diagonal diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index c28949b079a115..dc37b4c7fabb8e 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -49,6 +49,10 @@ def expand(self, batch_shape, _instance=None): def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) + @property + def _param(self): + return self._categorical._param + @property def probs(self): return self._categorical.probs @@ -89,8 +93,7 @@ def entropy(self): def enumerate_support(self, expand=True): n = self.event_shape[0] - values = self._new((n, n)) - torch.eye(n, out=values) + values = torch.eye(n, dtype=self._param.dtype, device=self._param.device) values = values.view((n,) + (1,) * len(self.batch_shape) + (n,)) if expand: values = values.expand((n,) + self.batch_shape + (n,)) From 35d52dbb0e026f7f0b24f6b81bc1d10c0cb0983a Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Tue, 11 Sep 2018 18:24:55 -0700 Subject: [PATCH 055/237] re-enable USE_MPI (#11416) Summary: The previous error was caused by mpi_test not depending on MPI_CXX_LIBRARIES. This might solve the problem. Not tested locally - waiting for CI test. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11416 Reviewed By: mingzhe09088 Differential Revision: D9771694 Pulled By: Yangqing fbshipit-source-id: 53e7b4f64eadc88313bc4dd9b8e3f7931cda6e91 --- CMakeLists.txt | 2 +- caffe2/CMakeLists.txt | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8dc6dc146ffb79..60d69ef2d9bc68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ option(USE_IDEEP "Use IDEEP interface in MKL BLAS" ON) option(USE_MKLML "Use MKLML interface in MKL BLAS" ON) option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( - USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." OFF + USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) cmake_dependent_option( USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 99aca34a860d79..4e9b3eef8937f4 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -427,6 +427,13 @@ if (BUILD_TEST) endforeach() endif() + # For special tests that explicitly uses dependencies, we add them here + if (USE_MPI) + target_link_libraries(mpi_test ${MPI_CXX_LIBRARIES}) + if (USE_CUDA) + target_link_libraries(mpi_gpu_test ${MPI_CXX_LIBRARIES}) + endif() + endif() endif() if (BUILD_PYTHON) From 92fd69f256a7bf2ec9e022aef5a66b97b9b82132 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 11 Sep 2018 20:10:21 -0700 Subject: [PATCH 056/237] Split Type into TypeExtendedInterface and Type (#11520) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11520 Previously, we had Type which was a catch all interface for all functions and methods we could possibly want to do dynamic dispatch on. However, we want to check in a non-autogenerated Tensor class to ATen/core, and to do this, we must also check in a non-autogenerated Type class which we can do dispatch on. In principle, we could put the full Type interface in ATen/core, but this would be a bad developer experience, since any time you add a new free function, you'd have to regenerate the checked in Type header. For a better dev experience, we split Type into a two parts, Type, which will be checked in (though not in this diff), and TypeExtendedInterface, which will NOT be checked in. Type contains just enough methods to let Tensor be defined, and leaves the rest to TypeExtendedInterface. Some complications: - We (very unfortunately) have overloaded virtual methods. Because of C++'s rules, we cannot move one overload without doing some extra work to make sure that overload in a superclass and an overload in a subclass resolve together. I've chosen to resolve this problem simply by moving ALL overloads of a method which occurs in Tensor to Type. - There are some places where we take a type() object and call a method on it, which is not a Tensor base method. I've eliminated some where possible, but in other cases calling the method on type is the ONLY way to invoke it; in that case, I've just inserted a cast. Further refactoring is necessary. 
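The layering can be pictured with a small self-contained sketch (simplified stand-in types, not the actual generated ATen signatures): the checked-in base interface carries only what the Tensor class needs, the generated subclass carries the long tail of operators, and call sites that need an extended entry point downcast explicitly, which is what the `static_cast`s inserted by this diff do.
```
#include <iostream>

struct Tensor { int id; };  // stand-in for at::Tensor

// Checked-in, stable interface: just enough for Tensor's methods.
struct Type {
  virtual ~Type() = default;
  virtual Tensor add(Tensor a, Tensor b) const = 0;
};

// Generated interface: everything else (free functions, factories, ...).
struct TypeExtendedInterface : public Type {
  virtual Tensor arange(int end) const = 0;
};

// A concrete backend implements the extended interface.
struct CPUFloatType final : public TypeExtendedInterface {
  Tensor add(Tensor a, Tensor b) const override { return Tensor{a.id + b.id}; }
  Tensor arange(int end) const override { return Tensor{end}; }
};

// Free functions that outgrew the base interface reach it through a cast.
Tensor arange_via(const Type& t, int end) {
  return static_cast<const TypeExtendedInterface&>(t).arange(end);
}

int main() {
  CPUFloatType ty;
  std::cout << arange_via(ty, 5).id << "\n";  // prints 5
}
```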
Reviewed By: gchanan Differential Revision: D9771708 fbshipit-source-id: c59d39fe919cd6f42be6dca699d474346ea3c614 --- aten/src/ATen/function_wrapper.py | 64 ++++++++++++++----- aten/src/ATen/gen.py | 5 +- aten/src/ATen/native/LegacyBridge.cpp | 16 ++--- aten/src/ATen/native/TensorFactories.cpp | 18 ++++-- aten/src/ATen/preprocess_declarations.py | 26 ++++++++ aten/src/ATen/templates/Functions.h | 17 +++-- aten/src/ATen/templates/TypeDefault.h | 6 +- .../ATen/templates/TypeExtendedInterface.h | 12 ++++ caffe2/contrib/aten/aten_op_template.h | 6 +- caffe2/contrib/aten/gen_op.py | 2 +- tools/autograd/templates/VariableType.cpp | 2 +- tools/autograd/templates/VariableType.h | 2 +- 12 files changed, 131 insertions(+), 45 deletions(-) create mode 100644 aten/src/ATen/templates/TypeExtendedInterface.h diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index b830aa3e5fe65e..287b7895008cc2 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -107,6 +107,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # NB: As far as ezyang can tell, we don't *have* to codegen this, # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in # the superclass. But it doesn't seem to be harmful. +# +# TODO: self_ty is a hack to make things work for native methods which need to +# take a dtype, but also need to dispatch differently for different types. +# Eliminate it at some point. TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} @@ -173,7 +177,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # the same name (but different signature) already ZERO_DIM_CHECK = CodeTemplate("""\ if (${check_name}.dim() == 0) { - return static_cast(this)->${api_name}(${zero_dim_actuals}); + return static_cast(this)->${api_name}(${zero_dim_actuals}); }""") ZERO_DIM_ONLY = CodeTemplate("""\ @@ -183,7 +187,7 @@ def TypedDict(name, attrs, total=True): # type: ignore SPARSE_CHECK = CodeTemplate("""\ if(${check_name}.type().is_sparse()) { - return static_cast(this)->${api_name}(${sparse_actuals}); + return static_cast(this)->${api_name}(${sparse_actuals}); }""") BUFFER_DEFINITION = CodeTemplate("""\ @@ -390,6 +394,7 @@ def __getitem__(self, x): 'type_registrations': List[str], 'type_headers': List[str], 'pure_virtual_type_method_declarations': List[str], + 'pure_virtual_extended_type_method_declarations': List[str], 'type_method_declarations': List[str], 'type_method_definitions': List[str], 'type_method_inline_definitions': List[str], @@ -490,6 +495,9 @@ def __getitem__(self, x): 'formals': List[str], 'inferred_type': str, 'inplace': bool, + # This controls whether or not we generate the interface in Type or + # TypeExtendedInterface + 'extended_method': bool, 'method_actuals': List[str], 'method_formals_with_defaults': List[str], 'method_formals': List[str], @@ -836,8 +844,12 @@ def process_option(option, output_options): # NN function with no _forward/_backward suffix don't have cimpls. 
# They call the _forward function and discard any buffer returns abstract = False - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) body = emit_nn_body(option) @@ -845,17 +857,27 @@ def process_option(option, output_options): TYPE_METHOD_DEFINITION_CONCRETE.substitute( env, type_definition_body=body)) elif broadcast_arg is None: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) top_env['type_method_definitions'].append( TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( @@ -888,7 +910,7 @@ def process_option(option, output_options): method_of.append('Tensor') if is_namespace_function: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) top_env['function_declarations'].append( FUNCTION_DECLARATION.substitute(env)) top_env['function_definitions'].append( @@ -1060,11 +1082,21 @@ def find_formal(formal_name, formals): # Factory methods are not dispatched over `Type`. if not is_factory_method: if option['deprecated']: + # Deprecated functions are always non-extended, + # because they need to be made available from Type + # (the public interface) so that code like + # tensor.type().arange(...) keeps working. Once + # we remove the deprecated functions, we can eliminate + # these methods entirely. 
top_env['pure_virtual_type_method_declarations'].append( DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append(TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) dispatch = option['type_method_definition_dispatch'] option['native_type_method_dispatch'] = dispatch @@ -1116,12 +1148,12 @@ def find_formal(formal_name, formals): if is_namespace_function: if dispatch_type: - option['inferred_type'] = dispatch_type['name'] + option['inferred_type'] = 'static_cast({})'.format(dispatch_type['name']) elif dispatch_tensor: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) else: # doesn't depend on a specific type, use undefined float - option['inferred_type'] = 'at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)' + option['inferred_type'] = 'detail::non_specific_type()' declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION top_env['function_declarations'].append(declaration.substitute(env)) if is_factory_method: diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 025faffe5d2fe8..5497a752dc4fb4 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -107,6 +107,7 @@ def check_all_files_written(self): SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") +TYPE_EXTENDED_INTERFACE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeExtendedInterface.h") TYPE_DEFAULT_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.h") TYPE_DEFAULT_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.cpp") @@ -165,6 +166,7 @@ def check_all_files_written(self): 'cuda_type_registrations': [], 'cuda_type_headers': [], 'pure_virtual_type_method_declarations': [], + 'pure_virtual_extended_type_method_declarations': [], 'type_method_declarations': [], 'type_method_definitions': [], 'type_method_inline_definitions': [], @@ -330,7 +332,7 @@ def iterate_types(): # so that the script runs quickly when we are just querying the # outputs def declare_outputs(): - files = ['Declarations.yaml', 'Type.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', + files = ['Declarations.yaml', 'Type.h', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', 'TensorMethods.h', 'Functions.h', 'CPUCopy.cpp', 'NativeFunctions.h', 'RegisterCPU.cpp', 'RegisterCPU.h'] @@ -400,6 +402,7 @@ def generate_outputs(): backend, density, scalar_type, declarations)) file_manager.write('Type.h', TYPE_H, top_env) + file_manager.write('TypeExtendedInterface.h', TYPE_EXTENDED_INTERFACE_H, top_env) file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 07d7e46ff79a56..1364c0cceb1847 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -144,34 
+144,34 @@ Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta Tensor tensor(const Type& dtype) { if (_type_has_native(dtype)) { - return dtype.native_tensor(); + return static_cast(dtype).native_tensor(); } else { - return dtype.th_tensor(); + return static_cast(dtype).th_tensor(); } } Tensor tensor(const Type& dtype, ArrayRef size) { if (_type_has_native(dtype)) { - return dtype.native_tensor(size); + return static_cast(dtype).native_tensor(size); } else { - return dtype.th_tensor(size); + return static_cast(dtype).th_tensor(size); } } Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return dtype.toSparse().native_sparse_coo_tensor(size); + return static_cast(dtype.toSparse()).native_sparse_coo_tensor(size); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values); + return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values, size); + return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values, size); } Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse()._native_sparse_coo_tensor_unsafe(indices, values, size); + return static_cast(values.type().toSparse())._native_sparse_coo_tensor_unsafe(indices, values, size); } int64_t get_device(const Tensor& self) { diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 1a12549b5e70e9..20211aeedfa32b 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -59,6 +59,10 @@ void window_function_checks( window_length); } +const TypeExtendedInterface& getFactoryType(const TensorOptions& options) { + return static_cast(at::getType(options)); +} + } // namespace // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,7 +77,7 @@ Tensor arange( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(start, end, step); + return getFactoryType(options)._arange(start, end, step); } Tensor& arange_out(Tensor& result, Scalar start, Scalar end) { @@ -86,7 +90,7 @@ Tensor& arange_out(Tensor& result, Scalar start, Scalar end, Scalar step) { Tensor arange(Scalar end, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(end); + return getFactoryType(options)._arange(end); } Tensor& arange_out(Tensor& result, Scalar end) { @@ -94,7 +98,7 @@ Tensor& arange_out(Tensor& result, Scalar end) { } Tensor _dim_arange(const Tensor& like, int64_t dim) { - return like.type().toScalarType(at::kLong)._arange(like.size(dim)); + return static_cast(like.type().toScalarType(at::kLong))._arange(like.size(dim)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -102,7 +106,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { Tensor empty(IntList size, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] // Can't call a factory function, because the buck stops with us! 
- return at::getType(options).tensor(size); + return getFactoryType(options).tensor(size); } Tensor& empty_out(Tensor& result, IntList size) { @@ -218,7 +222,7 @@ Tensor linspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._linspace(start, end, steps); + return getFactoryType(options)._linspace(start, end, steps); } Tensor& linspace_out(Tensor& result, Scalar start, Scalar end) { @@ -241,7 +245,7 @@ Tensor logspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._logspace(start, end, steps); + return getFactoryType(options)._logspace(start, end, steps); } Tensor& logspace_out(Tensor& result, Scalar start, Scalar end) { @@ -475,7 +479,7 @@ Tensor range( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._range(start, end, step); + return getFactoryType(options)._range(start, end, step); } Tensor& range_out(Tensor& result, Scalar start, Scalar end) { diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index 173ac439487d26..98b22c7f8e1d6a 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -217,8 +217,20 @@ def signature(option, i=None, value=None): (raw_args - filtered_args)] +def is_extended_method(option): + if 'method' in option['variants']: + return False + elif option.get('deprecated', False): + return False + elif not option['variants']: + return False + else: + return True + + def run(declarations): declarations = [d for d in declarations if not exclude(d)] + non_extended_methods = set() for declaration in declarations: common_with_cwrap.set_declaration_defaults(declaration) declaration['options'] = [deepcopy(o) for o in declaration['options']] @@ -237,6 +249,20 @@ def run(declarations): sanitize_return(option) process_types_and_backends(option) add_variants(option) + if not is_extended_method(option): + non_extended_methods.add(option['api_name']) declaration['options'] = handle_outputs_taken_as_arguments( declaration['options']) + + # We (very unfortunately) have overloaded virtual methods. Because + # of C++'s rules, we cannot move one overload without doing some + # extra work to make sure that overload in a superclass and an + # overload in a subclass resolve together. I've chosen to resolve + # this problem simply by moving ALL overloads of a method which + # occurs in Tensor to Type. This is why we have to first compute + # which methods *names* go on type, and then move ALL overloads + # of this name to Type. 
+ for declaration in declarations: + for option in declaration['options']: + option['extended_method'] = option['api_name'] not in non_extended_methods return declarations diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index b4a2e05e759ea3..7e2b6580343be4 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -4,6 +4,7 @@ #include "ATen/core/Scalar.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Tensor.h" #include "ATen/core/Storage.h" #include "ATen/core/Generator.h" @@ -20,14 +21,22 @@ using native::tensor; ${function_declarations} -static inline Type & infer_type(const Tensor & t) { +namespace detail { + +static inline TypeExtendedInterface & infer_type(const Tensor & t) { AT_CHECK(t.defined(), "undefined Tensor"); - return t.type(); + return static_cast(t.type()); } -static inline Type & infer_type(const TensorList & tl) { +static inline TypeExtendedInterface & infer_type(const TensorList & tl) { AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); - return tl[0].type(); + return static_cast(tl[0].type()); +} +static inline TypeExtendedInterface & non_specific_type() { + return static_cast(at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)); } + +} // namespace detail + // function definitions are all static inline because // they are one-line statically dispatched functions that // invoke the actual dynamic dispatch on the correct argument diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index 0f315e590d700c..e4a75abb48993e 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -2,13 +2,13 @@ // ${generated_comment} -#include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" namespace at { -struct AT_API TypeDefault : public Type { +struct AT_API TypeDefault : public TypeExtendedInterface { explicit TypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) - : Type(type_id, is_variable, is_undefined) {} + : TypeExtendedInterface(type_id, is_variable, is_undefined) {} // Make sure overload resolution considers the nullary virtual method. // (A single argument overload is generated in the list.) 
diff --git a/aten/src/ATen/templates/TypeExtendedInterface.h b/aten/src/ATen/templates/TypeExtendedInterface.h new file mode 100644 index 00000000000000..82cb658c9eeea8 --- /dev/null +++ b/aten/src/ATen/templates/TypeExtendedInterface.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace at { + +struct AT_API TypeExtendedInterface : public Type { + explicit TypeExtendedInterface(TensorTypeId type_id, bool is_variable, bool is_undefined) + : Type(type_id, is_variable, is_undefined) {} + ${pure_virtual_extended_type_method_declarations} +}; + +} // namespace at diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 9f327fdd5d82d4..c573891f6f04ac 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -214,10 +214,10 @@ class ATenOp : public Operator { DEFINE_IF(int64, Long) CAFFE_THROW("unsupported type annotation: ", name); } - at::Type & stringToType(const std::string & name) { - return at::getNonVariableType(backend(), stringToScalarType(name)); + at::TypeExtendedInterface & stringToType(const std::string & name) { + return static_cast(at::getNonVariableType(backend(), stringToScalarType(name))); } - at::Type * readTypeAttribute(const std::string & name) { + at::TypeExtendedInterface * readTypeAttribute(const std::string & name) { CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); return &stringToType(OperatorBase::GetSingleArgument(name, "")); } diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 18a3db4c7daed3..bc75ac9859dcc5 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -278,7 +278,7 @@ def find_factory_methods(decls): # first tensor input is used to define the output type. defined_inferred_type = True env['statements'].append( - 'auto inferred_type = &({}.type());'.format( + 'auto inferred_type = &(static_cast({}.type()));'.format( arg['name'])) else: init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index d4a9a4eccab86a..589bbf8323f155 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -44,7 +44,7 @@ namespace torch { namespace autograd { VariableType::VariableType(Context* context, Type* baseType) : TypeDefault(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) - , baseType(baseType) + , baseType(static_cast(baseType)) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index fe3e57f4fc0246..b9d84ad85672a5 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -72,7 +72,7 @@ struct TORCH_API VariableType final : public at::TypeDefault { static at::Tensor unpack_opt(const Tensor & t, const char * name, int pos); static std::vector unpack(at::TensorList tl, const char *name, int pos); - at::Type* baseType; + at::TypeExtendedInterface* baseType; std::string str; size_t id_; }; From 3121c8f5265e0f7410d22be27c13c1d55e496043 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Tue, 11 Sep 2018 20:11:26 -0700 Subject: [PATCH 057/237] Update gtest and remove the macro guide on gtest from #11321 (#11417) Summary: Last PR seems to have test failures, re-issuing. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11417 Reviewed By: orionr Differential Revision: D9784706 Pulled By: Yangqing fbshipit-source-id: 9e5f347e19fa2700ff69d2cd69ea7a9e01a91609 --- cmake/Dependencies.cmake | 13 +++++-------- third_party/googletest | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index fed64398276078..551a84180b087a 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -187,20 +187,19 @@ endif() if(BUILD_TEST) # Preserve build options. set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(TEMP_CMAKE_DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX}) # We will build gtest as static libs and embed it directly into the binary. set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) # For gtest, we will simply embed it into our test binaries, so we won't # need to install it. - set(BUILD_GTEST ON) - set(INSTALL_GTEST OFF) + set(BUILD_GTEST ON CACHE BOOL "Build gtest" FORCE) + set(INSTALL_GTEST OFF CACHE BOOL "Install gtest." FORCE) # We currently don't need gmock right now. - set(BUILD_GMOCK OFF) + set(BUILD_GMOCK OFF CACHE BOOL "Build gmock." FORCE) # For Windows, we will check the runtime used is correctly passed in. if (NOT CAFFE2_USE_MSVC_STATIC_RUNTIME) - set(gtest_force_shared_crt ON) + set(gtest_force_shared_crt ON CACHE BOOL "force shared crt on gtest" FORCE) endif() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest) include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) @@ -212,10 +211,8 @@ if(BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark) include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark/include) - # Recover build options. Unfortunately gtest modifies CMAKE_DEBUG_POSTFIX - # in some versions as detailed at https://github.com/google/googletest/issues/1334 + # Recover build options. set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - set(CMAKE_DEBUG_POSTFIX ${TEMP_CMAKE_DEBUG_POSTFIX} CACHE BOOL "Debug postfix" FORCE) endif() # ---[ LMDB diff --git a/third_party/googletest b/third_party/googletest index 69e48e92de4396..2fe3bd994b3189 160000 --- a/third_party/googletest +++ b/third_party/googletest @@ -1 +1 @@ -Subproject commit 69e48e92de43960a316a826293510b7b3deb9eca +Subproject commit 2fe3bd994b3189899d93f1d5a881e725e046fdc2 From d95fedb43631625394120087090caa3560618d2a Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 20:12:34 -0700 Subject: [PATCH 058/237] Use ATen dropout implementation in Dropout module and add FeatureDropout (#11458) Summary: This PR does two things: 1. Replaces the implementation of the `Dropout` module with a call to the ATen function, 2. Replaces `Dropout2d` with a new `FeatureDropout` module that shall take the place of `Dropout2d` and `Dropout3d`. I contemplated calling it `Dropout2d` and making `Dropout3d` an alias for it, but similar to our decision for `BatchNorm{1,2,3}d` (c.f. https://github.com/pytorch/pytorch/pull/9188), we can deviate from Python PyTorch in favor of the ideal-world solution, which is to have a single module, since both actually just call `feature_dropout`. I also replaced the implementation of `dropout3d` with a call to `dropout2d` in Python. The code is the same and it's easier for developers to parse than having to manually match the tokens to make sure it's really 100% the same code (which it is, if I matched the tokens correctly). 
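For illustration, a hedged usage sketch of the two modules after this change (shapes and probabilities are arbitrary and this is not code from the PR):
```
#include <torch/torch.h>

int main() {
  torch::nn::Dropout dropout(0.3);        // element-wise, forwards to torch::dropout
  torch::nn::FeatureDropout drop2d(0.3);  // per-feature-map, forwards to torch::feature_dropout

  auto x = torch::randn({8, 16});          // [batch, features]
  auto y = torch::randn({8, 10, 24, 24});  // [batch, channels, height, width]

  // Noise is only applied while the modules are in training mode.
  auto out1 = dropout->forward(x);  // zeroes individual elements
  auto out2 = drop2d->forward(y);   // zeroes whole channels
  return 0;
}
```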
ebetica ezyang SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/11458 Differential Revision: D9756603 Pulled By: goldsborough fbshipit-source-id: fe847cd2cda2b6da8b06779255d76e32a974807c --- test/cpp/api/integration.cpp | 2 +- .../api/include/torch/nn/modules/dropout.h | 46 ++++++++++--------- torch/csrc/api/src/nn/modules/dropout.cpp | 23 ++-------- torch/nn/functional.py | 2 + 4 files changed, 33 insertions(+), 40 deletions(-) diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index 58e716e3096cfd..a8656a16f5d658 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -336,7 +336,7 @@ TEST_CASE("integration/mnist", "[cuda]") { auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); auto conv2 = model->add(Conv2d(10, 20, 5), "conv2"); auto drop = Dropout(0.3); - auto drop2d = Dropout2d(0.3); + auto drop2d = FeatureDropout(0.3); auto linear1 = model->add(Linear(320, 50), "linear1"); auto linear2 = model->add(Linear(50, 10), "linear2"); diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index f547b5b2fa2d2b..bfe230a597215f 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -9,10 +9,13 @@ namespace torch { namespace nn { + +/// Options for `Dropout` and `FeatureDropout`. struct DropoutOptions { DropoutOptions(double rate); /// The probability with which a particular component of the input is set to /// zero. + /// Changes to this parameter at runtime are effective. TORCH_ARG(double, rate) = 0.5; }; @@ -26,14 +29,7 @@ class DropoutImplBase : public torch::nn::Cloneable { void reset() override; - /// During training, applies a noise mask to the input tensor. - /// During evaluation, applies an identity function. - Tensor forward(Tensor input); - - /// Returns a noise mask that can be applied to the given input tensor. - /// Used inside `forward()` to generate the noise mask for dropout. - virtual Tensor noise_mask(Tensor input) const = 0; - + /// The options used to configure this `Dropout` module. DropoutOptions options; }; } // namespace detail @@ -45,18 +41,26 @@ class DropoutImplBase : public torch::nn::Cloneable { class DropoutImpl : public detail::DropoutImplBase { public: using detail::DropoutImplBase::DropoutImplBase; - Tensor noise_mask(Tensor input) const override; + /// During training, applies a noise mask to the input tensor. + /// During evaluation, applies an identity function. + Tensor forward(Tensor input); }; -/// Applies [Dropout](https://arxiv.org/abs/1207.0580) to inputs with -/// 2-dimensional features. +/// Applies spatial [Dropout](https://arxiv.org/abs/1207.0580) to inputs with +/// 2-D or 3-D features. /// -/// See https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout2d to learn more -/// about the exact semantics of this module. -class Dropout2dImpl : public detail::DropoutImplBase { +/// The equivalent in Python is +/// [Dropout2d](https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout2d) for +/// 2-D features and +/// [Dropout3d](https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout3d) for +/// 3-D features. This `FeatureDropout` module can instead deal with both 2-D +/// and 3-D features. 
+class FeatureDropoutImpl : public detail::DropoutImplBase { public: - using detail::DropoutImplBase::DropoutImplBase; - Tensor noise_mask(Tensor input) const override; + using detail::DropoutImplBase::DropoutImplBase; + /// During training, applies a noise mask to the input tensor. + /// During evaluation, applies an identity function. + Tensor forward(Tensor input); }; /// A `ModuleHolder` subclass for `DropoutImpl`. @@ -65,10 +69,10 @@ class Dropout2dImpl : public detail::DropoutImplBase { /// module storage semantics. TORCH_MODULE(Dropout); -/// A `ModuleHolder` subclass for `Dropout2dImpl`. -/// See the documentation for `Dropout2dImpl` class to learn what methods it -/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's -/// module storage semantics. -TORCH_MODULE(Dropout2d); +/// A `ModuleHolder` subclass for `FeatureDropoutImpl`. +/// See the documentation for `FeatureDropoutImpl` class to learn what methods +/// it provides, or the documentation for `ModuleHolder` to learn about +/// PyTorch's module storage semantics. +TORCH_MODULE(FeatureDropout); } // namespace nn } // namespace torch diff --git a/torch/csrc/api/src/nn/modules/dropout.cpp b/torch/csrc/api/src/nn/modules/dropout.cpp index 68dbac486d2c4c..c11c0e108413ac 100644 --- a/torch/csrc/api/src/nn/modules/dropout.cpp +++ b/torch/csrc/api/src/nn/modules/dropout.cpp @@ -20,31 +20,18 @@ DropoutImplBase::DropoutImplBase(DropoutOptions options_) template void DropoutImplBase::reset() {} -template -Tensor DropoutImplBase::forward(Tensor input) { - if (options.rate_ == 0 || !this->is_training()) { - return input; - } - - auto scale = 1.0f / (1.0f - options.rate_); - auto boolean_mask = noise_mask(input).uniform_(0, 1) > options.rate_; - auto noise = boolean_mask.to(input.dtype()).mul_(scale); - - return input * noise; -} - template class DropoutImplBase; -template class DropoutImplBase; +template class DropoutImplBase; } // namespace detail DropoutOptions::DropoutOptions(double rate) : rate_(rate) {} -Tensor DropoutImpl::noise_mask(Tensor input) const { - return torch::empty_like(input); +Tensor DropoutImpl::forward(Tensor input) { + return torch::dropout(input, options.rate_, this->is_training()); } -Tensor Dropout2dImpl::noise_mask(Tensor input) const { - return torch::empty({input.size(0), input.size(1), 1, 1}, input.options()); +Tensor FeatureDropoutImpl::forward(Tensor input) { + return torch::feature_dropout(input, options.rate_, this->is_training()); } } // namespace nn } // namespace torch diff --git a/torch/nn/functional.py b/torch/nn/functional.py index d9889d631f7fb8..d0514d90bda588 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -661,6 +661,8 @@ def dropout3d(input, p=0.5, training=True, inplace=False): training: apply dropout if is ``True``. Defualt: ``True`` inplace: If set to ``True``, will do this operation in-place. Default: ``False`` """ + # This is 100% the same code as dropout2d. We duplicate this code so that + # stack traces are not confusing. if p < 0 or p > 1: raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) From 045f862574063dfe1f92b84f46cd97b2aeeaf829 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 20:20:16 -0700 Subject: [PATCH 059/237] Use torch::nn::init::xavier_normal_ Summary: The PyTorch C++ API has `torch.nn.init` equivalents that the RNNG can use to initialize the state of its StackRNNs. This gets rid of the `fanInOut_` methods on `Parser` and tidies up `xavierInitialState` a little. 
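For reference, the Python-side counterpart of the initializer the RNNG code now calls is `torch.nn.init.xavier_normal_`; a minimal sketch (the layer sizes here are made up for illustration):

```python
import torch
import torch.nn.init as init

# Xavier/Glorot normal init draws from N(0, std^2) with
# std = gain * sqrt(2 / (fan_in + fan_out)).
weight = torch.empty(256, 128)
init.xavier_normal_(weight)

# Typical use on a module's parameters.
linear = torch.nn.Linear(128, 256)
init.xavier_normal_(linear.weight)
init.constant_(linear.bias, 0.0)
```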
Reviewed By: wowitsmrinal Differential Revision: D9472595 fbshipit-source-id: c202116f32383d3b4bba064c2c0d2656311e1170 --- torch/csrc/api/include/torch/nn.h | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/api/include/torch/nn.h b/torch/csrc/api/include/torch/nn.h index 408330d94232a6..7285a696350b7c 100644 --- a/torch/csrc/api/include/torch/nn.h +++ b/torch/csrc/api/include/torch/nn.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include From 54107ae8cf476b4ebe7c631f75273e0b014e748c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 11 Sep 2018 20:20:54 -0700 Subject: [PATCH 060/237] convert output_device at data_parallel from torch.device to index (#10189) Summary: - fixes #9984 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10189 Differential Revision: D9545390 Pulled By: weiyangfb fbshipit-source-id: 3a6a705437553ba319e9fd4b7f676ff73857a27e --- test/test_c10d.py | 11 +++++++---- test/test_distributed.py | 27 +++++++++++++++++---------- test/test_nn.py | 18 ++++++++++++++++++ torch/nn/parallel/_functions.py | 4 ++++ torch/nn/parallel/data_parallel.py | 22 ++++++++++++---------- torch/nn/parallel/distributed.py | 13 +++++++------ torch/nn/parallel/parallel_apply.py | 8 +++++++- torch/nn/parallel/replicate.py | 3 ++- 8 files changed, 74 insertions(+), 32 deletions(-) diff --git a/test/test_c10d.py b/test/test_c10d.py index 64bedb3183a212..ff9d87be76c84f 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -567,8 +567,7 @@ class DistributedDataParallelTest(MultiProcessTestCase): def world_size(self): return 2 - def _test_ddp_with_process_group(self, process_group): - gpus = gpus_for_rank(self.world_size)[self.rank] + def _test_ddp_with_process_group(self, process_group, gpus): model = Net() ddp_model = DistributedDataParallel( copy.deepcopy(model).cuda(gpus[0]), @@ -620,14 +619,18 @@ def test_gloo_backend(self): options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group) + gpus = gpus_for_rank(self.world_size)[self.rank] + self._test_ddp_with_process_group(process_group, gpus) + self._test_ddp_with_process_group(process_group, list(map(lambda i: torch.device('cuda:' + str(i)), gpus))) @skip_if_not_multigpu @skip_if_not_nccl def test_nccl_backend(self): store = c10d.TCPStore('localhost', self.port, self.is_master) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group) + gpus = gpus_for_rank(self.world_size)[self.rank] + self._test_ddp_with_process_group(process_group, gpus) + self._test_ddp_with_process_group(process_group, list(map(lambda i: torch.device('cuda:' + str(i)), gpus))) @skip_if_not_multigpu def test_dist_broadcast_coalesced(self): diff --git a/test/test_distributed.py b/test/test_distributed.py index 41fa5cdac0c827..abf3663966700b 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -1126,24 +1126,15 @@ def _test_DDP_2iter( # Shuffle the input so that DDP input is different input = input[torch.randperm(batch_size)] - @unittest.skipIf( - BACKEND != "nccl" and BACKEND != "gloo", - "Only Nccl & Gloo backend support DistributedDataParallel", - ) - @skip_if_no_cuda_distributed - @skip_if_no_gpu - def test_DistributedDataParallel(self): + def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): # Run a simple end to end DDP model, use 
result of single node model # as baseline - group, group_id, rank = self._init_global_test() - rank_to_GPU = self._init_multigpu_helper() # cpu training setup model = self._create_Net() # single gpu training setup model_gpu = copy.deepcopy(model) - gpu_subset = list(rank_to_GPU[rank]) model_gpu.cuda(gpu_subset[0]) # DDP training setup @@ -1195,6 +1186,22 @@ def test_DistributedDataParallelCPU(self): ) self._barrier() + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', + "Only Nccl & Gloo backend support DistributedDataParallel") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_DistributedDataParallel(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + gpus = list(rank_to_GPU[rank]) + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank) + + # test output_device + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, output_device=torch.device('cuda')) + + # test device_ids + gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, output_device=torch.device('cuda')) if BACKEND == "gloo" or BACKEND == "nccl": WORLD_SIZE = os.environ["WORLD_SIZE"] diff --git a/test/test_nn.py b/test/test_nn.py index b2597b894803f6..03a35bf10d88f8 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -3154,6 +3154,24 @@ def forward(self, input): self.assertEqual(out.get_device(), 0) self.assertEqual(out.data, expected_out) + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm + def test_data_parallel_device_args(self): + cuda0 = torch.device('cuda:0') + cuda1 = torch.device('cuda:1') + + # test output_device + l = nn.Linear(10, 5).to(cuda0, torch.float) + i = torch.randn(20, 10, dtype=torch.float, device=cuda0, requires_grad=True) + out = dp.data_parallel(l, i, device_ids=(0, 1), output_device=cuda0) + self.assertEqual(out, l(i)) + + # test device_ids + l = nn.Linear(10, 5).to(cuda0, torch.float) + i = torch.randn(20, 10, dtype=torch.float, device=cuda0, requires_grad=True) + out = dp.data_parallel(l, i, device_ids=(cuda0, cuda1), output_device=cuda0) + self.assertEqual(out, l(i)) + def test_state_dict(self): l = nn.Linear(5, 5) block = nn.Module() diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index 48b2a77d2d598d..4b1f58356287e9 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -3,6 +3,7 @@ import torch import torch.cuda.comm as comm from torch.autograd import Function +from torch.cuda._utils import _get_device_index class Broadcast(Function): @@ -11,6 +12,7 @@ class Broadcast(Function): def forward(ctx, target_gpus, *inputs): if not all(input.is_cuda for input in inputs): raise TypeError('Broadcast function not implemented for CPU tensors') + target_gpus = list(map(lambda x: _get_device_index(x, True), target_gpus)) ctx.target_gpus = target_gpus if len(inputs) == 0: return tuple() @@ -50,6 +52,7 @@ class Gather(Function): @staticmethod def forward(ctx, target_device, dim, *inputs): assert all(map(lambda i: i.is_cuda, inputs)) + target_device = _get_device_index(target_device, True) ctx.target_device = target_device ctx.dim = dim ctx.input_gpus = tuple(map(lambda i: i.get_device(), inputs)) @@ -76,6 +79,7 @@ class Scatter(Function): @staticmethod def forward(ctx, target_gpus, chunk_sizes, dim, input): + target_gpus = list(map(lambda x: _get_device_index(x, True), target_gpus)) ctx.dim = dim ctx.input_device = input.get_device() if input.is_cuda else -1 
streams = None diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py index 94b60bda836d12..231963a48d1dd8 100644 --- a/torch/nn/parallel/data_parallel.py +++ b/torch/nn/parallel/data_parallel.py @@ -5,6 +5,7 @@ from .scatter_gather import scatter_kwargs, gather from .replicate import replicate from .parallel_apply import parallel_apply +from torch.cuda._utils import _get_device_index def _check_balance(device_ids): @@ -13,7 +14,7 @@ def _check_balance(device_ids): has less than 75% of the memory or cores of GPU {}. You can do so by setting the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES environment variable.""" - + device_ids = list(map(lambda x: _get_device_index(x, True), device_ids)) dev_props = [torch.cuda.get_device_properties(i) for i in device_ids] def warn_imbalance(get_prop): @@ -77,9 +78,9 @@ class DataParallel(Module): Args: - module: module to be parallelized - device_ids: CUDA devices (default: all devices) - output_device: device location of output (default: device_ids[0]) + module (Module): module to be parallelized + device_ids (list of int or torch.device): CUDA devices (default: all devices) + output_device (int or torch.device): device location of output (default: device_ids[0]) Attributes: module (Module): the module to be parallelized @@ -104,10 +105,11 @@ def __init__(self, module, device_ids=None, output_device=None, dim=0): device_ids = list(range(torch.cuda.device_count())) if output_device is None: output_device = device_ids[0] + self.dim = dim self.module = module - self.device_ids = device_ids - self.output_device = output_device + self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids)) + self.output_device = _get_device_index(output_device, True) _check_balance(self.device_ids) @@ -143,10 +145,10 @@ def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, mo This is the functional version of the DataParallel module. Args: - module: the module to evaluate in parallel - inputs: inputs to the module - device_ids: GPU ids on which to replicate module - output_device: GPU location of the output Use -1 to indicate the CPU. + module (Module): the module to evaluate in parallel + inputs (tensor): inputs to the module + device_ids (list of int or torch.device): GPU ids on which to replicate module + output_device (list of int or torch.device): GPU location of the output Use -1 to indicate the CPU. (default: device_ids[0]) Returns: a Tensor containing the result of module(input) located on diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 175aac70be9f3d..2ea2d451594700 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -12,6 +12,7 @@ from .replicate import replicate from .scatter_gather import scatter_kwargs, gather from .parallel_apply import parallel_apply +from torch.cuda._utils import _get_device_index class DistributedDataParallel(Module): @@ -90,10 +91,10 @@ class DistributedDataParallel(Module): :meth:`forward` method. 
Args: - module: module to be parallelized - device_ids: CUDA devices (default: all devices) - output_device: device location of output (default: device_ids[0]) - broadcast_buffers: flag that enables syncing (broadcasting) buffers of + module (Module): module to be parallelized + device_ids (list of int or torch.device): CUDA devices (default: all devices) + output_device (int or torch.device): device location of output (default: device_ids[0]) + broadcast_buffers (bool): flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function. (default: True) process_group: the c10d process group to be used for distributed data @@ -133,8 +134,8 @@ def __init__(self, module, device_ids=None, self.dim = dim self.module = module - self.device_ids = device_ids - self.output_device = output_device + self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids)) + self.output_device = _get_device_index(output_device, True) self.broadcast_buffers = broadcast_buffers self.allreduce_opts = dist.AllreduceOptions() diff --git a/torch/nn/parallel/parallel_apply.py b/torch/nn/parallel/parallel_apply.py index 6a5ab99cacfae1..438bdb9faf4037 100644 --- a/torch/nn/parallel/parallel_apply.py +++ b/torch/nn/parallel/parallel_apply.py @@ -1,5 +1,6 @@ import threading import torch +from torch.cuda._utils import _get_device_index def get_a_var(obj): @@ -22,6 +23,11 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword) on each of :attr:`devices`. + Args: + modules (Module): modules to be parallelized + inputs (tensor): inputs to the modules + devices (list of int or torch.device): CUDA devices + :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and :attr:`devices` (if given) should all have same length. Moreover, each element of :attr:`inputs` can either be a single object as the only argument @@ -36,7 +42,7 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): assert len(modules) == len(devices) else: devices = [None] * len(modules) - + devices = list(map(lambda x: _get_device_index(x, True), devices)) lock = threading.Lock() results = {} grad_enabled = torch.is_grad_enabled() diff --git a/torch/nn/parallel/replicate.py b/torch/nn/parallel/replicate.py index b82ef08d727bcf..309c0db8f25054 100644 --- a/torch/nn/parallel/replicate.py +++ b/torch/nn/parallel/replicate.py @@ -1,10 +1,11 @@ import torch.cuda.comm as comm +from torch.cuda._utils import _get_device_index def replicate(network, devices, detach=False): from ._functions import Broadcast - devices = tuple(devices) + devices = list(map(lambda x: _get_device_index(x, True), devices)) num_replicas = len(devices) params = list(network.parameters()) From 35348dab103316a929ae55d91a9d03e049d7fc43 Mon Sep 17 00:00:00 2001 From: Rasmus Diederichsen Date: Tue, 11 Sep 2018 20:23:03 -0700 Subject: [PATCH 061/237] WIP: Include note on cudnn determinism in each function backed by cudnn (#11434) Summary: Ping ezyang This addresses your comment in #114. Strangely, when running the doc build (`make html`) none of my changes are actually showing, could you point out what I'm doing wrong? Once #11329 is merged it might make sense to link to the reproducibility note everywhere. 
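In practice, the flag that the new note documents is set like this (a minimal sketch; the convolution is only there to exercise a CuDNN-backed op):

```python
import torch

# Ask CuDNN to select deterministic algorithms, potentially at a performance cost.
torch.backends.cudnn.deterministic = True
# Disabling the autotuner is also common when reproducibility matters.
torch.backends.cudnn.benchmark = False

if torch.cuda.is_available():
    conv = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
    x = torch.randn(1, 3, 32, 32, device='cuda')
    out = conv(x)  # now runs a deterministic CuDNN algorithm
```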
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11434 Differential Revision: D9751208 Pulled By: ezyang fbshipit-source-id: cc672472449564ff099323c39603e8ff2b2d35c9 --- docs/source/cudnn_deterministic.rst | 7 +++++++ torch/nn/functional.py | 14 ++++++++++++++ torch/nn/modules/conv.py | 12 ++++++++++++ torch/nn/modules/loss.py | 5 +++++ 4 files changed, 38 insertions(+) create mode 100644 docs/source/cudnn_deterministic.rst diff --git a/docs/source/cudnn_deterministic.rst b/docs/source/cudnn_deterministic.rst new file mode 100644 index 00000000000000..71c622ea3baeb8 --- /dev/null +++ b/docs/source/cudnn_deterministic.rst @@ -0,0 +1,7 @@ +.. note:: + + In some circumstances when using the CUDA backend with CuDNN, this operator + may select a nondeterministic algorithm to increase performance. If this is + undesirable, you can try to make the operation deterministic (potentially at + a performance cost) by setting ``torch.backends.cudnn.deterministic = + True``. diff --git a/torch/nn/functional.py b/torch/nn/functional.py index d0514d90bda588..7823d445a0c13a 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -67,6 +67,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv1d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kW)` @@ -95,6 +97,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv2d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kH \times kW)` @@ -124,6 +128,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv3d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kT \times kH \times kW)` @@ -152,6 +158,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose1d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kW)` @@ -183,6 +191,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose2d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kH \times kW)` @@ -216,6 +226,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose3d` for details and output shape. +.. 
include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kT \times kH \times kW)` @@ -1438,6 +1450,8 @@ def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, See :class:`~torch.nn.CTCLoss` for details. + .. include:: cudnn_deterministic.rst + Args: log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, `T = input length`, and `N = batch size`. diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 7cbf5fbf604314..a1bfcbc08e097e 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -120,6 +120,8 @@ class Conv1d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(C_\text{in}=C_{in}, C_\text{out}=C_{in} \times K, ..., \text{groups}=C_{in})`. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -243,6 +245,8 @@ class Conv2d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -366,6 +370,8 @@ class Conv3d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -527,6 +533,8 @@ class ConvTranspose1d(_ConvTransposeMixin, _ConvNd): that :attr:`output_padding` is only used to find output shape, but does not actually add zero-padding to output. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -637,6 +645,8 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): that :attr:`output_padding` is only used to find output shape, but does not actually add zero-padding to output. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -777,6 +787,8 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): that :attr:`output_padding` is only used to find output shape, but does not actually add zero-padding to output. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index ef6c8971691945..1737bd5f9da687 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1177,6 +1177,11 @@ class CTCLoss(_Loss): dtype :attr:`torch.int32`. The regular implementation uses the (more common in PyTorch) `torch.long` dtype. + + + .. 
include:: cudnn_deterministic.rst + + """ def __init__(self, blank=0, reduction='elementwise_mean'): From f4d9f39a94f4dea478a489a7c9d31dedbad06bd5 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 20:27:25 -0700 Subject: [PATCH 062/237] Test libtorch on cuda Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11554 Differential Revision: D9784772 Pulled By: goldsborough fbshipit-source-id: c3e071695f56c1f427984f427b1f7722722947d3 --- .jenkins/pytorch/common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 5ce6ee01a46975..ca728df2b826c5 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -112,7 +112,8 @@ else exit 1 fi -if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py3 ]] || \ + [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then BUILD_TEST_LIBTORCH=1 else BUILD_TEST_LIBTORCH=0 From b75c32ded9e4bfdfedfd96e41af762dcdee6697c Mon Sep 17 00:00:00 2001 From: Anders Papitto Date: Tue, 11 Sep 2018 20:31:44 -0700 Subject: [PATCH 063/237] link against TORCH_CUDA_LIBRARIES Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11475 Differential Revision: D9784616 Pulled By: anderspapitto fbshipit-source-id: bb8b443bcb308bbbe9707d265f21e5d00d717d65 --- torch/CMakeLists.txt | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 614dbf3937742d..b0f07a7ef3a73b 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -319,23 +319,11 @@ if(USE_CUDA) set_target_properties(torch PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() set(TORCH_CUDA_LIBRARIES - ${CUDA_CUDA_LIB} - ${CUDA_NVRTC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so ${CUDA_LIBRARIES}) endif() - if(MSVC OR APPLE) - target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) - else() - # TODO: using the full TORCH_CUDA_LIBRARIES here causes some - # builds to fail in CI, as libcuda.so can no longer be found. It's - # not clear why this is the case, and the situation should be - # investigated/cleaned up. Note that the test+jit/test_api - # targets below do require linking against the full - # TORCH_CUDA_LIBRARIES, even on Linux - target_link_libraries(torch caffe2_gpu_library ${CUDA_LIBRARIES}) - endif() + target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(torch PRIVATE USE_CUDA) endif() @@ -412,7 +400,7 @@ if (BUILD_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) ${ATen_CPU_INCLUDE}) if (USE_CUDA) - target_link_libraries(test_jit ${CUDA_LIBRARIES}) + target_link_libraries(test_jit ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB}) endif() endif() @@ -446,7 +434,7 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) "${TORCH_ROOT}/third_party/catch/single_include" ${ATen_CPU_INCLUDE}) - target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES}) + target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB}) if (NOT MSVC) if (APPLE) From 8aa8ad8b019cb5825ddbe3748aa34a2ad12bb600 Mon Sep 17 00:00:00 2001 From: Rasmus Diederichsen Date: Tue, 11 Sep 2018 21:01:43 -0700 Subject: [PATCH 064/237] WIP: Reproducibility note (#11329) Summary: This adds a Note on making experiments reproducible. It also adds Instructions for building the Documentation to `README.md`. 
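Taken together with the CuDNN note above, the recipe this documentation describes amounts to a sketch like the following (seeding Python's built-in `random` module is an extra step not covered by the note, included here for completeness):

```python
import random
import numpy as np
import torch

def seed_everything(seed=0):
    random.seed(seed)        # Python's built-in RNG, if your code uses it
    np.random.seed(seed)     # NumPy RNG
    torch.manual_seed(seed)  # PyTorch RNG for CPU and all CUDA devices
    torch.backends.cudnn.deterministic = True  # trade speed for determinism

seed_everything(0)
```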
Please ping if I missed any requirements. I'm not sure what to do about the submodule changes. Please advise. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11329 Differential Revision: D9784939 Pulled By: ezyang fbshipit-source-id: 5c5acbe343d1fffb15bdcb84c6d8d925c2ffcc5e --- README.md | 13 ++++++++++ docs/source/notes/randomness.rst | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 docs/source/notes/randomness.rst diff --git a/README.md b/README.md index b909001edc6f14..4b996cc1b440a6 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We are in an early-release beta. Expect some adventures and rough edges. - [Binaries](#binaries) - [From Source](#from-source) - [Docker Image](#docker-image) + - [Building the Documentation](#building-the-documentation) - [Previous Versions](#previous-versions) - [Getting Started](#getting-started) - [Communication](#communication) @@ -224,6 +225,18 @@ Please note that PyTorch uses shared memory to share data between processes, so for multithreaded data loaders) the default shared memory segment size that container runs with is not enough, and you should increase shared memory size either with `--ipc=host` or `--shm-size` command line options to `nvidia-docker run`. +### Building the Documentation + +To build documentation in various formats, you will need Sphinx and the +readthedocs theme. + +``` +cd docs/ +pip install -r requirements.txt +``` +You can then build the documentation by running ``make <format>`` from the +``docs/`` folder. Run ``make`` to get a list of all available output formats. + ### Previous Versions Installation instructions and binaries for previous PyTorch versions may be found diff --git a/docs/source/notes/randomness.rst b/docs/source/notes/randomness.rst new file mode 100644 index 00000000000000..5ba2c87a8eecf3 --- /dev/null +++ b/docs/source/notes/randomness.rst @@ -0,0 +1,42 @@ + +Reproducibility +=============== + +Completely reproducible results are not guaranteed across PyTorch releases, +individual commits or different platforms. Furthermore, results need not be +reproducible between CPU and GPU executions, even when using identical seeds. + +However, in order to make computations deterministic on your specific problem on +one specific platform and PyTorch release, there are a couple of steps to take. + +There are two pseudorandom number generators involved in PyTorch, which you will +need to seed manually to make runs reproducible. Furthermore, you should ensure +that all other libraries your code relies on and which use random numbers also +use a fixed seed. + +PyTorch +....... +You can use :meth:`torch.manual_seed()` to seed the RNG for all devices (both +CPU and CUDA):: + + import torch + torch.manual_seed(0) + + +CuDNN +..... +When running on the CuDNN backend, one further option must be set:: + + torch.backends.cudnn.deterministic = True + +.. warning:: + + Deterministic mode can have a performance impact, depending on your model. + +Numpy +..... +If you or any of the libraries you are using rely on Numpy, you should seed the +Numpy RNG as well. This can be done with:: + + import numpy as np + np.random.seed(0) From a11ebfa195db22268e9db562a6295752f55bec4a Mon Sep 17 00:00:00 2001 From: Tongliang Liao Date: Tue, 11 Sep 2018 21:04:10 -0700 Subject: [PATCH 065/237] Add explicit "this->" for nvcc. 
(#11196) Summary: Fix #11195 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11196 Differential Revision: D9737625 Pulled By: ezyang fbshipit-source-id: fb62076f005bd619eba53c0ed3f07683633f6d91 --- caffe2/operators/utility_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index 634281428bb898..c661ec2f5c55ec 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -325,7 +325,7 @@ class WeightedSumOp : public Operator { template bool DoRunWithType() { - const int input_size = InputSize(); + const int input_size = this->InputSize(); CAFFE_ENFORCE_EQ(input_size % 2, 0); const auto& X0 = Input(0); const auto& weight0 = Input(1); From 1a246c9c7eea7dfe8391645540053badab59747e Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 11 Sep 2018 21:07:03 -0700 Subject: [PATCH 066/237] guard spurious cudnn.h include (#11562) Summary: This fixes the build when CuDNN was not found on the system. From the `git blame`, it looks like the bug has been around for 2 years :) Pull Request resolved: https://github.com/pytorch/pytorch/pull/11562 Differential Revision: D9784589 Pulled By: soumith fbshipit-source-id: b33153436dced0a503c9833cdf52f7093f3394b4 --- caffe2/core/common_cudnn.h | 4 ++-- caffe2/python/pybind_state_gpu.cc | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h index a8f2808e30eeab..2bbbce7df1e4ef 100644 --- a/caffe2/core/common_cudnn.h +++ b/caffe2/core/common_cudnn.h @@ -4,8 +4,6 @@ #include #include -#include - #include "caffe2/core/common.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" @@ -16,6 +14,8 @@ #error("This Caffe2 install is not built with cudnn, so you should not include this file."); #endif +#include + static_assert( CUDNN_VERSION >= 5000, "Caffe2 requires cudnn version 5.0 or above."); diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc index 9c15bc2145d5a9..8c547cf8eccca6 100644 --- a/caffe2/python/pybind_state_gpu.cc +++ b/caffe2/python/pybind_state_gpu.cc @@ -10,7 +10,9 @@ #include #include +#ifdef CAFFE2_USE_CUDNN #include "caffe2/core/common_cudnn.h" +#endif // CAFFE2_USE_CUDNN #include "caffe2/core/context_gpu.h" #include "caffe2/operators/operator_fallback_gpu.h" #include "caffe2/python/pybind_state_registry.h" @@ -39,10 +41,12 @@ namespace py = pybind11; void addCUDAGlobalMethods(py::module& m) { m.def("num_cuda_devices", &NumCudaDevices); m.def("get_cuda_version", &CudaVersion); +#ifdef CAFFE2_USE_CUDNN m.def("get_cudnn_version", &cudnnCompiledVersion); m.attr("cudnn_convolution_fwd_algo_count") = py::int_((int) CUDNN_CONVOLUTION_FWD_ALGO_COUNT); m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT); m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT); +#endif m.def("get_cuda_peer_access_pattern", []() { std::vector> pattern; CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern)); From a00fa2c6145685383788e5e3b08c87eadcd69ff2 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 21:47:46 -0700 Subject: [PATCH 067/237] Release GIL when calling into JIT interpreter Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11541 Differential Revision: D9777909 Pulled By: apaszke fbshipit-source-id: d0217e203721262f3f131b54ea78f898df0b54ec --- torch/csrc/jit/init.cpp | 6 +++++- 
torch/csrc/jit/pybind_utils.h | 6 +++++- torch/csrc/jit/python_interpreter.cpp | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index ce472707319fbc..0d607c3c364a80 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -1,4 +1,5 @@ #include "torch/csrc/utils/pybind.h" +#include "torch/csrc/utils/auto_gil.h" #include "torch/csrc/jit/python_tracer.h" #include "torch/csrc/jit/tracer.h" @@ -206,7 +207,10 @@ void initJITBindings(PyObject *module) { .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { const auto & graph = ge.graph(); auto stack = evilDeprecatedBadCreateStackDoNotUse(args, graph->inputs()); - ge.run(stack); + { + AutoNoGIL no_gil_guard; + ge.run(stack); + } return createPyObjectForStack(std::move(stack)); }); diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index b9f9e6bb85fd4e..54414534b06215 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -7,6 +7,7 @@ #include "torch/csrc/jit/type.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/utils/pybind.h" +#include "torch/csrc/utils/auto_gil.h" #include @@ -279,7 +280,10 @@ inline py::object invokeScriptMethodFromPython( script::Method& method, py::args args, py::kwargs kwargs) { auto stack = createStackForSchema(method.getSchema(), std::move(args), std::move(kwargs)); - method.run(stack); + { + AutoNoGIL no_gil_guard; + method.run(stack); + } return createPyObjectForStack(std::move(stack)); } diff --git a/torch/csrc/jit/python_interpreter.cpp b/torch/csrc/jit/python_interpreter.cpp index 5cc4b70b2e9637..85f0c37c4ea7e3 100644 --- a/torch/csrc/jit/python_interpreter.cpp +++ b/torch/csrc/jit/python_interpreter.cpp @@ -28,6 +28,7 @@ namespace torch { namespace jit { namespace { Operation createPythonOperation(Node* op_) { + AutoGIL gil; PythonOp* op = static_cast(op_); py::function func = py::reinterpret_borrow(py::handle(op->pyobj.get())); size_t num_inputs = 0; From 62c9d4ac96182d08c0f17813564e54bc8afb9c06 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 21:48:22 -0700 Subject: [PATCH 068/237] Make .to() methods native functions (to fix JIT tracing) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11491 Differential Revision: D9771121 Pulled By: apaszke fbshipit-source-id: 08d11101fb12093f8cf913b06359adddf3af9da7 --- aten/src/ATen/native/TensorConversions.cpp | 51 ++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 16 +++++ aten/src/ATen/templates/Tensor.h | 6 -- aten/src/ATen/templates/TensorMethods.h | 39 ------------ test/test_jit.py | 28 +++++++++ tools/autograd/gen_python_functions.py | 2 +- tools/autograd/templates/VariableType.h | 1 + .../templates/python_variable_methods.cpp | 60 ++++++++++--------- tools/jit/gen_jit_dispatch.py | 2 +- torch/csrc/jit/tracer.cpp | 22 ++++--- torch/csrc/jit/tracer.h | 4 +- torch/onnx/symbolic.py | 18 ++++++ 12 files changed, 167 insertions(+), 82 deletions(-) create mode 100644 aten/src/ATen/native/TensorConversions.cpp diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp new file mode 100644 index 00000000000000..9605736ee112fe --- /dev/null +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -0,0 +1,51 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +namespace at { +namespace native { + +static void ensure_has_index(Device* device) { + if (!device->is_cuda() || device->has_index()) { + return; + } + 
device->set_index(at::current_device()); +} + +static Tensor to_impl(const Tensor& self, const TensorOptions& options, bool non_blocking) { + return self.type().toBackend(options.backend()).toScalarType(options.dtype()) + .copy(self, non_blocking, options.device()); +} + +Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device && self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().device(device).dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking) { + if (self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, Device device, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device) { + return self; + } + return to_impl(self, self.options().device(device), non_blocking); +} + +Tensor to(const Tensor& self, const Tensor& other, bool non_blocking) { + auto self_options = self.options(); + auto options = other.options(); + if (self_options == options) { + return self; + } + return to_impl(self, options, non_blocking); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 44fea84527acc2..024c33a19c31dc 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2030,6 +2030,22 @@ variants: function, method device_guard: False +- func: to(Tensor self, Device device, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Device device, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Tensor other, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + - func: meshgrid(TensorList tensors) -> TensorList # This has a method dispatch to work around circular include problems diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 50e280be8f2fe9..aab8acd86a6ba5 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -154,12 +154,6 @@ struct AT_API Tensor { Tensor toType(ScalarType t) const; Tensor toBackend(Backend b) const; - /// New-style `to()` methods. - /// NB: These methods are defined in TensorOptions.h. - Tensor to(Device device, ScalarType dtype, bool non_blocking = false) const; - Tensor to(ScalarType dtype, bool non_blocking = false) const; - Tensor to(Device device, bool non_blocking = false) const; - /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. /// Defined in Type.h because of include order issues. bool is_variable() const noexcept; diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 5573f5802809e9..ec86db721b9f04 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -43,45 +43,6 @@ inline TensorOptions Tensor::options() const { .is_variable(is_variable()); } -namespace detail { -inline Tensor to( - const Tensor& tensor, - const TensorOptions& options, - bool non_blocking) { - // Don't copy if the options match. 
- if (tensor.options() == options) { - return tensor; - } - AT_CHECK(tensor.is_variable() == options.is_variable(), - "cannot change is_variable, from: ", tensor.is_variable(), - " to: ", options.is_variable()); - return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()) - .copy(tensor, non_blocking, options.device()); -} -} // namespace detail - -inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) - const { - if (this->device() == device && this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().device(device).dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { - if (this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(Device device, bool non_blocking) const { - if (this->device() == device) { - return *this; - } - return detail::to(*this, options().device(device), non_blocking); -} - inline void Tensor::backward( at::optional gradient, bool keep_graph, diff --git a/test/test_jit.py b/test/test_jit.py index 68aaf4df5d3afd..69347e2d67b111 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1081,6 +1081,34 @@ def test_trace_size(self): def test_trace_size_with_grad(self): self.do_trace_size(True) + def test_trace_casts(self): + casts = [ + lambda x: x.byte(), + lambda x: x.float(), + lambda x: x.cpu(), + lambda x: x.to(device='cpu'), + lambda x: x.to(dtype=torch.int64), + lambda x: x.to(device='cpu', dtype=torch.float), + lambda x: x.to(x) + ] + + def assertContainsCast(trace): + self.assertEqual(sum(n.kind() == 'aten::to' for n in trace.graph.nodes()), 1) + + for cast in casts: + trace = torch.jit.trace(cast, torch.randn(2, 2)) + assertContainsCast(trace) + x = torch.randn(2, 2) + self.assertEqual(trace(x), cast(x)) + + def to_tensor(x, y): + return x.to(y) + + to_tensor_trace = torch.jit.trace(to_tensor, (torch.randn(2, 2), torch.randn(1, 8))) + assertContainsCast(to_tensor_trace) + x, y = torch.randn(2, 2), torch.randn(1, 10) + self.assertEqual(to_tensor_trace(x, y), to_tensor(x, y)) + def test_trace_warn(self): def fn(x): int(x) # Warning 1. diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index b3ea70aa87222e..8712c7c6ca2d43 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -28,7 +28,7 @@ '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', 'arange.*', 'range.*', '_gesv.*', '_getri.*', 'slice', 'randint(_out)?', '_local_scalar', '_local_scalar_dense', - 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear' + 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to' ] # These function signatures are not exposed to Python. 
Note that this signature diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index b9d84ad85672a5..d1e847a141aace 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -29,6 +29,7 @@ using at::TensorList; using at::Type; using at::ScalarType; using at::optional; +using at::Device; void register_variable_type_for(at::Type* baseType); diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index a863a18e04bdf3..4f0bc94199af98 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -230,13 +230,26 @@ static PyObject * THPVariable_invert(PyObject* self, PyObject* args) { END_HANDLE_TH_ERRORS } +static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking) { + AutoNoGIL no_gil; + return self.to(device, non_blocking); +} + +static Tensor dispatch_to(const Tensor & self, ScalarType dtype, bool non_blocking) { + AutoNoGIL no_gil; + return self.to(dtype, non_blocking); +} + +static Tensor dispatch_to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking) { + AutoNoGIL no_gil; + return self.to(device, dtype, non_blocking); +} + static PyObject * THPVariable_cpu(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS auto& self_ = reinterpret_cast(self)->cdata; - auto backend = self_.is_sparse() ? Backend::SparseCPU : Backend::CPU; - auto& type = self_.type().toBackend(backend); - return wrap(torch::utils::dispatch_type_conversion(self_, type)); + return THPVariable_Wrap(dispatch_to(self_, at::Device(at::DeviceType::CPU), false)); END_HANDLE_TH_ERRORS } @@ -250,25 +263,17 @@ static PyObject * THPVariable_cuda(PyObject* self, PyObject* args, PyObject* kwa auto& self_ = reinterpret_cast(self)->cdata; ParsedArgs<2> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); - auto backend = self_.is_sparse() ? at::Backend::SparseCUDA : at::Backend::CUDA; - auto& type = self_.type().toBackend(backend); - auto device_obj = r.device(0); - if (!r.isNone(0) && device_obj.is_cpu()) { - throw std::runtime_error("Invalid device, must be cuda device"); - } - int32_t device_index = -1; - if (device_obj.has_index() && device_obj.is_cuda()) { - device_index = device_obj.index(); - } - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type, device_index, r.toBool(1))); + auto device = r.isNone(0) ? 
at::Device(at::DeviceType::CUDA) : r.device(0); + AT_CHECK(device.is_cuda(), "Invalid device, must be cuda device"); + torch::utils::cuda_lazy_init(); + return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1))); END_HANDLE_TH_ERRORS } static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType) { HANDLE_TH_ERRORS auto& self_ = reinterpret_cast(self)->cdata; - auto& type = self_.type().toScalarType(scalarType); - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type)); + return THPVariable_Wrap(dispatch_to(self_, scalarType, false)); END_HANDLE_TH_ERRORS } static PyObject * THPVariable_byte(PyObject* self, PyObject* args) { @@ -497,18 +502,19 @@ static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwarg auto& device = std::get<0>(parsed); auto& scalarType = std::get<1>(parsed); auto non_blocking = std::get<2>(parsed); - if (!device) { - // device not given - auto& self_ = reinterpret_cast(self)->cdata; - auto& type = self_.type().toScalarType(scalarType.value_or(self_.type().scalarType())); - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type)); + auto& self_ = reinterpret_cast(self)->cdata; + if (device && device->is_cuda()) { + torch::utils::cuda_lazy_init(); + } + if (!device && !scalarType) { + Py_INCREF(self); + return self; + } else if (!device) { + return THPVariable_Wrap(dispatch_to(self_, *scalarType, non_blocking)); + } else if (!scalarType) { + return THPVariable_Wrap(dispatch_to(self_, *device, non_blocking)); } else { - // device and maybe dtype are given - auto& self_ = reinterpret_cast(self)->cdata; - auto& layout = *torch::getLayout(self_.type().backend()); - auto& type = torch::getVariableType(scalarType.value_or(self_.type().scalarType()), layout, device->type()); - const int32_t device_index = type.is_cuda() ? 
device->index() : -1; - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type, device_index, non_blocking)); + return THPVariable_Wrap(dispatch_to(self_, *device, *scalarType, non_blocking)); } Py_RETURN_NONE; END_HANDLE_TH_ERRORS diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 67bba43913ef16..b7326e526baa86 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -124,7 +124,7 @@ def is_magic_method(api_name): return api_name.startswith('__') and api_name.endswith('__') -blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'void*'} +blacklisted_types = {'SparseTensorRef', 'Storage', 'void*'} default_only_types = {'Generator'} diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 0f8d65dcc5873a..da110dd18d3fed 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -50,7 +50,18 @@ void addInputs(Node *n, const char * name, at::Generator * value) { Value * undef_gen = g->insertNode(g->createNoneGenerator())->output(); n->addInput(undef_gen); } -void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::Device value) { + std::vector device = { + static_cast(value.type()), + static_cast(value.index())}; + detail::genericAddInput(n, std::move(device)); +} +void addInputs(Node *n, const char * name, at::Layout value) { + detail::genericAddInput(n, static_cast(value)); +} +void addInputs(Node *n, const char * name, at::ScalarType value) { + detail::genericAddInput(n, static_cast(value)); +} void addInputs(Node *n, const char * name, at::TensorList value) { Graph *g = n->owningGraph(); @@ -60,12 +71,9 @@ void addInputs(Node *n, const char * name, at::TensorList value) { void addInputs(Node* n, const char * name, const at::TensorOptions& options) { // [TensorOptions in script] - update this when you change how we schematize TensorOptions - detail::genericAddInput(n, static_cast(options.dtype())); - detail::genericAddInput(n, static_cast(options.layout())); - std::vector device = { - static_cast(options.device().type()), - static_cast(options.device().index())}; - detail::genericAddInput(n, std::move(device)); + addInputs(n, name, options.dtype()); + addInputs(n, name, options.layout()); + addInputs(n, name, options.device()); } void addInputs(Node *n, const char * name, at::IntList value) { diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 37cd9b332923ee..710aac85223044 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -172,8 +172,10 @@ TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& val TORCH_API void addInputs(Node *n, const char * name, const std::string& value); TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); -TORCH_API void addInputs(Node *n, const char * name, at::Generator * value); +TORCH_API void addInputs(Node *n, const char * name, at::Device value); +TORCH_API void addInputs(Node *n, const char * name, at::Layout value); TORCH_API void addInputs(Node *n, const char * name, at::ScalarType value); +TORCH_API void addInputs(Node *n, const char * name, at::Generator * value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 03e93125f40755..53d4f769a4e56f 100644 --- a/torch/onnx/symbolic.py +++ 
b/torch/onnx/symbolic.py @@ -991,6 +991,24 @@ def topk(g, self, k, dim, largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) + +def to(g, self, *args): + # ONNX doesn't have a concept of a device, so we ignore device casts + if len(args) == 2: + if args[0].type().isSubtypeOf(ListType.ofInts()): + # aten::to(Tensor, Device, bool) + return self + else: + # aten::to(Tensor, ScalarType, bool) + dtype = _get_const(args[0], 'i', 'dtype') + return g.op("Cast", self, to_i=scalar_type_to_onnx[dtype]) + elif len(args) == 3: + # aten::to(Tensor, Device, ScalarType, bool) + dtype = _get_const(args[1], 'i', 'dtype') + return g.op("Cast", self, to_i=scalar_type_to_onnx[dtype]) + else: + raise NotImplementedError("Unknown aten::to signature") + + def repeat(g, self, repeats): if not _is_value(repeats): repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) From 90e31f4896c2062b97f7a4efb73754b840ba5dc6 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 21:55:22 -0700 Subject: [PATCH 069/237] Improve tracer warnings (#11545) Summary: Also, fix a performance bug in `ensureUnique`. Previously it formatted the warning string even though we weren't tracing, so all that work would *always* happen in the hot path and be for nothing. A sample of what the new warnings look like: ``` tmp.py:4: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! int(x) tmp.py:5: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. torch.tensor([1.]) tmp.py:6: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator add_. This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory, but are disjoint (e.g. are outputs of torch.split), this might still be safe. 
torch.split(y, 2, dim=1)[0].add_(2) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11545 Differential Revision: D9782975 Pulled By: apaszke fbshipit-source-id: 5b3abd31366e59c69e0b7ff278042b5563deb5a9 --- .../templates/python_torch_functions.cpp | 8 ++--- .../templates/python_variable_methods.cpp | 14 ++++----- torch/csrc/autograd/python_variable.cpp | 2 +- torch/csrc/jit/tracer.cpp | 29 +++++++++---------- torch/csrc/jit/tracer.h | 15 +++++++++- torch/csrc/jit/tracing_state.h | 8 +++-- torch/jit/__init__.py | 2 +- 7 files changed, 46 insertions(+), 32 deletions(-) diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index a7174b52e52b88..51ded6eb792f34 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -321,7 +321,7 @@ static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.as_tensor"); + jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR); return THPVariable_Wrap(torch::utils::as_tensor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -329,7 +329,7 @@ static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.from_numpy"); + jit::tracer::warn("torch.from_numpy", jit::tracer::WARN_CONSTRUCTOR); auto data = torch::utils::tensor_from_numpy(arg); return THPVariable_Wrap(make_variable(std::move(data), /*requires_grad=*/false)); END_HANDLE_TH_ERRORS @@ -354,7 +354,7 @@ static PyObject * THPVariable__promote_types(PyObject* self, PyObject* args, PyO static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.sparse_coo_tensor"); + jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR); return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -362,7 +362,7 @@ static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.tensor"); + jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR); return THPVariable_Wrap(torch::utils::tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 4f0bc94199af98..d92ad3dbf7688b 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -179,7 +179,7 @@ static int64_t dispatch_to_CLong(const Tensor & self) { static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python float"); + jit::tracer::warn("Converting a tensor to a Python float", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; return wrap(dispatch_to_CDouble(self_)); END_HANDLE_TH_ERRORS @@ -187,7 +187,7 @@ static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - 
jit::tracer::warn("Converting a tensor to a Python integer"); + jit::tracer::warn("Converting a tensor to a Python integer", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (isFloatingType(self_.type().scalarType())) { // we can't dispatch to toCLong here because we want to avoid ATen overflow checks; @@ -203,7 +203,7 @@ static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { // called when used as a slice. static PyObject * THPVariable_index_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python index"); + jit::tracer::warn("Converting a tensor to a Python index", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; // TODO: change the condition to `self_.dim() != 0` once we expose scalars // in PyTorch. @@ -320,7 +320,7 @@ static PyObject * THPVariable_element_size(PyObject* self, PyObject* args) static PyObject * THPVariable_numpy(PyObject* self, PyObject* arg) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a NumPy array"); + jit::tracer::warn("Converting a tensor to a NumPy array", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (self_.requires_grad()) { throw std::runtime_error( @@ -375,7 +375,7 @@ static PyObject * THPVariable_requires_grad_(PyObject* self, PyObject* args, PyO static PyObject * THPVariable_item(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python number"); + jit::tracer::warn("Converting a tensor to a Python number", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (self_.is_floating_point()) { return wrap(dispatch_to_CDouble(self_)); @@ -523,7 +523,7 @@ static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwarg static PyObject * THPVariable_tolist(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python list"); + jit::tracer::warn("Converting a tensor to a Python list", jit::tracer::WARN_PYTHON_DATAFLOW); auto self_ = reinterpret_cast(self)->cdata; return torch::utils::tensor_to_list(self_.data()); END_HANDLE_TH_ERRORS @@ -570,7 +570,7 @@ static PyObject * THPVariable_type(PyObject* self, PyObject* args, PyObject* kwa ${py_methods} static PyObject * THPVariable_bool(PyObject* self, PyObject* args) { - jit::tracer::warn("Converting a tensor to a Python boolean"); + jit::tracer::warn("Converting a tensor to a Python boolean", jit::tracer::WARN_PYTHON_DATAFLOW); return THPVariable_is_nonzero(self, args); } diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 5666da28135f1f..c3f9bd510a8289 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -126,7 +126,7 @@ static void THPVariable_dealloc(THPVariable* self) static PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.Tensor"); + jit::tracer::warn("torch.Tensor", jit::tracer::WARN_CONSTRUCTOR); auto& default_type = torch::tensors::get_default_tensor_type(); auto tensor = torch::utils::legacy_tensor_ctor(default_type, args, kwargs); return THPVariable_NewWithVar(type, std::move(tensor)); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index da110dd18d3fed..c4e4948fa9248a 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -190,12 +190,22 @@ void 
setRecordSourceLocation(void (*v)(Node*)) { void defaultWarn(const std::string& str) { AT_WARN(str); } std::atomic warn_callback { defaultWarn }; -void _do_warn(const char * _reason) { +const char * WARN_PYTHON_DATAFLOW = + " might cause the trace to be incorrect. We can't record the data flow of " + "Python values, so this value will be treated as a constant in the future. " + "This means that the trace might not generalize to other inputs!"; +const char * WARN_CONSTRUCTOR = + " results are registered as constants in the trace. You can safely ignore this " + "warning if you use this function to create tensors out of constant variables " + "that would be the same every time you call this function. In any other case, " + "this might cause the trace to be incorrect."; + +// XXX: _kind can be a nullptr +void _do_warn(const char * _reason, const char * _kind) { std::string reason { _reason }; + std::string kind { _kind ? _kind : "" }; std::ostringstream s; - s << std::string(reason); - s << " might cause the trace to be incorrect. We can't record the data flow of " - " Python values, which means the trace might not generalize to other inputs."; + s << reason << kind; warn_callback.load()(s.str()); } @@ -203,15 +213,4 @@ void setWarn(warn_fn_type fn) { warn_callback.store(fn); } -void ensureUnique(const char * name, const at::Tensor& tensor) { - auto aliases = tensor.storage().use_count(); - if (aliases > 1) { - std::stringstream ss; - ss << "There are " << aliases - << " live references to the tensor being modified when tracing in-place operator " - << name << " which "; - warn(ss.str().c_str()); - } -} - }}} diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 710aac85223044..21a806d86a245e 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -182,7 +182,20 @@ void addInputs(Node *n, const char * name, std::array value) { throw std::runtime_error("Found an unsupported argument type in the JIT tracer. File a bug report."); } -TORCH_API void ensureUnique(const char * name, const at::Tensor& tensor); +inline void ensureUnique(const char * name, const at::Tensor& tensor) { + auto aliases = tensor.storage().use_count(); + if (isTracing() && aliases > 1) { + std::stringstream ss; + ss << "There are " << aliases + << " live references to the data region being modified when tracing in-place operator " + << name << ". This might cause the trace to be incorrect, because all other views " + << "that also reference this data will not not reflect this change in the trace! " + << "On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. 
" + << "are outputs of torch.split), this might still be safe."; + warn(ss.str().c_str()); + } +} + template < typename T, diff --git a/torch/csrc/jit/tracing_state.h b/torch/csrc/jit/tracing_state.h index 34808f0272e467..598c165650daa4 100644 --- a/torch/csrc/jit/tracing_state.h +++ b/torch/csrc/jit/tracing_state.h @@ -95,11 +95,13 @@ inline bool isTracing() { } using warn_fn_type = void (*)(const std::string& msg); -TORCH_API void _do_warn(const char * _reason); -inline void warn(const char * _reason) { +TORCH_API extern const char * WARN_PYTHON_DATAFLOW; +TORCH_API extern const char * WARN_CONSTRUCTOR; +TORCH_API void _do_warn(const char * _reason, const char * _kind); +inline void warn(const char * _reason, const char * _kind=nullptr) { if (auto state = getTracingState()) { if (!state->warn) return; - _do_warn(_reason); + _do_warn(_reason, _kind); } } TORCH_API void setWarn(warn_fn_type fn); diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index c31504ad83ab22..1397f0cd18af24 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -389,7 +389,7 @@ def maybe_warn_nondeterministic(): nondeterministic_ops_warning = "Trace had nondeterministic nodes. Nodes:\n" nondeterministic_ops_warning += "\n".join([indent(str(op)) for op in nondeterm_ops][:20]) nondeterministic_ops_warning += "\nThis may cause errors in trace checking. To disable trace checking,"\ - " pass disable_checks=True to torch.jit.trace()" + " pass check_trace=False to torch.jit.trace()" warnings.warn(nondeterministic_ops_warning, category=TracerWarning, stacklevel=5) def compare_outputs(original, reference, match_what): From 6dcdbd3a1d5772115aee822bec29c21ec790a442 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Tue, 11 Sep 2018 22:05:22 -0700 Subject: [PATCH 070/237] Make C10d support CPU only build (#11513) Summary: This makes torch.distributed works for CPU only build. Also added one more CI test case to cover MPI CPU build. 
All CI tests should cover this change Pull Request resolved: https://github.com/pytorch/pytorch/pull/11513 Differential Revision: D9784546 Pulled By: teng-li fbshipit-source-id: 0976a6b0fd199670926f0273e17ad7d2805e42e7 --- .jenkins/pytorch/build.sh | 2 +- setup.py | 11 ++-- torch/csrc/distributed/c10d/init.cpp | 2 + torch/lib/c10d/CMakeLists.txt | 82 +++++++++++++++++++--------- torch/lib/c10d/ProcessGroupGloo.cpp | 56 +++++++++++++++---- torch/lib/c10d/ProcessGroupGloo.hpp | 11 ++++ torch/lib/c10d/test/CMakeLists.txt | 20 ++++--- 7 files changed, 132 insertions(+), 52 deletions(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index d9e58d7e8b1604..7d491e0fccb7ef 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -11,7 +11,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 fi -if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]] || [[ "$BUILD_ENVIRONMENT" == *-trusty-py2.7.9* ]]; then # TODO: move this to Docker sudo apt-get update sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev diff --git a/setup.py b/setup.py index 1ed87cc7819cf7..03e7b745a83539 100644 --- a/setup.py +++ b/setup.py @@ -448,9 +448,7 @@ def check_file(f): if USE_DISTRIBUTED: if IS_LINUX: libs += ['gloo'] - # TODO: make c10d build without CUDA - if USE_CUDA: - libs += ['c10d'] + libs += ['c10d'] libs += ['THD'] build_libs(libs) @@ -624,7 +622,7 @@ def run(self): if USE_DISTRIBUTED: print('-- Building with THD distributed package ') monkey_patch_THD_link_flags() - if IS_LINUX and USE_CUDA: + if IS_LINUX: print('-- Building with c10d distributed package ') monkey_patch_C10D_inc_flags() else: @@ -964,10 +962,11 @@ def run(self): ] include_dirs += [tmp_install_path + "/include/THD"] main_link_args += [THD_LIB] - if IS_LINUX and USE_CUDA: + if IS_LINUX: extra_compile_args.append('-DUSE_C10D') main_sources.append('torch/csrc/distributed/c10d/init.cpp') - main_sources.append('torch/csrc/distributed/c10d/ddp.cpp') + if USE_CUDA: + main_sources.append('torch/csrc/distributed/c10d/ddp.cpp') main_link_args.append(C10D_LIB) if USE_CUDA: diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index bfe381baf62f1f..d4f9e0f07b1d23 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -375,6 +375,7 @@ PyObject* c10d_init(PyObject* _unused) { &::c10d::ProcessGroup::Work::wait, py::call_guard()); +#ifdef USE_CUDA module.def( "_dist_broadcast_coalesced", &::c10d::distBroadcastCoalesced, @@ -392,6 +393,7 @@ PyObject* c10d_init(PyObject* _unused) { py::arg("broadcast_bucket_size"), py::arg("broadcast_buffers"), py::call_guard()); +#endif Py_RETURN_TRUE; } diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 9a09065bc4e380..c8afadc345d34f 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -14,6 +14,21 @@ if(NOT Caffe2_FOUND) message(FATAL_ERROR "Caffe2 not found") endif() +if(USE_CUDA) + find_package(CUDA) + if(CUDA_FOUND) + set(C10D_USE_CUDA true) + message(STATUS "Building C10D with CUDA support") + add_definitions(-DUSE_CUDA=1) + else() + set(C10D_USE_CUDA false) + message(STATUS "CUDA not found, building C10D 
without CUDA support") + endif() +else() + set(C10D_USE_CUDA false) + message(STATUS "Building C10D without CUDA support") +endif() + find_package(Gloo REQUIRED) if(Gloo_FOUND) message(STATUS "Gloo_LIBRARY: ${Gloo_LIBRARY}") @@ -32,27 +47,24 @@ else() message(STATUS "Not able to find MPI, will compile c10d without MPI support") endif() -find_package(NCCL) -if(NCCL_FOUND) - message(STATUS "NCCL_LIBRARIES: ${NCCL_LIBRARIES}") - message(STATUS "NCCL_INCLUDE_DIRS: ${NCCL_INCLUDE_DIRS}") - IF(NCCL_MAJOR_VERSION AND NOT (NCCL_MAJOR_VERSION LESS 2)) - message(STATUS "NCCL Version 2 or higher found, will " - "compile with NCCL distributed backend") - SET(DISTRIBUTED_NCCL_FOUND TRUE) +if(C10D_USE_CUDA) + find_package(NCCL) + if(NCCL_FOUND) + message(STATUS "NCCL_LIBRARIES: ${NCCL_LIBRARIES}") + message(STATUS "NCCL_INCLUDE_DIRS: ${NCCL_INCLUDE_DIRS}") + if(NCCL_MAJOR_VERSION AND NOT (NCCL_MAJOR_VERSION LESS 2)) + message(STATUS "NCCL Version 2 or higher found, will " + "compile with NCCL distributed backend") + set(DISTRIBUTED_NCCL_FOUND true) + else() + message(STATUS "Found NCCL, but the NCCL version is either not 2+ or not " + "determinable, will not compile with NCCL distributed " + "backend") + endif() else() - message(STATUS "Found NCCL, but the NCCL version is either not 2+ or not " - "determinable, will not compile with NCCL distributed " - "backend") + message(STATUS "Not able to find NCCL, will not " + "compile with NCCL distributed backend") endif() -else() - message(STATUS "Not able to find NCCL, will not " - "compile with NCCL distributed backend") -endif() - -find_package(CUDA REQUIRED) -if(NOT CUDA_FOUND) - message(FATAL_ERROR "CUDA not found") endif() function(copy_header file) @@ -68,7 +80,6 @@ endif() configure_file(cmake/Def.hpp.in ${CMAKE_BINARY_DIR}/include/c10d/Def.hpp @ONLY) set(C10D_SRCS - CUDAUtils.cpp FileStore.cpp ProcessGroup.cpp Store.cpp @@ -78,11 +89,21 @@ set(C10D_SRCS ProcessGroupGloo.cpp ) -set(C10D_LIBS - caffe2_gpu - ${Gloo_LIBRARY} - ${Gloo_NATIVE_LIBRARY} - ) +if(C10D_USE_CUDA) + list(APPEND C10D_SRCS CUDAUtils.cpp) + set(C10D_LIBS + caffe2_gpu + ${Gloo_LIBRARY} + ${Gloo_NATIVE_LIBRARY} + ) +else() + set(C10D_LIBS + caffe2 + ${Gloo_LIBRARY} + ${Gloo_NATIVE_LIBRARY} + ) +endif() + if(DISTRIBUTED_NCCL_FOUND) list(APPEND C10D_SRCS ProcessGroupNCCL.cpp) @@ -108,7 +129,11 @@ target_compile_options(c10d PUBLIC # c10d links to Caffe2/ATen, but the targets don't add TH/THC to the include path target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/TH) -target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/THC) + +if(C10D_USE_CUDA) + target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/THC) +endif() + # For target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..) # For torch/csrc/utils/hash.h and torch/csrc/utils/functional.h @@ -116,7 +141,10 @@ target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
# For target_include_directories(c10d PUBLIC ${GLOO_INCLUDE_DIR}) -copy_header(CUDAUtils.hpp) +if(C10D_USE_CUDA) + copy_header(CUDAUtils.hpp) +endif() + copy_header(FileStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index b7bf001be9f891..124b4d4a9d0cc4 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -3,15 +3,20 @@ #include #include #include + +#ifdef USE_CUDA #include #include #include +#endif + #include #include +#ifdef USE_CUDA #include - #include +#endif #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ @@ -94,6 +99,7 @@ const ::gloo::ReductionFunction* reductionFunction(const ReduceOp& r) { throw std::runtime_error("Unhandled ReduceOp"); } +#ifdef USE_CUDA std::vector getStreamVector(AlgorithmEntry& entry) { std::vector streams(entry.streams.size()); for (size_t i = 0; i < entry.streams.size(); i++) { @@ -124,10 +130,18 @@ void synchronizeStreams(THCState* thcState, AlgorithmEntry* entry) { C10D_CUDA_CHECK(cudaStreamWaitEvent(privateStream, event, 0)); } } +#endif } // namespace -ProcessGroupGloo::WorkGloo::WorkGloo() : completed_(false), cuda_(false) {} +ProcessGroupGloo::WorkGloo::WorkGloo() + : completed_(false) +#ifdef USE_CUDA + , + cuda_(false) +#endif +{ +} ProcessGroupGloo::WorkGloo::~WorkGloo() {} @@ -140,6 +154,7 @@ bool ProcessGroupGloo::WorkGloo::isSuccess() const { } void ProcessGroupGloo::WorkGloo::synchronize() { +#ifdef USE_CUDA if (cuda_) { auto thcState = ::at::globalContext().lazyInitCUDA(); for (size_t i = 0; i < devices_.size(); i++) { @@ -148,6 +163,7 @@ void ProcessGroupGloo::WorkGloo::synchronize() { C10D_CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); } } +#endif } bool ProcessGroupGloo::WorkGloo::wait() { @@ -170,8 +186,8 @@ void ProcessGroupGloo::WorkGloo::finish(const AlgorithmEntry& entry) { { std::unique_lock lock(m_); completed_ = true; +#ifdef USE_CUDA cuda_ = entry.key.type->is_cuda(); - // Populate devices and events so that we can later synchronize // with the operation associated with this work finishing. 
if (cuda_) { @@ -186,6 +202,7 @@ void ProcessGroupGloo::WorkGloo::finish(const AlgorithmEntry& entry) { C10D_CUDA_CHECK(cudaEventRecord(event, stream)); } } +#endif } cv_.notify_all(); } @@ -203,9 +220,7 @@ void ProcessGroupGloo::WorkGloo::finishWithException( ProcessGroupGloo::SendWork::SendWork( at::Tensor& tensor, std::unique_ptr<::gloo::transport::UnboundBuffer> buffer) - : tensor_(tensor), - buffer_(std::move(buffer)) { -} + : tensor_(tensor), buffer_(std::move(buffer)) {} bool ProcessGroupGloo::SendWork::isCompleted() const { // No way to poll for completion yet @@ -235,10 +250,7 @@ ProcessGroupGloo::RecvWork::RecvWork( at::Tensor& tensor, std::unique_ptr<::gloo::transport::UnboundBuffer> buffer, int* srcRank) - : tensor_(tensor), - buffer_(std::move(buffer)), - srcRank_(srcRank) { -} + : tensor_(tensor), buffer_(std::move(buffer)), srcRank_(srcRank) {} bool ProcessGroupGloo::RecvWork::isCompleted() const { // No way to poll for completion yet @@ -295,7 +307,9 @@ ProcessGroupGloo::ProcessGroupGloo( threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this); } +#ifdef USE_CUDA thcState_ = ::at::globalContext().lazyInitCUDA(); +#endif } ProcessGroupGloo::~ProcessGroupGloo() { @@ -405,6 +419,7 @@ void ProcessGroupGloo::createAllreduce(AlgorithmEntry& entry) { return; } +#ifdef USE_CUDA if (backend == at::Backend::CUDA) { if (getSize() < 16) { entry.algorithm = std::unique_ptr<::gloo::Algorithm>( @@ -423,6 +438,7 @@ void ProcessGroupGloo::createAllreduce(AlgorithmEntry& entry) { } return; } +#endif throw std::runtime_error( "Unhandled backend: " + std::string(at::toString(backend))); @@ -448,6 +464,7 @@ void ProcessGroupGloo::createBroadcast(AlgorithmEntry& entry) { return; } +#ifdef USE_CUDA if (backend == at::Backend::CUDA) { entry.algorithm = std::unique_ptr<::gloo::Algorithm>(new ::gloo::CudaBroadcastOneToAll( @@ -459,6 +476,7 @@ void ProcessGroupGloo::createBroadcast(AlgorithmEntry& entry) { getStreamVector(entry))); return; } +#endif throw std::runtime_error( "Unhandled backend: " + std::string(at::toString(backend))); @@ -483,10 +501,18 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { auto& srcSizes = key.srcSizes; entry->src.resize(srcSizes.size()); for (size_t i = 0; i < srcSizes.size(); i++) { +#ifdef USE_CUDA deviceGuard.set_index(key.type->is_cuda() ? key.devices[i] : -1); +#else + if (key.type->is_cuda()) { + throw std::runtime_error("ProcessGroupGloo is not built with CUDA"); + } + deviceGuard.set_index(-1); +#endif entry->src[i] = key.type->tensor(srcSizes[i]); } +#ifdef USE_CUDA // If these are CUDA tensors, create streams and events if (key.type->is_cuda()) { entry->streams.resize(key.devices.size()); @@ -497,6 +523,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { entry->events[i] = CUDAEvent::create(); } } +#endif return entry; } @@ -561,6 +588,7 @@ std::shared_ptr ProcessGroupGloo::broadcast( entry->src[opts.rootTensor].copy_(tensors[opts.rootTensor]); } +#ifdef USE_CUDA // In case of CUDA, ensure that operations that are queued after // this collective wait for the collective to complete. 
if (key.type->is_cuda()) { @@ -576,13 +604,16 @@ std::shared_ptr ProcessGroupGloo::broadcast( } }; } else { +#endif entry->run = [=]() mutable { entry->algorithm->run(); for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; +#ifdef USE_CUDA } +#endif return enqueue(entry); } @@ -607,6 +638,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( entry->src[i].copy_(tensors[i]); } +#ifdef USE_CUDA // In case of CUDA, ensure that operations that are queued after // this collective wait for the collective to complete. if (key.type->is_cuda()) { @@ -622,14 +654,16 @@ std::shared_ptr ProcessGroupGloo::allreduce( } }; } else { +#endif entry->run = [=]() mutable { entry->algorithm->run(); for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; +#ifdef USE_CUDA } - +#endif return enqueue(entry); } diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index 9d5be5d200d3d1..0fe69250b31528 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -15,14 +15,19 @@ #include +#ifdef USE_CUDA #include +#endif + #include #include #include #include +#ifdef USE_CUDA // Forward declaration struct THCState; +#endif namespace c10d { @@ -95,6 +100,7 @@ struct AlgorithmEntry { std::vector dst; std::function run; +#ifdef USE_CUDA // For CUDA tensors, the following happens: // // - Input tensor A is copied to persistent tensor B on the stream @@ -120,6 +126,7 @@ struct AlgorithmEntry { // std::vector streams; std::vector events; +#endif // Used to synchronize between calling thread and worker threads. std::mutex m; @@ -189,6 +196,7 @@ class ProcessGroupGloo : public ProcessGroup { // is probably cheaper (this is highly speculative). std::unique_ptr<::gloo::Exception> ex_; +#ifdef USE_CUDA // List of devices and events so that we can synchronize the // streams of the caller with the kernels that were launched // asynchronously to finish this operation. @@ -208,6 +216,7 @@ class ProcessGroupGloo : public ProcessGroup { bool cuda_; std::vector devices_; std::vector events_; +#endif friend class ProcessGroupGloo; }; @@ -378,8 +387,10 @@ class ProcessGroupGloo : public ProcessGroup { std::condition_variable queueProduceCV_; std::condition_variable queueConsumeCV_; +#ifdef USE_CUDA // Store copy of pointer to THCState retrieved from ::at::globalContext(). 
THCState* thcState_; +#endif }; } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 6bcb2b5d625fb0..caa3d2d056c83c 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -1,5 +1,7 @@ -cuda_add_library(c10d_cuda_test CUDATest.cu) -target_link_libraries(c10d_cuda_test c10d) +if(C10D_USE_CUDA) + cuda_add_library(c10d_cuda_test CUDATest.cu) + target_link_libraries(c10d_cuda_test c10d) +endif() function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) @@ -11,12 +13,16 @@ endfunction() c10d_add_test(FileStoreTest.cpp c10d) c10d_add_test(TCPStoreTest.cpp c10d) -c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d_cuda_test) -c10d_add_test(ProcessGroupGlooAsyncTest.cpp c10d c10d_cuda_test) + +if(C10D_USE_CUDA) + c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d_cuda_test) + c10d_add_test(ProcessGroupGlooAsyncTest.cpp c10d c10d_cuda_test) + if(DISTRIBUTED_NCCL_FOUND) + c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test) + endif() +endif() + if(MPI_FOUND) add_definitions(-DMPIEXEC=${MPIEXEC}) c10d_add_test(ProcessGroupMPITest.cpp c10d) endif() -if(DISTRIBUTED_NCCL_FOUND) - c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test) -endif() From 3e3d8caecd2986aafe46d131a0e25210e936a3ff Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 23:03:46 -0700 Subject: [PATCH 071/237] Allow setting deletion constant Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11529 Differential Revision: D9775398 Pulled By: goldsborough fbshipit-source-id: 8593d1afcf8be3150dcc4a58433f53307e3ae665 --- torch/csrc/autograd/function.cpp | 8 ++++---- torch/csrc/autograd/function.h | 2 ++ torch/csrc/autograd/init.cpp | 6 ++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index e077fadcb8214d..ba33a6f116bac2 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -79,9 +79,9 @@ thread_local size_t deleteFunctionRecursionDepth = 0; * times have gotten the following numbers: ~8300, 3669 */ #ifdef _WIN32 -constexpr size_t kDeleteFunctionMaxRecursionDepth = 3000; +size_t deleteFunctionMaxRecursionDepth = 3000; #else -constexpr size_t kDeleteFunctionMaxRecursionDepth = 10000; +size_t deleteFunctionMaxRecursionDepth = 10000; #endif struct RecursionDepthCounter { @@ -107,7 +107,7 @@ struct RecursionDepthCounter { void deleteFunction(Function* function) { RecursionDepthCounter recursion_depth; - if (recursion_depth.value() > kDeleteFunctionMaxRecursionDepth) { + if (recursion_depth.value() > deleteFunctionMaxRecursionDepth) { deleteFunctionQueue.push_back(function); return; } @@ -117,7 +117,7 @@ void deleteFunction(Function* function) { if (deleteFunctionQueue.empty()) { return; } - if (recursion_depth.value() != kDeleteFunctionMaxRecursionDepth) { + if (recursion_depth.value() != deleteFunctionMaxRecursionDepth) { AT_ERROR("Only one deleter per thread should be able to process " "the delete queue. Please open an issue."); } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b4c90b1489a261..dfe8683197f8ee 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -33,6 +33,8 @@ using edge_list = std::vector; using saved_variable_list = std::vector; using IndexRange = std::pair; +TORCH_API extern size_t deleteFunctionMaxRecursionDepth; + // Custom deleter to prevent stack overflows. 
void deleteFunction(Function* function); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index ca1575699cf08f..b306426025e025 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -5,6 +5,7 @@ #include "torch/csrc/autograd/grad_mode.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/python_function.h" +#include "torch/csrc/autograd/function.h" PyObject * THPAutograd_initExtension(PyObject *_unused) { @@ -49,6 +50,11 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) }); m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); + /// TODO: Replace this ASAP with a better solution for deep autograd graphs! + m.def("_unsafe_set_delete_function_max_recursion_depth", [](size_t value) { + torch::autograd::deleteFunctionMaxRecursionDepth = value; + }); + Py_RETURN_TRUE; } From 659777984783700c64fd73b60cbc6fa91a0a5f23 Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Tue, 11 Sep 2018 23:32:44 -0700 Subject: [PATCH 072/237] Clean up some C++ cruftiness in the script lexer. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11408 Differential Revision: D9772843 Pulled By: resistor fbshipit-source-id: 07f16bf7eaf4f1d8700e46e91a485de4b2d9ed83 --- torch/csrc/jit/script/lexer.cpp | 42 +++++++++++++++ torch/csrc/jit/script/lexer.h | 90 ++++++++++----------------------- 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/torch/csrc/jit/script/lexer.cpp b/torch/csrc/jit/script/lexer.cpp index 55e63df128a4d8..d5410c39b84455 100644 --- a/torch/csrc/jit/script/lexer.cpp +++ b/torch/csrc/jit/script/lexer.cpp @@ -10,6 +10,48 @@ namespace torch { namespace jit { namespace script { +static const std::unordered_map binary_prec = { + {TK_IF, 1}, + {TK_AND, 2}, + {TK_OR, 2}, + // reserve a level for unary not + {'<', 4}, + {'>', 4}, + {TK_EQ, 4}, + {TK_LE, 4}, + {TK_GE, 4}, + {TK_NE, 4}, + {'+', 5}, + {'-', 5}, + {'*', 6}, + {'/', 6}, + {'@', 6}, + {TK_POW, 7}, +}; + +static const std::unordered_map unary_prec = { + {TK_NOT, 3}, + {'-', 8}, + {'*', 8}, +}; + +bool SharedParserData::isUnary(int kind, int* prec) { + auto it = unary_prec.find(kind); + if (it != unary_prec.end()) { + *prec = it->second; + return true; + } + return false; +} +bool SharedParserData::isBinary(int kind, int* prec) { + auto it = binary_prec.find(kind); + if (it != binary_prec.end()) { + *prec = it->second; + return true; + } + return false; +} + int stringToKind(std::string str) { static std::once_flag init_flag; static std::unordered_map str_to_kind; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 45b7061c3bf3d8..617addf486c736 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -8,7 +8,7 @@ #include #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/source_range.h" - +#include namespace torch { namespace jit { @@ -111,34 +111,28 @@ struct TokenTrie { kind = tok; return; } - auto& entry = children[*str]; - if (entry == nullptr) { - entry.reset(new TokenTrie()); + + for (size_t i = 0, e = child_chars.size(); i < e; ++i) { + if (child_chars[i] == *str) { + child_tries[i]->insert(str + 1, tok); + return; + } } - entry->insert(str + 1, tok); + + child_chars.emplace_back(*str); + child_tries.emplace_back(torch::make_unique()); + child_tries.back()->insert(str + 1, tok); } int kind; // 0 == invalid token - std::unordered_map children; + + std::vector child_chars; + std::vector child_tries; }; // stuff that is shared against all TC lexers/parsers 
and is initialized only // once. struct SharedParserData { SharedParserData() : head(new TokenTrie()) { - // listed in increasing order of precedence - std::vector> binary_ops = { - {TK_IF}, - {TK_AND, TK_OR}, - {}, // reserve a level for unary not - {'<', '>', TK_EQ, TK_LE, TK_GE, TK_NE}, - {'+', '-'}, - {'*', '/', '@'}, - {TK_POW}, - }; - std::vector> unary_ops = { - {'-', '*'}, - }; - std::stringstream ss; for (const char* c = valid_single_char_tokens; *c; c++) { std::string str(1, *c); @@ -151,26 +145,6 @@ struct SharedParserData { } TC_FORALL_TOKEN_KINDS(ADD_CASE) #undef ADD_CASE - - // precedence starts at 1 so that there is always a 0 precedence - // less than any other precedence - int prec = 1; - for (auto& group : binary_ops) { - for (auto& element : group) { - binary_prec[element] = prec; - } - prec++; - } - // unary ops - for (auto& group : unary_ops) { - for (auto& element : group) { - unary_prec[element] = prec; - } - prec++; - } - // add unary not separately because it slots into the precedence of - // binary operators - unary_prec[TK_NOT] = binary_prec[TK_AND] + 1; } // 1. skip whitespace // 2. handle comment or newline @@ -195,7 +169,7 @@ struct SharedParserData { return start + len <= str.size() && std::count(str.begin() + start, str.begin() + start + len, c) == len; } - // python conconcatenates all adjacent strings "a" "b" == "ab" + // python concatenates all adjacent strings "a" "b" == "ab" // strings can be enclosed with 1 or 3 single or double quotes // if enclosed with 3 quotes newlines are valid // as elsewhere, backslash and new line should be ignored @@ -314,8 +288,16 @@ struct SharedParserData { // rather the // identifier 'max' if (cur) { - auto it = cur->children.find(str[pos + i]); - cur = (it == cur->children.end()) ? nullptr : it->second.get(); + size_t child_offset = 0; + for (size_t e = cur->child_chars.size(); child_offset < e; ++child_offset) { + if (cur->child_chars[child_offset] == str[pos + i]) + break; + } + + cur = (child_offset == cur->child_chars.size()) + ? 
nullptr + : cur->child_tries[child_offset].get(); + if (cur && cur->kind != 0) { matched = true; *len = i + 1; @@ -325,22 +307,8 @@ struct SharedParserData { } return matched; } - bool isUnary(int kind, int* prec) { - auto it = unary_prec.find(kind); - if (it != unary_prec.end()) { - *prec = it->second; - return true; - } - return false; - } - bool isBinary(int kind, int* prec) { - auto it = binary_prec.find(kind); - if (it != binary_prec.end()) { - *prec = it->second; - return true; - } - return false; - } + bool isUnary(int kind, int* prec); + bool isBinary(int kind, int* prec); bool isRightAssociative(int kind) { switch (kind) { case '?': @@ -356,10 +324,6 @@ struct SharedParserData { return isalpha(n) || n == '_' || (i > 0 && isdigit(n)); } TokenTrieRef head; - std::unordered_map - unary_prec; // map from token to its unary precedence - std::unordered_map - binary_prec; // map from token to its binary precedence }; SharedParserData& sharedParserData(); From 76070fe73c5cce61cb9554990079594f83384629 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Wed, 12 Sep 2018 01:27:18 -0700 Subject: [PATCH 073/237] Make c10d test work on CPU only build (#11567) Summary: Make test work with CPU only build, also fixed the test failures for a long time Pull Request resolved: https://github.com/pytorch/pytorch/pull/11567 Differential Revision: D9785740 Pulled By: teng-li fbshipit-source-id: 61c43b758c1ee53117e30de8074583e6faea863a --- torch/lib/c10d/test/CMakeLists.txt | 2 ++ torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp | 3 +++ torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 13 +++++++++++++ 3 files changed, 18 insertions(+) diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index caa3d2d056c83c..d7a8643e16db1a 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -20,6 +20,8 @@ if(C10D_USE_CUDA) if(DISTRIBUTED_NCCL_FOUND) c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test) endif() +else() + c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d) endif() if(MPI_FOUND) diff --git a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp index 7e7062b9ce5401..74e96acc15152b 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp @@ -52,6 +52,8 @@ class AsyncTest { // Use tiny timeout to make this test run fast ::c10d::ProcessGroupGloo::Options options; options.timeout = std::chrono::milliseconds(50); + ::gloo::transport::tcp::attr attr; + options.devices.push_back(::gloo::transport::tcp::CreateDevice(attr)); pg_ = std::unique_ptr<::c10d::ProcessGroupGloo>( new ::c10d::ProcessGroupGloo(store, rank, size, options)); @@ -262,4 +264,5 @@ int main(int argc, char** argv) { TemporaryFile file; runAsyncBroadcastTest(file.path, 4, 1); } + std::cout << "Test successful" << std::endl; } diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index 89f3377dce1a26..a9b272cab8f5c5 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -11,7 +11,10 @@ #include +#ifdef USE_CUDA #include +#endif + #include #include #include @@ -43,6 +46,8 @@ class SignalTest { // Use tiny timeout to make this test run fast ::c10d::ProcessGroupGloo::Options options; options.timeout = std::chrono::milliseconds(50); + ::gloo::transport::tcp::attr attr; + options.devices.push_back(::gloo::transport::tcp::CreateDevice(attr)); ::c10d::ProcessGroupGloo pg(store, 
rank, size, options); @@ -125,6 +130,9 @@ class CollectiveTest { ::c10d::ProcessGroupGloo::Options options; options.timeout = std::chrono::milliseconds(50); + ::gloo::transport::tcp::attr attr; + options.devices.push_back(::gloo::transport::tcp::CreateDevice(attr)); + pg_ = std::unique_ptr<::c10d::ProcessGroupGloo>( new ::c10d::ProcessGroupGloo(store, rank, size, options)); } @@ -264,20 +272,25 @@ int main(int argc, char** argv) { testAllreduce(file.path, at::Backend::CPU); } +#ifdef USE_CUDA { TemporaryFile file; testAllreduce(file.path, at::Backend::CUDA); } +#endif { TemporaryFile file; testBroadcast(file.path, at::Backend::CPU); } +#ifdef USE_CUDA { TemporaryFile file; testBroadcast(file.path, at::Backend::CUDA); } +#endif + std::cout << "Test successful" << std::endl; return 0; } From efc0f6784aa94d75f2b68e2f42a54253c9729d72 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Wed, 12 Sep 2018 07:05:06 -0700 Subject: [PATCH 074/237] Move some bmm/baddbmm to ATen (#11292) Summary: - Incorporates MKL addition by mingfeima Thank you! (but all errors are my own) - Native CPU implementation: defer to matrix multiplication for small batches and parallelize over batch dimension for large batches. - Add bmm test for CUDA just to be sure. This is a partial fix for #10661, getting down to a factor ~5. Considerable overhead is incurred for the setup in einsum. It might be more efficient to eventually define an optimized contraction functions for arbitrary and several dimensions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11292 Differential Revision: D9784941 Pulled By: ezyang fbshipit-source-id: f6dded2c6f5e8f0461fb38f31f9a824992a58358 --- aten/src/ATen/Declarations.cwrap | 27 +--- aten/src/ATen/native/LinearAlgebra.cpp | 150 +++++++++++++++++++++ aten/src/ATen/native/cuda/LinearAlgebra.cu | 25 ++++ aten/src/ATen/native/mkl/LinearAlgebra.cpp | 95 +++++++++++++ aten/src/ATen/native/native_functions.yaml | 33 +++++ test/test_cuda.py | 1 + 6 files changed, 311 insertions(+), 20 deletions(-) create mode 100644 aten/src/ATen/native/cuda/LinearAlgebra.cu create mode 100644 aten/src/ATen/native/mkl/LinearAlgebra.cpp diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 9d67537ccdedd8..5a6687aaa53762 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -2471,11 +2471,12 @@ - THTensor* mat2 ]] [[ - name: bmm + name: _th_bmm cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2525,10 +2526,12 @@ - THTensor* batch2 ]] [[ - name: baddbmm + name: _th_baddbmm + cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2544,22 +2547,6 @@ - THTensor* batch1 - THTensor* batch2 ]] -[[ - name: baddbmm_ - cname: baddbmm - return: argument 0 - arguments: - - THTensor* self - - arg: real beta - default: AS_REAL(1) - kwarg_only: True - - THTensor* self - - arg: real alpha - default: AS_REAL(1) - kwarg_only: True - - THTensor* batch1 - - THTensor* batch2 -]] [[ name: addcmul variants: diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index d596a7cb99a265..50726cb99b81b9 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1,7 +1,10 @@ #include "ATen/ATen.h" #include "ATen/ExpandUtils.h" +#include "ATen/Dispatch.h" #include "ATen/NativeFunctions.h" #include "ATen/native/LinearAlgebraUtils.h" +#include 
"ATen/TensorUtils.h" +#include "ATen/Parallel.h" #include #include #include @@ -222,6 +225,153 @@ Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const T return at::_addr_out(result, self, vec1, vec2, beta, alpha); } +template +inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + int64_t bs = result.size(0); + int64_t is = result.size(1); + int64_t js = result.size(2); + int64_t ks = self.size(2); + + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + auto r0 = result.accessor(); + auto s0 = self.accessor(); + auto m0 = mat2.accessor(); + + int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); + parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) { + for (int64_t b = b_begin; b < b_end; b++) { + auto r1 = r0[b]; + auto s1 = s0[b]; + auto m1 = m0[b]; + for (int64_t i = 0; i < is; i++) { + auto r2 = r1[i]; + auto s2 = s1[i]; + for (int64_t j = 0; j < js; j++) { + scalar_t &r = r2[j]; + if (is_bmm) { + r = 0; + for (int64_t k = 0; k < ks; k++) { + r += s2[k] * m1[k][j]; + } + } else { + r *= beta; + for (int64_t k = 0; k < ks; k++) { + r += alpha * s2[k] * m1[k][j]; + } + } + } + } + } + }); +} + +// This tries to apply some optimizations to bmm/baddbmm: +// - When the operand size is small, computation are parallelized over the batch +// dimension using OMP and naive matrix multiplication is applied. +// - When the operand size is larger than the threshold, if compiled with MKL, MKL's batch gemm is used. +// - Otherwise, we use a series of matrix multiplications. +// The threshold of 400 for the first has not been thoroughly benchmarked yet and may have room for further +// optimization, it likely depends on the characteristics of the CPU, MKL will be different from non-MKL etc., +// but this seems to be a first starting point. + +static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha, bool is_bmm_out) { + // is_bmm_out: true for bmm_out, false for baddbmm_ + // self_or_result is "self" for baddbmm_ and "result" for bmm_out + CheckedFrom c = (is_bmm_out ? "bmm" : "baddbmm"); + TensorArg self_arg(self_or_result, is_bmm_out ? 
"self" : "result", 0); + TensorArg b1_arg(batch1, "batch1", 1); + TensorArg b2_arg(batch2, "batch2", 2); + checkDim(c, b1_arg, 3); + checkDim(c, b2_arg, 3); + + int64_t bs = batch1.size(0); + checkSize(c, b2_arg, 0, bs); + int64_t contraction_size = batch1.size(2); + int64_t res_rows = batch1.size(1); + int64_t res_cols = batch2.size(2); + checkSize(c, b2_arg, 1, contraction_size); + + if (is_bmm_out) { + self_or_result.resize_({bs, res_rows, res_cols}); + } else { + checkSize(c, self_arg, 0, bs); + checkSize(c, self_arg, 1, res_rows); + checkSize(c, self_arg, 2, res_cols); + } + + // handle pathological cases that blas may not like + if (self_or_result.numel() == 0) { + return self_or_result; + } else if (contraction_size == 0) { + return self_or_result.zero_(); + } + + auto batch_items_contiguous_or_transposed = [&](const Tensor& t) { + return (t.stride(2) == 1 && t.stride(1) == t.size(2)) + || (t.stride(1) == 1 && t.stride(2) == t.size(1)); + }; + + if (contraction_size * res_rows * res_cols < 400) { + if (is_bmm_out) { + AT_DISPATCH_ALL_TYPES(batch1.type(), "bmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } else { + AT_DISPATCH_ALL_TYPES(batch1.type(), "baddbmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } + } else if (at::hasMKL() && at::native::is_floating_point(self_or_result) + && batch_items_contiguous_or_transposed(batch1) + && batch_items_contiguous_or_transposed(batch2) + && self_or_result.is_contiguous()) { + at::native::_baddbmm_mkl_(self_or_result, batch1, batch2, beta, alpha); + } else { // split along batch dimension + if (is_bmm_out) { + for (int64_t b = 0; b < bs; b++) { + auto r = self_or_result.select(0, b); + at::native::mm_out(r, batch1.select(0, b), batch2.select(0, b)); + } + } else { + for (int64_t b = 0; b < bs; b++) { + self_or_result.select(0, b).addmm_(batch1.select(0, b), batch2.select(0, b), beta, alpha); + } + } + } + return self_or_result; +} + + +Tensor baddbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor result = self.type().tensor(); + return at::native::baddbmm_out_cpu(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cpu(Tensor &result, const Tensor& self_, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor self; + std::tie(self) = expand_size(self_, {batch1.size(0), batch1.size(1), batch2.size(2)}, "baddbmm"); + result.resize_(self.sizes()); + result.copy_(self); + return at::native::baddbmm__cpu(result, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return bmm_out_or_baddbmm_(self, batch1, batch2, beta, alpha, false); +} + +Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) { + Tensor result = self.type().tensor(); + return at::native::bmm_out_cpu(result, self, mat2); +} + +Tensor& bmm_out_cpu(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + Scalar beta(0.0); + Scalar alpha(1.0); + return bmm_out_or_baddbmm_(result, batch1, batch2, beta, alpha, true); +} + Tensor dot(const Tensor& self, const Tensor& tensor) { check_1d(self, "self", "dot"); check_1d(tensor, "tensor", "dot"); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu new file mode 100644 index 00000000000000..1c3609f50b201c --- /dev/null +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -0,0 +1,25 @@ +#include "ATen/ATen.h" + +namespace 
at { namespace native { + +Tensor baddbmm_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm(self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cuda(Tensor &result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cuda(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(self, self, batch1, batch2, beta, alpha); +} + +Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) { + return _th_bmm(self, mat2); +} + +Tensor& bmm_out_cuda(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + return _th_bmm_out(result, batch1, batch2); +} + +} } diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp new file mode 100644 index 00000000000000..062dd56d2ca300 --- /dev/null +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -0,0 +1,95 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Config.h" + +#if !AT_MKL_ENABLED() + +namespace at { namespace native { + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + AT_ERROR("bmm: ATen not compiled with MKL support"); +} + +}} + +#else // AT_MKL_ENABLED + +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/Dispatch.h" +#include "ATen/Utils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace at { namespace native { + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const float alpha, + const float** A, const float** B, const float beta, float** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_sgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const double alpha, + const double** A, const double** B, const double beta, double** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_dgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +template +static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + auto is_transposed = [&](const Tensor& t) { + return t.stride(0) == 1 && t.stride(1) == t.size(0); + }; + const CBLAS_TRANSPOSE trans_A = is_transposed(mat1[0]) ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_B = is_transposed(mat2[0]) ? 
CblasTrans : CblasNoTrans; + + const int batch_size = mat1.size(0); + const int M = mat1.size(1); + const int N = mat2.size(2); + const int K = mat1.size(2); + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + std::vector A(batch_size); + std::vector B(batch_size); + std::vector C(batch_size); + for (int64_t batch = 0; batch < batch_size; batch++) { + A[batch] = mat1[batch].data(); + B[batch] = mat2[batch].data(); + C[batch] = res[batch].data(); + } + + gemm_batched(trans_A, trans_B, batch_size, M, N, K, alpha, A.data(), B.data(), beta, C.data()); +} + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + // checks are done in native/LinearAlgebra.cpp + AT_DISPATCH_FLOATING_TYPES(self.type(), "baddbmm__mkl", [&] { + baddbmm_mkl_template(self, batch1, batch2, beta, alpha); + }); + + return self; +} + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 024c33a19c31dc..cdd7a2655473f1 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -245,6 +245,27 @@ CPU: _atan_out_cpu CUDA: _atan_out_cuda +- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU: baddbmm_cpu + CUDA: baddbmm_cuda + +- func: baddbmm_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + dispatch: + CPU: baddbmm__cpu + CUDA: baddbmm__cuda + +- func: _baddbmm_mkl_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: baddbmm_out(Tensor result, Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: baddbmm_out_cpu + CUDA: baddbmm_out_cuda + - func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor - func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor @@ -281,6 +302,18 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor +- func: bmm(Tensor self, Tensor mat2) -> Tensor + variants: function, method + dispatch: + CPU: bmm_cpu + CUDA: bmm_cuda + +- func: bmm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor + variants: function + dispatch: + CPU: bmm_out_cpu + CUDA: bmm_out_cuda + - func: broadcast_tensors(TensorList tensors) -> TensorList - func: cat(TensorList tensors, int64_t dim=0) -> Tensor diff --git a/test/test_cuda.py b/test/test_cuda.py index 71e4a8220c6e68..d75f667281cf99 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -280,6 +280,7 @@ def tmp(t): types, False, "skipIfRocm:HalfTensor"), ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars', types, False, "skipIfRocm:HalfTensor"), + ('bmm', small_3d, lambda t: [small_3d(t)], '', float_types_no_half, False, "skipIfRocm:HalfTensor"), ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], '', types, False, "skipIfRocm:HalfTensor"), ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], From 6fc18a75417264f1a43b874dd16948f158d80c2a Mon Sep 17 00:00:00 2001 From: Rasmus Diederichsen Date: Wed, 12 Sep 2018 08:18:52 -0700 Subject: [PATCH 075/237] Typo fix in randomness.rst (#11571) Summary: "need to 
be" -> "need not be" Pull Request resolved: https://github.com/pytorch/pytorch/pull/11571 Differential Revision: D9786001 Pulled By: soumith fbshipit-source-id: 7cc408f5c8bfcc56d4b5c153646f30e1cec37539 --- docs/source/notes/randomness.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/notes/randomness.rst b/docs/source/notes/randomness.rst index 5ba2c87a8eecf3..c8c7b710d8e5cf 100644 --- a/docs/source/notes/randomness.rst +++ b/docs/source/notes/randomness.rst @@ -3,7 +3,7 @@ Reproducibility =============== Completely reproducible results are not guaranteed across PyTorch releases, -individual commits or different platforms. Furthermore, results need to be +individual commits or different platforms. Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds. However, in order to make computations deterministic on your specific problem on From f0a284502ad07c3a2088454f3d282db7d81f231c Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 09:06:39 -0700 Subject: [PATCH 076/237] Document BatchNorm and update default behavior (#11484) Summary: This PR: 1. Documents `BatchNorm`, 2. Makes a number of API changes after reconsidering some quirks: 1. The default value for the `stateful` parameter used to be `false`, but the most common usage of `BatchNorm` out of the wild is certainly stateful, and the default in Python is also statefulness. So we change the default to stateful. 2. The `pure_forward` function used to use the internal running mean and variance variables instead of the ones supplied to that function call when `stateful` was true, which certainly seems odd. When you call `pure_forward` you would certainly expect the values you pass explicitly to be used. This is now fixed. 3. Adds tests for `BatchNorm`, finally. 
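As a point of reference for the new defaults, here is a short Python sketch (illustrative only, not part of this patch) of the Python-side behavior the C++ module is being aligned with: the Python BatchNorm modules track running statistics by default, and `torch.nn.functional.batch_norm` normalizes with exactly the statistics it is given, which mirrors what `pure_forward` now does.

```python
# Sketch of the Python behavior the C++ defaults now match; not part of this patch.
import torch
import torch.nn as nn
import torch.nn.functional as F

bn = nn.BatchNorm1d(5)          # stateful by default: track_running_stats=True, affine=True
x = torch.randn(8, 5)
bn.train()
bn(x)                           # a training-mode forward updates bn.running_mean / bn.running_var

# Explicitly supplied statistics are the ones used, analogous to pure_forward:
mean, var = torch.randn(5), torch.rand(5)
out = F.batch_norm(x, mean, var, training=False, eps=1e-5)
expected = (x - mean) / torch.sqrt(var + 1e-5)
print(torch.allclose(out, expected))   # True
```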
ebetica apaszke ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11484 Reviewed By: pjh5 Differential Revision: D9779618 Pulled By: goldsborough fbshipit-source-id: 59ba760e085c01454b75644b24b22317b688e459 --- test/cpp/api/integration.cpp | 10 ++- test/cpp/api/modules.cpp | 61 ++++++++++++++++++- .../api/include/torch/nn/modules/batchnorm.h | 56 ++++++++++++++++- .../api/include/torch/nn/modules/linear.h | 1 + torch/csrc/api/src/nn/modules/batchnorm.cpp | 15 ++--- 5 files changed, 128 insertions(+), 15 deletions(-) diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index a8656a16f5d658..8d753197754c19 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -281,8 +281,8 @@ TEST_CASE("integration/cartpole") { for (auto i = 0U; i < saved_log_probs.size(); i++) { auto r = rewards[i] - saved_values[i].toCFloat(); policy_loss.push_back(-r * saved_log_probs[i]); - value_loss.push_back(torch::smooth_l1_loss( - saved_values[i], torch::ones(1) * rewards[i])); + value_loss.push_back( + torch::smooth_l1_loss(saved_values[i], torch::ones(1) * rewards[i])); } auto loss = @@ -370,12 +370,10 @@ TEST_CASE("integration/mnist/batchnorm", "[cuda]") { torch::manual_seed(0); auto model = std::make_shared(); auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); - auto batchnorm2d = - model->add(BatchNorm(BatchNormOptions(10).stateful(true)), "batchnorm2d"); + auto batchnorm2d = model->add(BatchNorm(10), "batchnorm2d"); auto conv2 = model->add(Conv2d(10, 20, 5), "conv2"); auto linear1 = model->add(Linear(320, 50), "linear1"); - auto batchnorm1 = - model->add(BatchNorm(BatchNormOptions(50).stateful(true)), "batchnorm1"); + auto batchnorm1 = model->add(BatchNorm(50), "batchnorm1"); auto linear2 = model->add(Linear(50, 10), "linear2"); auto forward = [&](torch::Tensor x) { diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 2da1e1c45bf6f6..928a39f7d8b44a 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -12,6 +12,8 @@ #include +using Catch::StartsWith; + using namespace torch::nn; using namespace torch::test; @@ -238,10 +240,67 @@ TEST_CASE("modules") { REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { - auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); + auto functional = + Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } + + SECTION("batchnorm") { + { + BatchNorm bn(5); + + // Is stateful by default. + REQUIRE(bn->options.stateful()); + + REQUIRE(bn->running_mean.defined()); + REQUIRE(bn->running_mean.dim() == 1); + REQUIRE(bn->running_mean.size(0) == 5); + + REQUIRE(bn->running_variance.defined()); + REQUIRE(bn->running_variance.dim() == 1); + REQUIRE(bn->running_variance.size(0) == 5); + + // Is affine by default. + REQUIRE(bn->options.affine()); + + REQUIRE(bn->weight.defined()); + REQUIRE(bn->weight.dim() == 1); + REQUIRE(bn->weight.size(0) == 5); + + REQUIRE(bn->bias.defined()); + REQUIRE(bn->bias.dim() == 1); + REQUIRE(bn->bias.size(0) == 5); + } + { + BatchNorm bn(BatchNormOptions(5).stateful(false).affine(false)); + + REQUIRE(!bn->running_mean.defined()); + REQUIRE(!bn->running_variance.defined()); + REQUIRE(!bn->weight.defined()); + REQUIRE(!bn->bias.defined()); + + REQUIRE_THROWS_WITH( + bn->forward(torch::ones({2, 5})), + StartsWith("Calling BatchNorm::forward is only permitted " + "when the 'stateful' option is true (was false). 
" + "Use BatchNorm::pure_forward instead.")); + } + { + BatchNorm bn(BatchNormOptions(5).affine(false)); + bn->eval(); + + // Want to make sure we use the supplied values in `pure_forward` even if + // we are stateful. + auto input = torch::randn({2, 5}); + auto mean = torch::randn(5); + auto variance = torch::rand(5); + auto output = bn->pure_forward(input, mean, variance); + auto expected = + (input - mean) / torch::sqrt(variance + bn->options.eps()); + REQUIRE(output.allclose(expected)); + } + } } TEST_CASE("modules_cuda", "[cuda]") { diff --git a/torch/csrc/api/include/torch/nn/modules/batchnorm.h b/torch/csrc/api/include/torch/nn/modules/batchnorm.h index d4dd669a286fef..ff5cebc7ed6b19 100644 --- a/torch/csrc/api/include/torch/nn/modules/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/modules/batchnorm.h @@ -8,15 +8,43 @@ namespace torch { namespace nn { + +/// Options for the `BatchNorm` module. struct BatchNormOptions { /* implicit */ BatchNormOptions(int64_t features); + /// The number of features of the input tensor. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(int64_t, features); + /// Whether to learn a scale and bias that are applied in an affine + /// transformation on the input. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(bool, affine) = true; - TORCH_ARG(bool, stateful) = false; + /// Whether to store and update batch statistics (mean and variance) in the + /// module. If `false`, you should call `pure_forward` and supply those batch + /// statistics yourself. + /// Changing this parameter after construction __has no effect__. + TORCH_ARG(bool, stateful) = true; + /// The epsilon value added for numerical stability. + /// Changing this parameter after construction __is effective__. TORCH_ARG(double, eps) = 1e-5; + /// A momentum multiplier for the mean and variance. + /// Changing this parameter after construction __is effective__. TORCH_ARG(double, momentum) = 0.1; }; +/// Applies [Batch Normalization](https://arxiv.org/abs/1502.03167) to an input. +/// +/// Refer to the documentation for +/// [`BatchNorm1d`](https://pytorch.org/docs/stable/nn.html#torch.nn.BatchNorm1d) +/// in PyTorch to learn more about the exact semantics of this module, __but see +/// the note below regarding differences between the Python and C++ API__. +/// +/// \rst +/// .. attention:: +/// In the Python API, there are separate implementations for 1-D, 2-D and 3-D +/// BatchNorm. In C++, there is only one `BatchNorm` module, which works for +/// any of these dimensions. +/// \endrst class BatchNormImpl : public torch::nn::Cloneable { public: explicit BatchNormImpl(int64_t features) @@ -25,16 +53,42 @@ class BatchNormImpl : public torch::nn::Cloneable { void reset() override; + /// Applies batch normalization on the `input` using the stored mean and + /// variance. + /// + /// The module must be constructed with `stateful = true` when calling this + /// method, as the module will otherwise not store running statistics. If you + /// want to supply the mean and variance yourself, use `pure_forward`. Tensor forward(Tensor input); + + /// Applies batch normalization on the `input` using the given `mean` and + /// `variance` statistics. Tensor pure_forward(Tensor input, Tensor mean, Tensor variance); + /// The options with which this module was constructed. BatchNormOptions options; + + /// The learned weight. + /// Only defined if the `affine` option was `true` upon construction. Tensor weight; + + /// The learned bias. 
+ /// Only defined if the `affine` option was `true` upon construction. Tensor bias; + + /// The running mean. + /// Only defined if the `stateful` option was `true` upon construction. Tensor running_mean; + + /// The running variance. + /// Only defined if the `stateful` option was `true` upon construction. Tensor running_variance; }; +/// A `ModuleHolder` subclass for `BatchNormImpl`. +/// See the documentation for `BatchNormImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(BatchNorm); } // namespace nn diff --git a/torch/csrc/api/include/torch/nn/modules/linear.h b/torch/csrc/api/include/torch/nn/modules/linear.h index 1d1a25da36163b..3e8044f5715826 100644 --- a/torch/csrc/api/include/torch/nn/modules/linear.h +++ b/torch/csrc/api/include/torch/nn/modules/linear.h @@ -10,6 +10,7 @@ namespace torch { namespace nn { +/// Options for the `Linear` module. struct LinearOptions { LinearOptions(int64_t in, int64_t out); /// The number of input features (columns of the input matrix). diff --git a/torch/csrc/api/src/nn/modules/batchnorm.cpp b/torch/csrc/api/src/nn/modules/batchnorm.cpp index 1a9ae761c9f240..6a3920fe56a66d 100644 --- a/torch/csrc/api/src/nn/modules/batchnorm.cpp +++ b/torch/csrc/api/src/nn/modules/batchnorm.cpp @@ -34,14 +34,15 @@ void BatchNormImpl::reset() { } Tensor BatchNormImpl::forward(Tensor input) { - return pure_forward(input, Tensor(), Tensor()); + AT_CHECK( + options.stateful_, + "Calling BatchNorm::forward is only permitted when " + "the 'stateful' option is true (was false). " + "Use BatchNorm::pure_forward instead."); + return pure_forward(input, running_mean, running_variance); } Tensor BatchNormImpl::pure_forward(Tensor input, Tensor mean, Tensor variance) { - auto& running_mean = options.stateful_ ? this->running_mean : mean; - auto& running_variance = - options.stateful_ ? this->running_variance : variance; - if (is_training()) { const auto num_channels = input.dim() > 1 ? input.size(1) : 1; AT_CHECK( @@ -53,8 +54,8 @@ Tensor BatchNormImpl::pure_forward(Tensor input, Tensor mean, Tensor variance) { input, weight, bias, - running_mean, - running_variance, + mean, + variance, is_training(), options.momentum_, options.eps_, From e5dd77c7ad362298bcc035ccedc447d161169721 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 09:09:42 -0700 Subject: [PATCH 077/237] Sync all libnccl soversions, not just libnccl.so.1 (#11575) Summary: Fixes: ``` /bin/ld: warning: libnccl.so.1, needed by /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so, not found (try using -rp ath or -rpath-link) /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclAllReduce' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclBcast' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclCommInitAll' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclGetErrorString' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclReduceScatter' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclAllGather' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclReduce' ``` Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11575 Differential Revision: D9789956 Pulled By: ezyang fbshipit-source-id: 63e48763cc233be9d137cec721b239159b511a24 --- tools/build_pytorch_libs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index e0593badd97614..66c8e6d65cf0b0 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -313,7 +313,7 @@ function build_caffe2() { # This is needed by the aten tests built with caffe2 if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.1" ]; then # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl - $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + find "${INSTALL_DIR}/lib" -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "lib/" fi ${CMAKE_INSTALL} -j"$MAX_JOBS" From 12f4c46eea1177d64a3c91fb23744d806e21c612 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 12 Sep 2018 09:33:47 -0700 Subject: [PATCH 078/237] caffe2::StorageImpl use at::DataPtr (#11282) Summary: See title Pull Request resolved: https://github.com/pytorch/pytorch/pull/11282 Reviewed By: ezyang Differential Revision: D9658503 Pulled By: cpuhrsch fbshipit-source-id: 42fa73c979692cb1069c0345744a85d12150745c --- aten/src/ATen/core/TensorImpl.h | 6 + caffe2/contrib/aten/aten_op_template.h | 20 ++- caffe2/core/storage.h | 234 ++++++++++++++----------- caffe2/core/tensor.h | 107 +++++++---- caffe2/python/pybind_state_dlpack.h | 21 ++- 5 files changed, 238 insertions(+), 150 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index e0a649a49b6ccd..3f41c1163c563b 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -100,6 +100,12 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { return storage_.data() + storage_offset_; } + inline void* data() const { + return static_cast( + static_cast(storage_.data()) + + at::elementSize(scalar_type_) * storage_offset_); + } + template inline T * unsafe_data() const { return storage_.unsafe_data() + storage_offset_; diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index c573891f6f04ac..8b12cc9dad16d8 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -91,14 +91,22 @@ class ATenOp : public Operator { void assignTo(Tensor* dst, const at::Tensor& src_) { at::Tensor src = src_.contiguous(); auto at_sizes = src.sizes(); - std::vector dims(at_sizes.begin(),at_sizes.end()); + caffe2::TypeMeta type_meta = typeMetaFor(src); + at::Device device = src.device(); + at::TensorImpl* src_impl = src.unsafeReleaseTensorImpl(); + std::vector dims(at_sizes.begin(), at_sizes.end()); dst->Resize(dims); dst->ShareExternalPointer( - src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable { - // return a closure that holds a handle to t until it is called - // to keep the aten memory alive - return src.reset(); - }); + at::DataPtr( + src_impl->data(), + static_cast(src_impl), + [](void* t_ptr) -> void { + at::TensorImpl* local_impl = static_cast(t_ptr); + c10::raw::intrusive_ptr::decref(local_impl); + }, + device), + type_meta, + 0); } void assignListStartingAt( size_t offset, diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 973b07ee630642..974985625df6b1 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -16,52 +16,55 @@ #include "caffe2/core/logging.h" #include 
"caffe2/core/typeid.h" +#include #include #include #include namespace caffe2 { -using DataType = TypeMeta; -using DataPtr = std::shared_ptr; -using at::DeviceType; - class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl() = delete; StorageImpl(const StorageImpl&) = delete; StorageImpl& operator=(const StorageImpl&) = delete; - explicit StorageImpl(DeviceType device_type) : device_type_(device_type) {} - StorageImpl(DeviceType device_type, TypeMeta data_type) - : data_type_(data_type), device_type_(device_type) {} - template + // Rule of Five + StorageImpl(StorageImpl&&) = default; + ~StorageImpl() = default; + StorageImpl& operator=(StorageImpl&&) = default; + StorageImpl( - DeviceType device_type, TypeMeta data_type, - void* src, - size_t capacity, - Deleter d = nullptr) - : data_type_(data_type), device_type_(device_type) { - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To create storage with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(static_cast(&d))[0] == - nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. - data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) + : data_type_(data_type), + data_ptr_(std::move(data_ptr)), + numel_(numel), + resizable_(resizable), + allocator_(allocator) { + if (numel > 0) { + CAFFE_ENFORCE( + data_type.id() != TypeIdentifier::uninitialized(), + "Constructing a storage with meta of unknown type and non-zero numel"); } - capacity_ = capacity; } + explicit StorageImpl(at::DeviceType device_type) + : StorageImpl(device_type, TypeMeta()) {} + StorageImpl(at::DeviceType device_type, TypeMeta data_type) + : StorageImpl( + data_type, + 0, + at::DataPtr(nullptr, at::Device(device_type)), + nullptr, + true) {} + void reset() { - data_ptr_.reset(); - capacity_ = 0; + data_ptr_.clear(); + numel_ = 0; } template @@ -77,117 +80,135 @@ class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { return data_ptr_.get(); } - DataPtr& data_ptr() { + at::DataPtr& data_ptr() { return data_ptr_; } - const DataPtr& data_ptr() const { + const at::DataPtr& data_ptr() const { return data_ptr_; } - void set_dtype(const DataType& data_type) { + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + std::swap(data_ptr_, data_ptr); + return std::move(data_ptr); + }; + + void set_dtype(const TypeMeta& data_type) { + int64_t capacity = numel_ * data_type_.itemsize(); data_type_ = data_type; + numel_ = capacity / data_type_.itemsize(); } - const DataType& dtype() const { + const TypeMeta& dtype() const { return data_type_; } + const at::Allocator* allocator() const { + return allocator_; + }; + // You generally shouldn't use this method, but it is occasionally + // useful if you want to override how a tensor will be reallocated, + // after it was already allocated (and its initial allocator was + // set) + void set_allocator(at::Allocator* allocator) { + allocator_ = allocator; + } + size_t capacity() const { - return capacity_; + return numel_ * itemsize(); } int64_t numel() const { - return capacity_ / itemsize(); + return numel_; } // TODO: remove later void set_numel(int64_t numel) { - capacity_ = numel * itemsize(); + numel_ = numel; } - inline DeviceType 
device_type() const { - return device_type_; + at::DeviceType device_type() const { + return data_ptr_.device().type(); } inline size_t itemsize() const { return data_type_.itemsize(); } - // Rule of Five - StorageImpl(StorageImpl&&) = default; - ~StorageImpl() = default; - StorageImpl& operator=(StorageImpl&&) = default; + bool resizable() const { + return resizable_; + }; + + void set_resizable(bool resizable) { + resizable_ = resizable; + } /** * Can only be called when use_count is 1 */ - template void UniqueStorageShareExternalPointer( void* src, - const DataType& data_type, + const TypeMeta& data_type, size_t capacity, - Deleter d = nullptr) { + MemoryDeleter d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { data_type_ = data_type; CAFFE_ENFORCE_WITH_CALLER( data_type_.id() != TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to have meta " "already set."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(&d)[0] == nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. - data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); - } - capacity_ = capacity; + data_ptr_ = std::move(data_ptr); + // NOTE: data_type might change and so it's also possible that capacity + // might not be divisible by itemsize. There is no way for us to keep track + // of the exact capacity if we're not explicity storing is. More conrectely + // capacity() might not return the value that was set here, if itemsize does + // not evenly divide it. 
+ numel_ = capacity / data_type_.itemsize(); } private: - int64_t capacity_ = 0; - DataType data_type_; - DataPtr data_ptr_; + TypeMeta data_type_; + at::DataPtr data_ptr_; + int64_t numel_; + bool resizable_; + at::Allocator* allocator_; // allocator_ takes precedence over StaticContext from device_type_ // Allocator* allocator_; - DeviceType device_type_ = CPU; + // at::DeviceType device_type_ = CPU; }; class CAFFE2_API Storage { public: Storage() {} - Storage(DeviceType device_type) + Storage(at::DeviceType device_type) : storage_impl_(c10::make_intrusive(device_type)) {} - Storage(DeviceType device_type, TypeMeta data_type) + Storage(at::DeviceType device_type, TypeMeta data_type) : storage_impl_( c10::make_intrusive(device_type, data_type)) {} - template Storage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) - : storage_impl_(c10::make_intrusive( - src, - device_type, - TypeMeta::Make(), - capacity, - d)) {} - - template - Storage( - void* src, - DeviceType device_type, TypeMeta data_type, - size_t capacity, - Deleter d = nullptr) + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) : storage_impl_(c10::make_intrusive( - device_type, data_type, - src, - capacity, - d)) {} + numel, + std::move(data_ptr), + allocator, + resizable)) {} void reset() { storage_impl_->reset(); @@ -206,19 +227,23 @@ class CAFFE2_API Storage { return storage_impl_->data(); } - DataPtr& data_ptr() { + at::DataPtr& data_ptr() { return storage_impl_->data_ptr(); } - const DataPtr& data_ptr() const { + const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + }; - void set_dtype(const DataType& data_type) { + void set_dtype(const TypeMeta& data_type) { storage_impl_->set_dtype(data_type); } - const DataType& dtype() const { + const TypeMeta& dtype() const { return storage_impl_->dtype(); } size_t capacity() const { @@ -234,10 +259,14 @@ class CAFFE2_API Storage { storage_impl_->set_numel(numel); } - DeviceType device_type() const { + at::DeviceType device_type() const { return storage_impl_->device_type(); } + const at::Allocator* allocator() const { + return storage_impl_->allocator(); + } + inline size_t itemsize() const { return storage_impl_->itemsize(); } @@ -250,38 +279,35 @@ class CAFFE2_API Storage { return storage_impl_.unique(); } - template void UniqueStorageShareExternalPointer( void* src, - const DataType& data_type, + const TypeMeta& data_type, size_t capacity, - Deleter d = nullptr) { + MemoryDeleter d = nullptr) { CAFFE_ENFORCE_WITH_CALLER( storage_impl_.unique(), "UniqueStorageShareExternalPointer can only be called when \ use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( + storage_impl_->UniqueStorageShareExternalPointer( src, data_type, capacity, d); } + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + storage_impl_.unique(), + "UniqueStorageShareExternalPointer can only be called when \ + use_count == 1"); + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } + protected: c10::intrusive_ptr storage_impl_; }; -/** - * Create a Storage given an external pointer `src`. 
- * `device_type`: the device type of the storage - * `capacity`: the capacity of the Tensor - */ -template -Storage CreateStorage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) { - return CreateStorage(src, device_type, TypeMeta::Make(), capacity, d); -} - } // namespace caffe2 #endif // CAFFE2_CORE_STORAGE_H_ diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 392cb523c21b44..70bdbf83189ac5 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -349,7 +349,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return; } // Old data is discarded - storage_.data_ptr().reset(); + storage_.data_ptr().clear(); auto oldSize = numel_; auto oldDims = dims_; Resize(newCapacity); @@ -509,17 +509,34 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * using it. If a Deleter object is passed in, when this tensor is reallocated * or freed, the deleter function is going to be called. */ - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) { - ShareExternalPointer(src, TypeMeta::Make(), capacity, d); + template + void + ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { + ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { + ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); } - template void ShareExternalPointer( void* src, const TypeMeta& data_type, size_t capacity = 0, - Deleter d = nullptr) { + MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + ShareExternalPointer( + at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { CAFFE_ENFORCE_WITH_CALLER( data_type.id() != TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to pass in an " @@ -531,10 +548,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_WITH_CALLER( numel_ >= 0, "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer(src, data_type, capacity, d); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); } else { + int64_t numel = capacity / data_type.itemsize(); // Create a new Storage - storage_ = Storage(src, GetDeviceType(), data_type, capacity, d); + storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); } } @@ -585,50 +604,61 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { return storage_.data(); } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); bool had_special_dtor = storage_.dtype().dtor() != nullptr; if (storage_.unique()) { storage_.set_dtype(meta); - // TODO: recalcuate numel when we store numel instead of capacity in - // Storage } else { if (storage_.dtype() != meta) { storage_ = Storage(storage_.device_type(), meta); } } - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. 
You probably need to call Resize() " - "before calling mutable_data()"); // We can reuse the existing buffer if the current data does not have // a special destructor and the new data doesn't have a special // constructor. if (numel_ == 0 || (meta.ctor() == nullptr && !had_special_dtor && - storage_.capacity() >= numel_ * storage_.itemsize())) { + storage_.numel() >= numel_)) { return storage_.data(); } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); if (meta.ctor()) { // For types that need placement new, we will call it, as well as // making sure that when the data is freed, it calls the right // destruction procedure. auto size = numel_; auto dtor = storage_.dtype().dtor(); - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - auto deleter = ptr_and_deleter.second; - storage_.data_ptr().reset( - ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void { - dtor(ptr, size); - deleter(ptr); - }); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); storage_.dtype().ctor()(storage_.data(), numel_); } else { // For fundamental type, new and delete is easier. auto ptr_and_deleter = GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.data_ptr().reset( - ptr_and_deleter.first, ptr_and_deleter.second); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); } storage_.set_numel(numel_); return storage_.data(); @@ -1026,19 +1056,32 @@ class CAFFE2_API Tensor final { impl_.get()->ShareData(*src.impl_.get()); } - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) - const { - impl_.get()->ShareExternalPointer(src, capacity, d); + template + void ShareExternalPointer( + T* src, + size_t capacity = 0, + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const { + impl_.get()->ShareExternalPointer(std::move(data_ptr), capacity); } - template void ShareExternalPointer( void* src, const TypeMeta& meta, size_t capacity = 0, - Deleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, meta, capacity, d); + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, meta, capacity, d); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity); } inline const void* raw_data() const { diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 8b6f5e1a5c3033..e0122fdcc9983a 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -108,16 +108,21 @@ class DLPackWrapper { } tensor->Resize(dims); - const auto& meta = DLTypeToCaffe(dlTensor->dtype); + caffe2::TypeMeta meta = 
DLTypeToCaffe(dlTensor->dtype); + at::Device device = at::Device(tensor->GetDeviceType()); tensor->ShareExternalPointer( - ((int8_t*)dlTensor->data) + dlTensor->byte_offset, + at::DataPtr( + (void*)(((int8_t*)dlTensor->data) + dlTensor->byte_offset), + static_cast(dlMTensor), + [](void* t_ptr) -> void { + DLManagedTensor* mt_ptr = static_cast(t_ptr); + if (mt_ptr->destructor) { + mt_ptr->destructor(mt_ptr); + } + }, + device), meta, - 0, - [dlMTensor](void*) { - if (dlMTensor->destructor) { - dlMTensor->destructor(dlMTensor); - } - }); + 0); } Tensor* tensor; From 6398d626f4b191dc437cb1ef0f12fc579ffaf0ee Mon Sep 17 00:00:00 2001 From: zou3519 Date: Wed, 12 Sep 2018 10:54:32 -0700 Subject: [PATCH 079/237] Warn that export+import module always load onto the CPU (#11485) Summary: Test Plan `cd docs && make html` ![image](https://user-images.githubusercontent.com/5652049/45325074-ed04e480-b51d-11e8-9d2d-685dbe8a08e9.png) cc zdevito apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11485 Differential Revision: D9772119 Pulled By: zou3519 fbshipit-source-id: 3dcb16c9edc2e8deebef17accf91a1c7d4dc9063 --- docs/source/jit.rst | 4 ++++ torch/jit/__init__.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 35e62f52ecefde..8bc4ce6e336fc9 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -33,6 +33,10 @@ Creating Torch Script Code To be able to save a module, it must not make any calls to native python functions. This means that all submodules must be subclasses of ScriptModules as well. + .. DANGER:: + All modules, no matter their device, are always loaded onto the CPU during loading. + This is different from :func:`torch.load`'s semantics and may change in the future. + .. autofunction:: load diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 1397f0cd18af24..f40cade7fec486 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -62,6 +62,10 @@ def load(filename): r""" Load a ``ScriptModule`` previously saved with :func:`save ` + .. DANGER:: + All previously saved modules, no matter their device, are always loaded onto the CPU. + This is different from :func:`torch.load`'s semantics and may change in the future. + Arguments: filename (string): the file to load From 23d55883c0ae1362f0acdd7b59075566ab4147e2 Mon Sep 17 00:00:00 2001 From: Yan Zhu Date: Wed, 12 Sep 2018 11:17:52 -0700 Subject: [PATCH 080/237] minor formatting error log (#11528) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11528 as title Reviewed By: chocjy Differential Revision: D9773214 fbshipit-source-id: b7dd4c19ab83a18f344de8e71ce5b3bf74d1af72 --- caffe2/core/tensor.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 70bdbf83189ac5..6fd35133811cd9 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -583,8 +583,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { IsType(), "Tensor type mismatch, caller expects elements to be ", TypeMeta::TypeName(), - " while tensor contains ", - storage_.dtype().name()); + ", while tensor contains ", + storage_.dtype().name(), + ". 
"); return static_cast(storage_.data()); } From 13b05c8c78dc03c41e51cb7077910ac88f9b580b Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 12 Sep 2018 11:18:42 -0700 Subject: [PATCH 081/237] Add EndToEndHybridModel CUDA tests (#11544) Summary: Also adds two additional tests that check for memory leaks while the relevant graph executors are alive: - (minimal test): Create a ScriptModule, keep it alive, and test that it does not leak memory while it is alive - (large test) Do MNIST training with a traced MNIST module and test that no memory is leaked while the traced module (with graph executor) is alive cc apaszke zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/11544 Reviewed By: apaszke Differential Revision: D9778479 Pulled By: zou3519 fbshipit-source-id: 2d6cdea81dd1264f2c0396b662f70fdafecb3647 --- test/test_jit.py | 183 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 148 insertions(+), 35 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 69347e2d67b111..0295ebaf70afe9 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2620,6 +2620,29 @@ def _check_code(self, code_str, fn_name, inputs): cu = torch.jit.CompilationUnit(code_str) self.assertEqual(cu.func(*inputs), scope[fn_name](*inputs)) + @unittest.skipIf(not RUN_CUDA, 'no CUDA') + def test_scriptmodule_releases_tensors_cuda(self): + @torch.jit.script + def fn(x, y): + return x.sigmoid() * y.tanh() + + def test(backward=False): + x = torch.randn(3, 3, dtype=torch.double, device='cuda', requires_grad=True) + y = torch.randn(3, 3, dtype=torch.double, device='cuda', requires_grad=True) + out = fn(x, y) + if backward: + out.sum().backward() + + with self.assertLeaksNoCudaTensors(): + test() + test() + test() + + with self.assertLeaksNoCudaTensors(): + test(backward=True) + test(backward=True) + test(backward=True) + def test_index(self): def consec(size, start=0): numel = torch.tensor(size).prod().item() @@ -6760,9 +6783,29 @@ def foo(x): self.assertEqual(foo(input), input) +class MnistNet(nn.Module): + def __init__(self): + super(MnistNet, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + class TestEndToEndHybridFrontendModels(JitTestCase): - def test_dcgan_models(self): + @staticmethod + def _test_dcgan_models(self, device, check_export_import=True): class DCGANGenerator(nn.Module): def __init__(self, nz, ngf, nc): super(DCGANGenerator, self).__init__() @@ -6820,10 +6863,22 @@ def forward(self, input): return self.main(input).view(-1, 1).squeeze(1) bs, nz, ngf, nc, ndf = 5, 6, 9, 3, 10 - self.checkTrace(DCGANGenerator(nz, ngf, nc), (torch.rand(bs, nz, 1, 1),)) - example_input = DCGANGenerator(nz, ngf, nc)(torch.rand(bs, nz, 1, 1)) - self.checkTrace(DCGANDiscriminator(nc, ndf), (example_input,)) + self.checkTrace(DCGANGenerator(nz, ngf, nc).to(device), + (torch.rand(bs, nz, 1, 1, device=device),), + export_import=check_export_import) + example_input = DCGANGenerator(nz, ngf, nc).to(device)(torch.rand(bs, nz, 1, 1, device=device)) + self.checkTrace(DCGANDiscriminator(nc, ndf).to(device), (example_input,), + export_import=check_export_import) + + 
def test_dcgan_models(self): + self._test_dcgan_models(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_dcgan_models_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_dcgan_models(self, device='cuda', check_export_import=False) + # XXX: When this is fixed, write a CUDA test for this. @unittest.skip('https://github.com/pytorch/pytorch/issues/8439 InstanceNormalization bug') def test_neural_style(self): class TransformerNet(torch.nn.Module): @@ -6924,29 +6979,45 @@ def forward(self, x): self.checkTrace(TransformerNet(), (torch.rand(5, 3, 224, 224),)) + @staticmethod + def _test_mnist(self, device, check_export_import=True): + # eval() is present because dropout makes this nondeterministic + self.checkTrace(MnistNet().to(device).eval(), (torch.rand(5, 1, 28, 28, device=device),), + export_import=check_export_import) + def test_mnist(self): - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) + self._test_mnist(self, device='cpu') - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=1) + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_mnist_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_mnist(self, device='cuda', check_export_import=False) - # eval() is present because dropout makes this nondeterministic - self.checkTrace(Net().eval(), (torch.rand(5, 1, 28, 28),)) + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_mnist_training_leaks_no_memory_cuda(self): + net = MnistNet().cuda() + traced_net = torch.jit.trace(net, [torch.randn(5, 1, 28, 28, device='cuda')]) - def test_reinforcement_learning(self): + def train(iters): + for _ in range(iters): + # Get some fake data + inp = torch.randn(5, 1, 28, 28, device='cuda') + out = traced_net(inp) + + # Here's some fake loss + out.sum().backward() + + # Zero out grads + traced_net.zero_grad() + + # Set it up so the params have .grad fields so they are not reported as leaks + train(1) + + with self.assertLeaksNoCudaTensors(): + train(5) + + @staticmethod + def _test_reinforcement_learning(self, device, test_export_import=True): class Policy(nn.Module): def __init__(self): super(Policy, self).__init__() @@ -6958,10 +7029,19 @@ def forward(self, x): action_scores = self.affine2(x) return F.softmax(action_scores, dim=1) - self.checkTrace(Policy(), (torch.rand(1, 4),)) + self.checkTrace(Policy().to(device), (torch.rand(1, 4, device=device),), + export_import=test_export_import) - @skipIfRocm - def test_snli(self): + def test_reinforcement_learning(self): + self._test_reinforcement_learning(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_reinforcement_learning_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_reinforcement_learning(self, device='cuda', test_export_import=False) + + @staticmethod + def _test_snli(self, device, check_export_import=True): class Bottle(nn.Module): def forward(self, input): @@ -7045,13 +7125,24 @@ class Config: n_layers = 2 n_cells = 4 # 2 * n_layers because birnn = True - premise = torch.LongTensor(48, 128).random_(0, 100) - 
hypothesis = torch.LongTensor(24, 128).random_(0, 100) + premise = torch.LongTensor(48, 128).random_(0, 100).to(device) + hypothesis = torch.LongTensor(24, 128).random_(0, 100).to(device) - self.checkTrace(SNLIClassifier(Config()), (premise, hypothesis), inputs_require_grads=False) + self.checkTrace(SNLIClassifier(Config()).to(device), (premise, hypothesis), + inputs_require_grads=False, export_import=check_export_import) @skipIfRocm - def test_super_resolution(self): + def test_snli(self): + self._test_snli(self, device='cpu') + + @skipIfRocm + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_snli_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_snli(self, device='cuda', check_export_import=False) + + @staticmethod + def _test_super_resolution(self, device, check_export_import=True): import torch.nn.init as init class Net(nn.Module): @@ -7073,8 +7164,19 @@ def forward(self, x): x = self.pixel_shuffle(self.conv4(x)) return x - net = Net(upscale_factor=4) - self.checkTrace(net, (torch.rand(5, 1, 64, 64),)) + net = Net(upscale_factor=4).to(device) + self.checkTrace(net, (torch.rand(5, 1, 64, 64, device=device),), + export_import=check_export_import) + + @skipIfRocm + def test_super_resolution(self): + self._test_super_resolution(self, device='cpu') + + @skipIfRocm + @unittest.skipIf(not RUN_CUDA, 'no CUDA') + def test_super_resolution_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_super_resolution(self, device='cuda', check_export_import=False) @suppress_warnings def test_time_sequence_prediction(self): @@ -7131,9 +7233,11 @@ def forward(self, input): return outputs # TODO: toggle export_import once above issues are fixed - self.checkTrace(Sequence(), (torch.rand(3, 4),), export_import=False) + self.checkTrace(Sequence(), (torch.rand(3, 4),), + export_import=False) - def test_vae(self): + @staticmethod + def _test_vae(self, device, check_export_import=True): class VAE(nn.Module): def __init__(self): super(VAE, self).__init__() @@ -7166,7 +7270,16 @@ def forward(self, x): return self.decode(z), mu, logvar # eval() is present because randn_like makes this nondeterministic - self.checkTrace(VAE().eval(), (torch.rand(128, 1, 28, 28),)) + self.checkTrace(VAE().to(device).eval(), (torch.rand(128, 1, 28, 28, device=device),), + export_import=check_export_import) + + def test_vae(self): + self._test_vae(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_vae_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_vae(self, device='cuda', check_export_import=False) # Smoke tests for export methods From 17e76e26c84d61ab9251660ccbcb1cf7494936a9 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Wed, 12 Sep 2018 12:05:19 -0700 Subject: [PATCH 082/237] Add trigonometry functions to docs/source/onnx.rst Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11581 Differential Revision: D9794449 Pulled By: soumith fbshipit-source-id: 1218fcf8969a10ffbfefd3ced7fee9fe7df296f1 --- docs/source/onnx.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index e461cef4be68c9..90bf2878e7b730 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -169,6 +169,12 @@ The following operators are supported: * ge * le * exp +* sin +* cos +* tan +* asin +* acos +* atan * permute * Conv * BatchNorm From ad7936e10839edb5622a3c0391ae1dadfc79597c Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Wed, 12 Sep 2018 
12:21:20 -0700 Subject: [PATCH 083/237] Fix reloading modules back into python (#11552) Summary: This changes the way module import works so that when a module is reloaded in python it becomes a ScriptModule and not a _C.ScriptModule Pull Request resolved: https://github.com/pytorch/pytorch/pull/11552 Differential Revision: D9782751 Pulled By: zdevito fbshipit-source-id: 9576850b75494b228ce3def94c0d371a4a44b11d --- test/test_jit.py | 26 +++++++++++++++++++++ torch/csrc/jit/import.cpp | 42 +++++++++++++++++++--------------- torch/csrc/jit/import.h | 7 ++++-- torch/csrc/jit/script/init.cpp | 8 +++---- torch/jit/__init__.py | 11 ++++++++- 5 files changed, 68 insertions(+), 26 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 0295ebaf70afe9..e4797c57dda3b5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1734,6 +1734,32 @@ def test_export_dropout(self): x = torch.randn(3, 4) self.assertEqual(traced(x), imported(x)) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + def test_cuda_export_restore(self): + class Sub(torch.jit.ScriptModule): + def __init__(self): + super(Sub, self).__init__() + self.weight = nn.Parameter(torch.randn(3, 4)) + + @torch.jit.script_method + def forward(self, thing): + return self.weight + thing + + class M(torch.jit.ScriptModule): + def __init__(self): + super(M, self).__init__() + self.mod = Sub() + + @torch.jit.script_method + def forward(self, v): + return self.mod(v) + m = M() + m.cuda() + m2 = self.getExportImportCopy(m) + m2.cuda() + input = torch.rand(3, 4).cuda() + self.assertEqual(m(input), m2(input)) + def test_export_batchnorm(self): for mode in ['eval', 'train']: for clazz in [ diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index f868f1ff555ad5..d2f828a2517ef8 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -180,7 +180,7 @@ void DecoderBase::buildBlock(const onnx::GraphProto& graph_proto, Block* block, class ModuleDecoder : DecoderBase { public: - ModuleDecoder(std::shared_ptr root_module, + ModuleDecoder(ModuleLookup module_lookup, const std::string& filename); private: @@ -202,7 +202,7 @@ class ModuleDecoder : DecoderBase { const std::vector& strides); std::pair, std::string> parseFullName( - std::shared_ptr root_module, + ModuleLookup module_lookup, const std::string fullname); PyTorchFileReader file_reader_; @@ -330,7 +330,7 @@ at::Tensor ModuleDecoder::buildTensorCommon( // Given a full name of a parameter or method, // return the parent submodule and local name std::pair, std::string> ModuleDecoder::parseFullName( - std::shared_ptr root_module, + ModuleLookup module_lookup, const std::string fullname) { std::vector vec; std::stringstream ss(fullname); @@ -339,18 +339,13 @@ std::pair, std::string> ModuleDecoder::parseFull vec.push_back(name); } - std::shared_ptr curr = root_module; - for (size_t i = 0; i < vec.size() - 1; i++) { - if (curr->find_module(vec[i]) == nullptr) { - curr->register_module(vec[i], std::make_shared()); - } - curr = curr->get_module(vec[i]); - } - return std::make_pair(curr, vec.back()); + std::string last = vec.back(); + vec.pop_back(); + return std::make_pair(module_lookup(vec), std::move(last)); } ModuleDecoder::ModuleDecoder( - const std::shared_ptr root_module, + ModuleLookup module_lookup, const std::string &filename) : file_reader_(filename) { auto model_proto = onnx::ModelProto(); @@ -363,7 +358,7 @@ ModuleDecoder::ModuleDecoder( for (auto &tensor_proto : graph_proto.initializer()) { std::shared_ptr parent_module; std::string name; - 
std::tie(parent_module, name) = parseFullName(root_module, tensor_proto.name()); + std::tie(parent_module, name) = parseFullName(module_lookup, tensor_proto.name()); auto param = buildParameter(tensor_proto); parent_module->register_parameter(name, param, /* is_buffer = */ tensor_proto.int64_data(0)); @@ -373,7 +368,7 @@ ModuleDecoder::ModuleDecoder( for (auto &node_proto : graph_proto.node()) { std::shared_ptr parent_module; std::string name; - std::tie(parent_module, name) = parseFullName(root_module, node_proto.name()); + std::tie(parent_module, name) = parseFullName(module_lookup, node_proto.name()); std::vector member_inputs; for (auto ¶m_name : node_proto.input()) { @@ -393,15 +388,26 @@ ModuleDecoder::ModuleDecoder( } // namespace -void ImportIRModule( - const std::shared_ptr module, +void import_ir_module( + ModuleLookup module_lookup, const std::string& filename) { - ModuleDecoder(module, filename); + ModuleDecoder(module_lookup, filename); } std::shared_ptr load(const std::string& filename) { auto module = std::make_shared(); - ModuleDecoder(module, filename); + + auto module_lookup = [&](const std::vector& qualified_name) { + std::shared_ptr curr = module; + for (const auto& name : qualified_name) { + if (curr->find_module(name) == nullptr) { + curr->register_module(name, std::make_shared()); + } + curr = curr->get_module(name); + } + return curr; + }; + ModuleDecoder(module_lookup, filename); return module; } diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index ba466f736d3811..21f9d605ec59b3 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -5,8 +5,11 @@ namespace torch { namespace jit { -TORCH_API void ImportIRModule( - const std::shared_ptr module, +using ModuleLookup = std::function( + const std::vector&)>; + +TORCH_API void import_ir_module( + ModuleLookup module_lookup, const std::string& filename); TORCH_API std::shared_ptr load(const std::string& filename); diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 9f8185621a1c0e..c09caf4c3702f9 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -24,6 +24,8 @@ #include #include #include +#include + namespace torch { namespace jit { @@ -370,10 +372,6 @@ void initJitScriptBindings(PyObject* module) { py::class_>(m, "ScriptModule") .def(py::init<>()) .def("save", &Module::save) - .def("_load", [](const std::shared_ptr module, - const std::string& filename) { - ImportIRModule(module, filename); - }) .def("_set_optimized", &Module::set_optimized) .def( "_define", @@ -529,7 +527,7 @@ void initJitScriptBindings(PyObject* module) { }); m.def("merge_type_from_type_comment", &mergeTypesFromTypeComment); - + m.def("import_ir_module", import_ir_module); } } // namespace script diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f40cade7fec486..b32edf715a6c00 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -73,7 +73,16 @@ def load(filename): A ``ScriptModule`` object. 
""" m = ScriptModule() - m._load(filename) + + def module_lookup(names): + curr = m + for name in names: + if not hasattr(curr, name): + setattr(curr, name, ScriptModule()) + curr = getattr(curr, name) + return curr + + torch._C.import_ir_module(module_lookup, filename) return m From 739e6af86940f558f384a4a8922df8dc660d5660 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 12 Sep 2018 12:28:31 -0700 Subject: [PATCH 084/237] Add reminder % to the jit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11557 Reviewed By: apaszke Differential Revision: D9784642 Pulled By: wanchaol fbshipit-source-id: b7c60c3e9534555c9d7db83769965b3f2f277cdf --- test/test_jit.py | 39 +++++++++++++++++++++------- torch/csrc/jit/register_prim_ops.cpp | 15 +++++++---- torch/csrc/jit/script/compiler.cpp | 3 +++ torch/csrc/jit/script/lexer.cpp | 1 + torch/csrc/jit/script/lexer.h | 2 +- torch/csrc/jit/script/parser.h | 2 +- torch/csrc/jit/script/tree_views.h | 3 +++ torch/jit/frontend.py | 1 + 8 files changed, 50 insertions(+), 16 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index e4797c57dda3b5..4aa43b1c8596a5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3791,19 +3791,32 @@ def test_number_math(self): template = dedent(''' # int, int -> int def func1(): - return 8 {op} 2 + return 7 {op} 2 def func2(): - return 2 {op} 2 + return 3 {op} 2 - # float, float -> float def func3(): - return 3.14 {op} 0.125 + return -7 {op} 3 def func4(): + return 7 {op} -3 + + # float, float -> float + def func5(): + return 3.14 {op} 0.125 + + def func6(): return 3.14 {op} 3.14 + + def func7(): + return -0.5 {op} 2.0 + + def func8(): + return 3.5 {op} -2.0 + ''') - ops = ['+', '-', '*', '<', '<=', '>', '>=', '==', '!='] + ops = ['+', '-', '*', '%', '<', '<=', '>', '>=', '==', '!='] for op in ops: code = template.format(op=op) @@ -3815,6 +3828,10 @@ def func4(): self.assertEqual(cu.func2(), scope['func2']()) self.assertEqual(cu.func3(), scope['func3']()) self.assertEqual(cu.func4(), scope['func4']()) + self.assertEqual(cu.func5(), scope['func5']()) + self.assertEqual(cu.func6(), scope['func6']()) + self.assertEqual(cu.func7(), scope['func7']()) + self.assertEqual(cu.func8(), scope['func8']()) def test_number_div(self): self.checkScript(div_int_future, (), optimize=True) @@ -3866,10 +3883,10 @@ def test(op, const, swap_args): cu = torch.jit.CompilationUnit(code) self.assertEqual(cu.func(tensor), scope['func'](tensor)) - var_int = 2 - var_float = 1.4321 + var_int = [2, -2] + var_float = [1.4321, -1.2] - ops = ['+', '-', '*', '<', '<=', '>', '>=', '==', '!='] + ops = ['+', '-', '*', '%', '<', '<=', '>', '>=', '==', '!='] # TODO: turn this on for py3 (and add PY3 division semantics) ops_py2_only = ['/'] if PY2: @@ -3881,7 +3898,7 @@ def test(op, const, swap_args): long_tensor[long_tensor == 0] = 2 tensors = [float_tensor, double_tensor, long_tensor] - consts = [var_int, var_float] + consts = var_int + var_float for op, tensor, const, swap_args in product(ops, tensors, consts, [True, False]): # FIXME: things like 2 / long_tensor are not implemented correctly @@ -3889,6 +3906,10 @@ def test(op, const, swap_args): if op == '/' and tensor.data_ptr() == long_tensor.data_ptr(): continue + # % operator does not take: const % tensor + if op == '%' and swap_args is True: + continue + test(op, const, swap_args) def test_tensor_number_math(self): diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 67274f8a9192d6..13f5f6dbbad275 100644 --- 
a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -390,14 +390,14 @@ RegisterOperators reg({ }); // define implementations for primitive number ops -#define DEFINE_GENERIC_OP(aten_op, op, float_result) \ +#define DEFINE_GENERIC_OP(aten_op, int_op, float_op, float_result) \ Operator( \ #aten_op "(int a, int b) -> int", \ [](Node* node) { \ return [=](Stack& stack) { \ int64_t a, b; \ pop(stack, a, b); \ - push(stack, op); \ + push(stack, int_op); \ return 0; \ }; \ }), \ @@ -406,7 +406,7 @@ RegisterOperators reg({ return [=](Stack& stack) { \ double a, b; \ pop(stack, a, b); \ - push(stack, op); \ + push(stack, float_op); \ return 0; \ }; \ }), @@ -421,8 +421,8 @@ RegisterOperators reg({ }; \ }), -#define DEFINE_BINARY_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, float) -#define DEFINE_COMPARISON_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, int) +#define DEFINE_BINARY_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, float) +#define DEFINE_COMPARISON_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, int) // define helpers for where aten is missing scalar overloads // note: it would be better to define these in a standard library as @@ -618,6 +618,11 @@ RegisterOperators reg2({ DEFINE_BINARY_OP(aten::mul, a * b) DEFINE_BINARY_OP(aten::pow, static_cast(pow(a, b))) + // Pass in two ops for handling int and float separately as % in C++ only works for int + // The modulus calculation is different between C++ and Python (on negative), we preserve + // the python behavior as it's more common and match python syntax, hence the conversion. + DEFINE_GENERIC_OP(aten::remainder, (b + (a % b)) % b, fmod((b + fmod(a, b)), b), float) + // TODO: Support python floordiv (//) // Right now aten::floordiv is only used by loop unrolling DEFINE_INT_OP(aten::floordiv, a / b) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index a04e4ae0fc3f03..5efa0668d91296 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1338,6 +1338,8 @@ struct to_ir { return prim::Starred; case '/': return aten::div; + case '%': + return aten::remainder; case TK_NE: return aten::ne; case TK_EQ: @@ -1479,6 +1481,7 @@ struct to_ir { case '/': case '+': case '-': + case '%': case TK_UNARY_MINUS: { const auto& inputs = tree->trees(); auto kind = getNodeKind(tree->kind(), inputs.size()); diff --git a/torch/csrc/jit/script/lexer.cpp b/torch/csrc/jit/script/lexer.cpp index d5410c39b84455..969a79ebb635bc 100644 --- a/torch/csrc/jit/script/lexer.cpp +++ b/torch/csrc/jit/script/lexer.cpp @@ -25,6 +25,7 @@ static const std::unordered_map binary_prec = { {'-', 5}, {'*', 6}, {'/', 6}, + {'%', 6}, {'@', 6}, {TK_POW, 7}, }; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 617addf486c736..7d29ea5944ad10 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -85,7 +85,7 @@ namespace script { _(TK_SLICE_EXPR, "slice expr", "") \ _(TK_TYPE_COMMENT, "type comment", "# type:") -static const char* valid_single_char_tokens = "+-*/@()[]:,={}><.?"; +static const char* valid_single_char_tokens = "+-*/%@()[]:,={}><.?"; enum TokenKind { // we use characters to represent themselves so skip all valid characters diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index e808e70ccd4006..53f4dc368714f4 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -177,7 +177,7 @@ struct Parser { int binary_prec; while (shared.isBinary(L.cur().kind, 
&binary_prec)) { if (binary_prec <= precedence) // not allowed to parse something which is - // not greater than 'precedenc' + // not greater than 'precedence' break; int kind = L.cur().kind; diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h index b5d38f80fde390..162c33e6838666 100644 --- a/torch/csrc/jit/script/tree_views.h +++ b/torch/csrc/jit/script/tree_views.h @@ -233,6 +233,7 @@ struct Expr : public TreeView { case '*': case TK_STARRED: case '/': + case '%': case TK_NOT: case TK_CONST: case TK_STRINGLITERAL: @@ -421,6 +422,7 @@ struct AssignKind : public TreeView { case '-': case '*': case '/': + case '%': return; default: throw ErrorReport(tree) << "is not a valid AssignKind"; @@ -496,6 +498,7 @@ struct BinOp : public Expr { case '-': case '@': case TK_POW: + case '%': if (tree->trees().size() != 2) throw ErrorReport(tree) << "BinOp expected 2 subtrees, found " << tree->trees().size(); return; diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 659c3bc4cdd70f..99d767797e1b7e 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -303,6 +303,7 @@ class ExprBuilder(Builder): ast.Mult: '*', ast.Div: '/', ast.Pow: '**', + ast.Mod: '%', } if not PY2: From 9a7c196040f55d921a4deee42cbb1dd294b8f23b Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 12 Sep 2018 12:57:32 -0700 Subject: [PATCH 085/237] Move Type, Tensor, TensorMethods to core. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11519 Reviewed By: yf225 Differential Revision: D9771684 Pulled By: gchanan fbshipit-source-id: a57ee2072af99ce856f895c688b09d750a8606e0 --- aten/src/ATen/ATen.h | 2 +- aten/src/ATen/Tensor.h | 2 + aten/src/ATen/Type.h | 2 + aten/src/ATen/core/ATenCoreTest.cpp | 2 + aten/src/ATen/core/Tensor.h | 686 ++++++++++++ aten/src/ATen/core/TensorMethods.h | 1242 +++++++++++++++++++++ aten/src/ATen/core/Type.h | 647 +++++++++++ aten/src/ATen/gen.py | 47 +- aten/src/ATen/templates/NativeFunctions.h | 2 +- aten/src/ATen/templates/Tensor.h | 2 - aten/src/ATen/templates/TensorMethods.h | 6 +- aten/src/ATen/templates/Type.h | 4 +- cmake/Codegen.cmake | 10 +- 13 files changed, 2633 insertions(+), 21 deletions(-) create mode 100644 aten/src/ATen/Tensor.h create mode 100644 aten/src/ATen/Type.h create mode 100644 aten/src/ATen/core/Tensor.h create mode 100644 aten/src/ATen/core/TensorMethods.h create mode 100644 aten/src/ATen/core/Type.h diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index cf074730bf072b..29812852e24fdb 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -19,7 +19,7 @@ #include "ATen/core/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorGeometry.h" -#include "ATen/TensorMethods.h" +#include "ATen/core/TensorMethods.h" #include "ATen/TensorOperators.h" #include "ATen/core/TensorOptions.h" #include "ATen/Type.h" diff --git a/aten/src/ATen/Tensor.h b/aten/src/ATen/Tensor.h new file mode 100644 index 00000000000000..cef05f5341cb31 --- /dev/null +++ b/aten/src/ATen/Tensor.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/Type.h b/aten/src/ATen/Type.h new file mode 100644 index 00000000000000..0c95f43e0482e9 --- /dev/null +++ b/aten/src/ATen/Type.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp index 5bb595a0bce5de..bb670b315f16c9 100644 --- a/aten/src/ATen/core/ATenCoreTest.cpp +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -1,9 +1,11 @@ #include +#include namespace at { static int CoreTestGlobal = 0; 
int CoreTest() { + Tensor x; return CoreTestGlobal++; } diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h new file mode 100644 index 00000000000000..4b1f18655547d3 --- /dev/null +++ b/aten/src/ATen/core/Tensor.h @@ -0,0 +1,686 @@ +#pragma once + +#include "ATen/core/Device.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Storage.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/core/TensorImpl.h" +#include "ATen/core/optional.h" +#include "ATen/core/UndefinedTensorImpl.h" +#include "ATen/core/Error.h" + +namespace at { +struct Generator; +struct Type; +struct Tensor; +struct TensorOptions; +} // namespace at + +namespace at { +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and +// special care must be taken to handle this. +struct AT_API Tensor { + Tensor(){}; + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + } + + Tensor(const Tensor&) = default; + Tensor(Tensor&&) = default; + + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + + // The following overloads are very intruiging. Consider the following + // program: + // + // x[1] = 3; + // + // We would expect that the first entry of x is written to 3. But how can we + // actually achieve this? x[1] evaluates to a tensor... + // + // The answer is, using a ref-qualifier. x[1] is an rvalue, which cannot be + // (profitably) assigned to in the traditional sense, so we overload + // assignment to mean, "Actually, copy 3 into the tensor data." This is done + // with an rvalue-reference ref-qualified overload (the methods with && at the + // end of their type.) + // + // There's one more fly in the ointment: We also want + // + // Tensor x = y; + // + // to work, and we want it NOT to copy. So we need a traditional operator= + // overload. But we MUST specify a mutable lvalue ref-qualifier, to + // disambiguate the traditional overload from the rvalue-reference + // ref-qualified overload. Otherwise, it will be ambiguous, because + // a non ref-qualified method is eligible for all situations. 
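  // [Editor's note] The block below is an illustrative sketch of the
  // ref-qualifier technique described in the comment above; it is not part
  // of the original diff, and the names `Cell` and `slot` are hypothetical:
  //
  //   struct Cell {
  //     int* slot = nullptr;
  //     // lvalue assignment (`a = b;`): rebind the handle, no data copy
  //     Cell& operator=(const Cell& other) & { slot = other.slot; return *this; }
  //     // rvalue assignment (`container[1] = 3;`): write through to the data
  //     Cell& operator=(int v) && { *slot = v; return *this; }
  //   };
  //
  // Because an indexing operator would return a temporary `Cell` (an rvalue),
  // an expression like `container[1] = 3` selects the `&&`-qualified overload
  // and stores 3 into the referenced slot, while plain `a = b;` on named
  // objects still picks the `&`-qualified overload and only rebinds the
  // handle, exactly the disambiguation the comment above motivates.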
+ + // Unfortunately, we have to write these constructors out manually + // to work around an MSVC bug: + // error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &': + // multiple versions of a defaulted special member functions are not allowed + // Tensor& operator=(const Tensor&) & = default; + // Tensor& operator=(Tensor&&) & = default; + Tensor& operator=(const Tensor& x) & { + tensor_impl_ = x.tensor_impl_; + return *this; + } + Tensor& operator=(Tensor&& x) & { + tensor_impl_ = std::move(x.tensor_impl_); + return *this; + } + + Tensor& operator=(Scalar v) &&; + Tensor& operator=(const Tensor&) &&; + Tensor& operator=(Tensor&&) &&; + + bool is_same(const Tensor& other) const noexcept { + return tensor_impl_ == other.tensor_impl_; + } + size_t use_count() const noexcept { + return tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return tensor_impl_.weak_use_count(); + } + + const char * toString() const; + + IntList sizes() const { + return tensor_impl_->sizes(); + } + IntList strides() const { + return tensor_impl_->strides(); + } + int64_t ndimension() const { + return dim(); + } + Type & type() const { + return tensor_impl_->type(); + } + TensorTypeId type_id() const { + return tensor_impl_->type_id(); + } + ScalarType scalar_type() const { + return tensor_impl_->scalar_type(); + } + const Storage& storage() const { + return tensor_impl_->storage(); + } + Tensor toType(const Type & t, bool non_blocking=false) const; + Tensor & copy_(const Tensor & src, bool non_blocking=false); + Tensor toType(ScalarType t) const; + Tensor toBackend(Backend b) const; + + /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. + /// Defined in Type.h because of include order issues. + bool is_variable() const noexcept; + + /// Returns a `Tensor`'s layout. Defined in Type.h + Layout layout() const noexcept; + + /// Returns a `Tensor`'s dtype (`ScalarType`). Defined in Type.h + ScalarType dtype() const noexcept; + + /// Returns a `Tensor`'s device. + Device device() const; + + /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in + /// TensorOptions.h. + TensorOptions options() const; + + template + T * data() const; + + // Purposely not defined here to avoid inlining + void print() const; + + //toLongData(), toFloatData() etc. + #define TO_TYPE_DATA(T,name,_) \ + T * to##name##Data() const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) + #undef TO_TYPE_DATA + + #define TO_C_TYPE(T,name,_) \ + T toC##name () const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) + #undef TO_C_TYPE + + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. + template + TensorAccessor accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return TensorAccessor(data(),sizes().data(),strides().data()); + } + template + TensorAccessor accessor() && = delete; + + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. 
+ template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + + Tensor operator-() const; + Tensor& operator+=(const Tensor & other); + Tensor& operator+=(Scalar other); + Tensor& operator-=(const Tensor & other); + Tensor& operator-=(Scalar other); + Tensor& operator*=(const Tensor & other); + Tensor& operator*=(Scalar other); + Tensor& operator/=(const Tensor & other); + Tensor& operator/=(Scalar other); + Tensor operator[](Scalar index) const; + Tensor operator[](Tensor index) const; + Tensor operator[](int64_t index) const; + + Tensor cpu() const; + Tensor cuda() const; + + // ~~~~~ Autograd API ~~~~~ + + Tensor& set_requires_grad(bool requires_grad) { + tensor_impl_->set_requires_grad(requires_grad); + return *this; + } + bool requires_grad() const { + return tensor_impl_->requires_grad(); + } + + Tensor& grad() { + return tensor_impl_->grad(); + } + const Tensor& grad() const { + return tensor_impl_->grad(); + } + + void set_data(Tensor new_data); + + /// Computes the gradient of current tensor w.r.t. graph leaves. + void backward( + at::optional gradient = at::nullopt, + bool keep_graph = false, + bool create_graph = false); + + // STOP. Thinking of adding a method here, which only makes use + // of other ATen methods? Define it in native_functions.yaml. + + //example + //Tensor * add(Tensor & b); + int64_t storage_offset() const; + Tensor & resize_(IntList size); + Tensor & set_(Storage source); + Tensor & set_(Storage source, int64_t storage_offset, IntList size, IntList stride={}); + Tensor & set_(const Tensor & source); + Tensor & set_(); + bool is_contiguous() const; + bool is_set_to(const Tensor & tensor) const; + Tensor & masked_fill_(const Tensor & mask, Scalar value); + Tensor & masked_fill_(const Tensor & mask, const Tensor & value); + Tensor & masked_scatter_(const Tensor & mask, const Tensor & source); + Tensor masked_select(const Tensor & mask) const; + Tensor nonzero() const; + Tensor contiguous() const; + Tensor view(IntList size) const; + Tensor index_select(int64_t dim, const Tensor & index) const; + Tensor take(const Tensor & index) const; + Tensor & put_(const Tensor & index, const Tensor & source, bool accumulate=false); + Tensor & index_add_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor & index_fill_(int64_t dim, const Tensor & index, Scalar value); + Tensor & index_fill_(int64_t dim, const Tensor & index, const Tensor & value); + Tensor unfold(int64_t dimension, int64_t size, int64_t step) const; + Tensor & scatter_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor & scatter_(int64_t dim, const Tensor & index, Scalar value); + Tensor & scatter_add_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor gather(int64_t dim, const Tensor & index) const; + void* data_ptr() const; + bool equal(const Tensor & other) const; + Tensor __and__(Scalar other) const; + Tensor __and__(const Tensor & other) const; + Tensor & __iand__(Scalar other); + Tensor & __iand__(const Tensor & other); + Tensor __or__(Scalar other) const; + Tensor __or__(const Tensor & other) const; + Tensor & __ior__(Scalar other); + Tensor & __ior__(const Tensor & 
other); + Tensor __xor__(Scalar other) const; + Tensor __xor__(const Tensor & other) const; + Tensor & __ixor__(Scalar other); + Tensor & __ixor__(const Tensor & other); + Tensor __lshift__(Scalar other) const; + Tensor __lshift__(const Tensor & other) const; + Tensor & __ilshift__(Scalar other); + Tensor & __ilshift__(const Tensor & other); + Tensor __rshift__(Scalar other) const; + Tensor __rshift__(const Tensor & other) const; + Tensor & __irshift__(Scalar other); + Tensor & __irshift__(const Tensor & other); + Tensor lt(Scalar other) const; + Tensor lt(const Tensor & other) const; + Tensor & lt_(Scalar other); + Tensor & lt_(const Tensor & other); + Tensor gt(Scalar other) const; + Tensor gt(const Tensor & other) const; + Tensor & gt_(Scalar other); + Tensor & gt_(const Tensor & other); + Tensor le(Scalar other) const; + Tensor le(const Tensor & other) const; + Tensor & le_(Scalar other); + Tensor & le_(const Tensor & other); + Tensor ge(Scalar other) const; + Tensor ge(const Tensor & other) const; + Tensor & ge_(Scalar other); + Tensor & ge_(const Tensor & other); + Tensor eq(Scalar other) const; + Tensor eq(const Tensor & other) const; + Tensor & eq_(Scalar other); + Tensor & eq_(const Tensor & other); + Tensor ne(Scalar other) const; + Tensor ne(const Tensor & other) const; + Tensor & ne_(Scalar other); + Tensor & ne_(const Tensor & other); + Tensor min(const Tensor & other) const; + Tensor min() const; + Tensor max(const Tensor & other) const; + Tensor max() const; + Tensor median() const; + std::tuple sort(int64_t dim=-1, bool descending=false) const; + std::tuple topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + Tensor all() const; + Tensor any() const; + Tensor lgamma() const; + Tensor & lgamma_(); + Tensor digamma() const; + Tensor & digamma_(); + Tensor polygamma(int64_t n) const; + Tensor & polygamma_(int64_t n); + Tensor & erfinv_(); + Tensor erfinv() const; + Tensor & frac_(); + Tensor frac() const; + Tensor renorm(Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm); + Tensor dist(const Tensor & other, Scalar p=2) const; + Tensor reciprocal() const; + Tensor & reciprocal_(); + Tensor neg() const; + Tensor & neg_(); + Tensor atan2(const Tensor & other) const; + Tensor & atan2_(const Tensor & other); + Tensor pow(const Tensor & exponent) const; + Tensor & pow_(Scalar exponent); + Tensor & pow_(const Tensor & exponent); + Tensor lerp(const Tensor & end, Scalar weight) const; + Tensor & lerp_(const Tensor & end, Scalar weight); + Tensor histc(int64_t bins=100, Scalar min=0, Scalar max=0) const; + Tensor sign() const; + Tensor & sign_(); + Tensor trace() const; + Tensor fmod(Scalar other) const; + Tensor fmod(const Tensor & other) const; + Tensor & fmod_(Scalar other); + Tensor & fmod_(const Tensor & other); + Tensor remainder(Scalar other) const; + Tensor remainder(const Tensor & other) const; + Tensor & remainder_(Scalar other); + Tensor & remainder_(const Tensor & other); + Tensor tril(int64_t diagonal=0) const; + Tensor & tril_(int64_t diagonal=0); + Tensor triu(int64_t diagonal=0) const; + Tensor & triu_(int64_t diagonal=0); + Tensor cross(const Tensor & other, int64_t dim=-1) const; + Tensor diag(int64_t diagonal=0) const; + Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar 
value=1) const; + Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + std::tuple gels(const Tensor & A) const; + std::tuple trtrs(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + std::tuple symeig(bool eigenvectors=false, bool upper=true) const; + std::tuple eig(bool eigenvectors=false) const; + std::tuple svd(bool some=true) const; + Tensor potrf(bool upper=true) const; + Tensor potrs(const Tensor & input2, bool upper=true) const; + Tensor potri(bool upper=true) const; + std::tuple pstrf(bool upper=true, Scalar tol=-1) const; + std::tuple qr() const; + std::tuple geqrf() const; + Tensor orgqr(const Tensor & input2) const; + Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + std::tuple btrifact(bool pivot=true) const; + std::tuple btrifact_with_info(bool pivot=true) const; + Tensor btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const; + Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr); + Tensor & random_(int64_t to, Generator * generator=nullptr); + Tensor & random_(Generator * generator=nullptr); + Tensor multinomial(int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + Tensor & uniform_(double from=0, double to=1, Generator * generator=nullptr); + Tensor & normal_(double mean=0, double std=1, Generator * generator=nullptr); + Tensor & cauchy_(double median=0, double sigma=1, Generator * generator=nullptr); + Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr); + Tensor & exponential_(double lambd=1, Generator * generator=nullptr); + Tensor & geometric_(double p, Generator * generator=nullptr); + Tensor abs() const; + Tensor & abs_(); + Tensor acos() const; + Tensor & acos_(); + Tensor add(const Tensor & other, Scalar alpha=1) const; + Tensor & add_(const Tensor & other, Scalar alpha=1); + Tensor add(Scalar other, Scalar alpha=1) const; + Tensor & add_(Scalar other, Scalar alpha=1); + Tensor addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor all(int64_t dim, bool keepdim=false) const; + bool allclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + Tensor any(int64_t dim, bool keepdim=false) const; + Tensor argmax(int64_t dim, bool keepdim=false) const; + Tensor argmax() const; + Tensor argmin(int64_t dim, bool keepdim=false) const; + Tensor argmin() const; + Tensor as_strided(IntList size, IntList stride) const; + Tensor & as_strided_(IntList size, IntList stride); + Tensor as_strided(IntList size, IntList stride, int64_t storage_offset) const; + Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset); + Tensor asin() const; + Tensor & asin_(); + Tensor atan() const; + Tensor & atan_(); + Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor bernoulli(const Tensor & p, 
Generator * generator=nullptr) const; + Tensor bernoulli(double p, Generator * generator=nullptr) const; + Tensor bernoulli() const; + Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr); + Tensor & bernoulli_(double p, Generator * generator=nullptr); + Tensor & bernoulli_(); + Tensor bincount(const Tensor & weights={}, int64_t minlength=0) const; + Tensor bmm(const Tensor & mat2) const; + Tensor ceil() const; + Tensor & ceil_(); + std::vector chunk(int64_t chunks, int64_t dim=0) const; + Tensor clamp(Scalar min, Scalar max) const; + Tensor & clamp_(Scalar min, Scalar max); + Tensor clamp_max(Scalar max) const; + Tensor & clamp_max_(Scalar max); + Tensor clamp_min(Scalar min) const; + Tensor & clamp_min_(Scalar min); + Tensor cos() const; + Tensor & cos_(); + Tensor cosh() const; + Tensor & cosh_(); + Tensor cumsum(int64_t dim, ScalarType dtype) const; + Tensor cumsum(int64_t dim) const; + Tensor cumprod(int64_t dim, ScalarType dtype) const; + Tensor cumprod(int64_t dim) const; + Tensor det() const; + Tensor diagflat(int64_t offset=0) const; + Tensor diagonal(int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const; + Tensor div(const Tensor & other) const; + Tensor & div_(const Tensor & other); + Tensor div(Scalar other) const; + Tensor & div_(Scalar other); + Tensor dot(const Tensor & tensor) const; + Tensor erf() const; + Tensor & erf_(); + Tensor erfc() const; + Tensor & erfc_(); + Tensor exp() const; + Tensor & exp_(); + Tensor expm1() const; + Tensor & expm1_(); + Tensor expand(IntList size, bool implicit=false) const; + Tensor expand_as(const Tensor & other) const; + Tensor flatten(int64_t start_dim=0, int64_t end_dim=-1) const; + Tensor & fill_(Scalar value); + Tensor & fill_(const Tensor & value); + Tensor floor() const; + Tensor & floor_(); + Tensor ger(const Tensor & vec2) const; + std::tuple gesv(const Tensor & A) const; + Tensor fft(int64_t signal_ndim, bool normalized=false) const; + Tensor ifft(int64_t signal_ndim, bool normalized=false) const; + Tensor rfft(int64_t signal_ndim, bool normalized=false, bool onesided=true) const; + Tensor irfft(int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const; + Tensor index(TensorList indices) const; + Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor index_put(TensorList indices, const Tensor & values) const; + Tensor & index_put_(TensorList indices, const Tensor & values); + Tensor inverse() const; + Tensor isclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + bool is_cuda() const; + bool is_distributed() const; + bool is_floating_point() const; + bool is_complex() const; + bool is_nonzero() const; + bool is_same_size(const Tensor & other) const; + bool is_signed() const; + bool is_sparse() const; + std::tuple kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const; + Tensor log() const; + Tensor & log_(); + Tensor log10() const; + Tensor & log10_(); + Tensor log1p() const; + Tensor & log1p_(); + Tensor log2() const; + Tensor & log2_(); + Tensor logdet() const; + Tensor log_softmax(int64_t dim) const; + Tensor logsumexp(int64_t dim, bool keepdim=false) const; + Tensor matmul(const Tensor & other) const; + Tensor matrix_power(int64_t n) const; + std::tuple max(int64_t dim, bool keepdim=false) const; + Tensor max_values(int64_t dim, bool keepdim=false) const; + Tensor mean(ScalarType dtype) const; + Tensor mean() const; + Tensor mean(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor 
mean(int64_t dim, bool keepdim=false) const; + Tensor mean(int64_t dim, ScalarType dtype) const; + std::tuple median(int64_t dim, bool keepdim=false) const; + std::tuple min(int64_t dim, bool keepdim=false) const; + Tensor min_values(int64_t dim, bool keepdim=false) const; + Tensor mm(const Tensor & mat2) const; + std::tuple mode(int64_t dim=-1, bool keepdim=false) const; + Tensor mul(const Tensor & other) const; + Tensor & mul_(const Tensor & other); + Tensor mul(Scalar other) const; + Tensor & mul_(Scalar other); + Tensor mv(const Tensor & vec) const; + Tensor mvlgamma(int64_t p) const; + Tensor & mvlgamma_(int64_t p); + Tensor narrow(int64_t dim, int64_t start, int64_t length) const; + Tensor permute(IntList dims) const; + Tensor pin_memory() const; + Tensor pinverse(double rcond=1e-15) const; + Tensor repeat(IntList repeats) const; + Tensor reshape(IntList shape) const; + Tensor reshape_as(const Tensor & other) const; + Tensor round() const; + Tensor & round_(); + Tensor relu() const; + Tensor & relu_(); + Tensor hardshrink(Scalar lambd=0.5) const; + Tensor hardshrink_backward(const Tensor & grad_out, Scalar lambd) const; + Tensor rsqrt() const; + Tensor & rsqrt_(); + Tensor select(int64_t dim, int64_t index) const; + Tensor sigmoid() const; + Tensor & sigmoid_(); + Tensor sin() const; + Tensor & sin_(); + Tensor sinh() const; + Tensor & sinh_(); + Tensor detach() const; + Tensor & detach_(); + int64_t size(int64_t dim) const; + Tensor slice(int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + std::tuple slogdet() const; + Tensor smm(const Tensor & mat2) const; + Tensor softmax(int64_t dim) const; + std::vector split(int64_t split_size, int64_t dim=0) const; + std::vector split_with_sizes(IntList split_sizes, int64_t dim=0) const; + Tensor squeeze() const; + Tensor squeeze(int64_t dim) const; + Tensor & squeeze_(); + Tensor & squeeze_(int64_t dim); + Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const; + int64_t stride(int64_t dim) const; + Tensor sum(ScalarType dtype) const; + Tensor sum() const; + Tensor sum(IntList dim, bool keepdim, ScalarType dtype) const; + Tensor sum(IntList dim, bool keepdim=false) const; + Tensor sum(IntList dim, ScalarType dtype) const; + Tensor sqrt() const; + Tensor & sqrt_(); + Tensor std(bool unbiased=true) const; + Tensor std(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor prod(ScalarType dtype) const; + Tensor prod() const; + Tensor prod(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor prod(int64_t dim, bool keepdim=false) const; + Tensor prod(int64_t dim, ScalarType dtype) const; + Tensor t() const; + Tensor & t_(); + Tensor tan() const; + Tensor & tan_(); + Tensor tanh() const; + Tensor & tanh_(); + Tensor transpose(int64_t dim0, int64_t dim1) const; + Tensor & transpose_(int64_t dim0, int64_t dim1); + Tensor flip(IntList dims) const; + Tensor rot90(int64_t k=1, IntList dims={0,1}) const; + Tensor trunc() const; + Tensor & trunc_(); + Tensor type_as(const Tensor & other) const; + Tensor unsqueeze(int64_t dim) const; + Tensor & unsqueeze_(int64_t dim); + Tensor var(bool unbiased=true) const; + Tensor var(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor view_as(const Tensor & other) const; + Tensor where(const Tensor & condition, const Tensor & other) const; + Tensor norm(Scalar 
p=2) const; + Tensor norm(Scalar p, int64_t dim, bool keepdim=false) const; + Tensor clone() const; + Tensor & resize_as_(const Tensor & the_template); + Tensor pow(Scalar exponent) const; + Tensor & zero_(); + Tensor sub(const Tensor & other, Scalar alpha=1) const; + Tensor & sub_(const Tensor & other, Scalar alpha=1); + Tensor sub(Scalar other, Scalar alpha=1) const; + Tensor & sub_(Scalar other, Scalar alpha=1); + Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor & sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor & sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor sparse_mask(SparseTensorRef mask) const; + Tensor to_dense() const; + int64_t _sparseDims() const; + int64_t _denseDims() const; + int64_t _nnz() const; + Tensor coalesce() const; + bool is_coalesced() const; + Tensor _indices() const; + Tensor _values() const; + int64_t numel() const; + std::vector unbind(int64_t dim=0) const; + int64_t get_device() const; + Tensor to(Device device, ScalarType dtype, bool non_blocking=false) const; + Tensor to(ScalarType dtype, bool non_blocking=false) const; + Tensor to(Device device, bool non_blocking=false) const; + Tensor to(const Tensor & other, bool non_blocking=false) const; + Scalar _local_scalar() const; + + template + auto m(F func, Args&&... params) const -> decltype(func(*this, std::forward(params)...)) { + return func(*this, std::forward(params)...); + } + + friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; +}; + +struct AT_API WeakTensor { + WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} + + // XXX: this can return undefined tensors + // Ideally it would be at::optional, but MSVC is too cool for that + Tensor lock() const { + return Tensor(weak_tensor_impl_.lock()); + } + + bool is_same(const WeakTensor& other) const noexcept { + return weak_tensor_impl_ == other.weak_tensor_impl_; + } + + size_t use_count() const noexcept { + return weak_tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return weak_tensor_impl_.weak_use_count(); + } + + TensorImpl* unsafeGetTensorImpl() const { + return weak_tensor_impl_._unsafe_get_target(); + } + +private: + c10::weak_intrusive_ptr weak_tensor_impl_; +}; +} // namespace at diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h new file mode 100644 index 00000000000000..aba169279e6d2b --- /dev/null +++ b/aten/src/ATen/core/TensorMethods.h @@ -0,0 +1,1242 @@ +#pragma once + +#include "ATen/core/Tensor.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Type.h" +#include "ATen/core/TensorOptions.h" + +namespace at { + +inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { + if(type() == t) + return *this; + return t.copy(*this, non_blocking); +} + +inline Tensor Tensor::cpu() const { + return toType(type().cpu()); +} + +inline Tensor Tensor::cuda() const { + return toType(type().cuda()); +} + +inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { + return type().copy_(*this, src, non_blocking); +} + +inline Tensor Tensor::toType(ScalarType t) const { + return toType(type().toScalarType(t)); +} + +inline Tensor Tensor::toBackend(Backend b) const { + return toType(type().toBackend(b)); +} + +inline TensorOptions Tensor::options() const { + return 
TensorOptions().dtype(dtype()) + .device(device()) + .layout(layout()) + .is_variable(is_variable()); +} + +inline void Tensor::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + type().backward(*this, std::move(gradient), keep_graph, create_graph); +} + +inline void Tensor::set_data(Tensor new_data) { + type().set_data(*this, new_data); +} + +// all static inline to allow for inlining of the non-dynamic part of dispatch +inline int64_t Tensor::storage_offset() const { + return type().storage_offset(*this); +} +inline Tensor & Tensor::resize_(IntList size) { + return type().resize_(*this, size); +} +inline Tensor & Tensor::set_(Storage source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_(Storage source, int64_t storage_offset, IntList size, IntList stride) { + return type().set_(*this, source, storage_offset, size, stride); +} +inline Tensor & Tensor::set_(const Tensor & source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_() { + return type().set_(*this); +} +inline bool Tensor::is_contiguous() const { + return type().is_contiguous(*this); +} +inline bool Tensor::is_set_to(const Tensor & tensor) const { + return type().is_set_to(*this, tensor); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, Scalar value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, const Tensor & value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_scatter_(const Tensor & mask, const Tensor & source) { + return type().masked_scatter_(*this, mask, source); +} +inline Tensor Tensor::masked_select(const Tensor & mask) const { + return type().masked_select(*this, mask); +} +inline Tensor Tensor::nonzero() const { + return type().nonzero(*this); +} +inline Tensor Tensor::contiguous() const { + return type().contiguous(*this); +} +inline Tensor Tensor::view(IntList size) const { + return type().view(*this, size); +} +inline Tensor Tensor::index_select(int64_t dim, const Tensor & index) const { + return type().index_select(*this, dim, index); +} +inline Tensor Tensor::take(const Tensor & index) const { + return type().take(*this, index); +} +inline Tensor & Tensor::put_(const Tensor & index, const Tensor & source, bool accumulate) { + return type().put_(*this, index, source, accumulate); +} +inline Tensor & Tensor::index_add_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_add_(*this, dim, index, source); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, Scalar value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, const Tensor & value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor Tensor::unfold(int64_t dimension, int64_t size, int64_t step) const { + return type().unfold(*this, dimension, size, step); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_(*this, dim, index, src); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, Scalar value) { + return type().scatter_(*this, dim, index, value); +} +inline Tensor & Tensor::scatter_add_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_add_(*this, dim, index, src); +} +inline Tensor Tensor::gather(int64_t dim, const Tensor & index) const { + return type().gather(*this, dim, index); +} +inline 
void* Tensor::data_ptr() const { + return type().data_ptr(*this); +} +inline bool Tensor::equal(const Tensor & other) const { + return type().equal(*this, other); +} +inline Tensor Tensor::__and__(Scalar other) const { + return type().__and__(*this, other); +} +inline Tensor Tensor::__and__(const Tensor & other) const { + return type().__and__(*this, other); +} +inline Tensor & Tensor::__iand__(Scalar other) { + return type().__iand__(*this, other); +} +inline Tensor & Tensor::__iand__(const Tensor & other) { + return type().__iand__(*this, other); +} +inline Tensor Tensor::__or__(Scalar other) const { + return type().__or__(*this, other); +} +inline Tensor Tensor::__or__(const Tensor & other) const { + return type().__or__(*this, other); +} +inline Tensor & Tensor::__ior__(Scalar other) { + return type().__ior__(*this, other); +} +inline Tensor & Tensor::__ior__(const Tensor & other) { + return type().__ior__(*this, other); +} +inline Tensor Tensor::__xor__(Scalar other) const { + return type().__xor__(*this, other); +} +inline Tensor Tensor::__xor__(const Tensor & other) const { + return type().__xor__(*this, other); +} +inline Tensor & Tensor::__ixor__(Scalar other) { + return type().__ixor__(*this, other); +} +inline Tensor & Tensor::__ixor__(const Tensor & other) { + return type().__ixor__(*this, other); +} +inline Tensor Tensor::__lshift__(Scalar other) const { + return type().__lshift__(*this, other); +} +inline Tensor Tensor::__lshift__(const Tensor & other) const { + return type().__lshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(Scalar other) { + return type().__ilshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(const Tensor & other) { + return type().__ilshift__(*this, other); +} +inline Tensor Tensor::__rshift__(Scalar other) const { + return type().__rshift__(*this, other); +} +inline Tensor Tensor::__rshift__(const Tensor & other) const { + return type().__rshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(Scalar other) { + return type().__irshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(const Tensor & other) { + return type().__irshift__(*this, other); +} +inline Tensor Tensor::lt(Scalar other) const { + return type().lt(*this, other); +} +inline Tensor Tensor::lt(const Tensor & other) const { + return type().lt(*this, other); +} +inline Tensor & Tensor::lt_(Scalar other) { + return type().lt_(*this, other); +} +inline Tensor & Tensor::lt_(const Tensor & other) { + return type().lt_(*this, other); +} +inline Tensor Tensor::gt(Scalar other) const { + return type().gt(*this, other); +} +inline Tensor Tensor::gt(const Tensor & other) const { + return type().gt(*this, other); +} +inline Tensor & Tensor::gt_(Scalar other) { + return type().gt_(*this, other); +} +inline Tensor & Tensor::gt_(const Tensor & other) { + return type().gt_(*this, other); +} +inline Tensor Tensor::le(Scalar other) const { + return type().le(*this, other); +} +inline Tensor Tensor::le(const Tensor & other) const { + return type().le(*this, other); +} +inline Tensor & Tensor::le_(Scalar other) { + return type().le_(*this, other); +} +inline Tensor & Tensor::le_(const Tensor & other) { + return type().le_(*this, other); +} +inline Tensor Tensor::ge(Scalar other) const { + return type().ge(*this, other); +} +inline Tensor Tensor::ge(const Tensor & other) const { + return type().ge(*this, other); +} +inline Tensor & Tensor::ge_(Scalar other) { + return type().ge_(*this, other); +} +inline Tensor & Tensor::ge_(const Tensor & other) { + return 
type().ge_(*this, other); +} +inline Tensor Tensor::eq(Scalar other) const { + return type().eq(*this, other); +} +inline Tensor Tensor::eq(const Tensor & other) const { + return type().eq(*this, other); +} +inline Tensor & Tensor::eq_(Scalar other) { + return type().eq_(*this, other); +} +inline Tensor & Tensor::eq_(const Tensor & other) { + return type().eq_(*this, other); +} +inline Tensor Tensor::ne(Scalar other) const { + return type().ne(*this, other); +} +inline Tensor Tensor::ne(const Tensor & other) const { + return type().ne(*this, other); +} +inline Tensor & Tensor::ne_(Scalar other) { + return type().ne_(*this, other); +} +inline Tensor & Tensor::ne_(const Tensor & other) { + return type().ne_(*this, other); +} +inline Tensor Tensor::min(const Tensor & other) const { + return type().min(*this, other); +} +inline Tensor Tensor::min() const { + return type().min(*this); +} +inline Tensor Tensor::max(const Tensor & other) const { + return type().max(*this, other); +} +inline Tensor Tensor::max() const { + return type().max(*this); +} +inline Tensor Tensor::median() const { + return type().median(*this); +} +inline std::tuple Tensor::sort(int64_t dim, bool descending) const { + return type().sort(*this, dim, descending); +} +inline std::tuple Tensor::topk(int64_t k, int64_t dim, bool largest, bool sorted) const { + return type().topk(*this, k, dim, largest, sorted); +} +inline Tensor Tensor::all() const { + return type().all(*this); +} +inline Tensor Tensor::any() const { + return type().any(*this); +} +inline Tensor Tensor::lgamma() const { + return type().lgamma(*this); +} +inline Tensor & Tensor::lgamma_() { + return type().lgamma_(*this); +} +inline Tensor Tensor::digamma() const { + return type().digamma(*this); +} +inline Tensor & Tensor::digamma_() { + return type().digamma_(*this); +} +inline Tensor Tensor::polygamma(int64_t n) const { + return type().polygamma(n, *this); +} +inline Tensor & Tensor::polygamma_(int64_t n) { + return type().polygamma_(*this, n); +} +inline Tensor & Tensor::erfinv_() { + return type().erfinv_(*this); +} +inline Tensor Tensor::erfinv() const { + return type().erfinv(*this); +} +inline Tensor & Tensor::frac_() { + return type().frac_(*this); +} +inline Tensor Tensor::frac() const { + return type().frac(*this); +} +inline Tensor Tensor::renorm(Scalar p, int64_t dim, Scalar maxnorm) const { + return type().renorm(*this, p, dim, maxnorm); +} +inline Tensor & Tensor::renorm_(Scalar p, int64_t dim, Scalar maxnorm) { + return type().renorm_(*this, p, dim, maxnorm); +} +inline Tensor Tensor::dist(const Tensor & other, Scalar p) const { + return type().dist(*this, other, p); +} +inline Tensor Tensor::reciprocal() const { + return type().reciprocal(*this); +} +inline Tensor & Tensor::reciprocal_() { + return type().reciprocal_(*this); +} +inline Tensor Tensor::neg() const { + return type().neg(*this); +} +inline Tensor & Tensor::neg_() { + return type().neg_(*this); +} +inline Tensor Tensor::atan2(const Tensor & other) const { + return type().atan2(*this, other); +} +inline Tensor & Tensor::atan2_(const Tensor & other) { + return type().atan2_(*this, other); +} +inline Tensor Tensor::pow(const Tensor & exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::pow_(Scalar exponent) { + return type().pow_(*this, exponent); +} +inline Tensor & Tensor::pow_(const Tensor & exponent) { + return type().pow_(*this, exponent); +} +inline Tensor Tensor::lerp(const Tensor & end, Scalar weight) const { + return type().lerp(*this, end, 
weight); +} +inline Tensor & Tensor::lerp_(const Tensor & end, Scalar weight) { + return type().lerp_(*this, end, weight); +} +inline Tensor Tensor::histc(int64_t bins, Scalar min, Scalar max) const { + return type().histc(*this, bins, min, max); +} +inline Tensor Tensor::sign() const { + return type().sign(*this); +} +inline Tensor & Tensor::sign_() { + return type().sign_(*this); +} +inline Tensor Tensor::trace() const { + return type().trace(*this); +} +inline Tensor Tensor::fmod(Scalar other) const { + return type().fmod(*this, other); +} +inline Tensor Tensor::fmod(const Tensor & other) const { + return type().fmod(*this, other); +} +inline Tensor & Tensor::fmod_(Scalar other) { + return type().fmod_(*this, other); +} +inline Tensor & Tensor::fmod_(const Tensor & other) { + return type().fmod_(*this, other); +} +inline Tensor Tensor::remainder(Scalar other) const { + return type().remainder(*this, other); +} +inline Tensor Tensor::remainder(const Tensor & other) const { + return type().remainder(*this, other); +} +inline Tensor & Tensor::remainder_(Scalar other) { + return type().remainder_(*this, other); +} +inline Tensor & Tensor::remainder_(const Tensor & other) { + return type().remainder_(*this, other); +} +inline Tensor Tensor::tril(int64_t diagonal) const { + return type().tril(*this, diagonal); +} +inline Tensor & Tensor::tril_(int64_t diagonal) { + return type().tril_(*this, diagonal); +} +inline Tensor Tensor::triu(int64_t diagonal) const { + return type().triu(*this, diagonal); +} +inline Tensor & Tensor::triu_(int64_t diagonal) { + return type().triu_(*this, diagonal); +} +inline Tensor Tensor::cross(const Tensor & other, int64_t dim) const { + return type().cross(*this, other, dim); +} +inline Tensor Tensor::diag(int64_t diagonal) const { + return type().diag(*this, diagonal); +} +inline Tensor Tensor::addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().addbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().addbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcmul(*this, tensor1, tensor2, value); +} +inline Tensor & Tensor::addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcmul_(*this, tensor1, tensor2, value); +} +inline Tensor Tensor::addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcdiv(*this, tensor1, tensor2, value); +} +inline Tensor & Tensor::addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcdiv_(*this, tensor1, tensor2, value); +} +inline std::tuple Tensor::gels(const Tensor & A) const { + return type().gels(*this, A); +} +inline std::tuple Tensor::trtrs(const Tensor & A, bool upper, bool transpose, bool unitriangular) const { + return type().trtrs(*this, A, upper, transpose, unitriangular); +} +inline std::tuple Tensor::symeig(bool eigenvectors, bool upper) const { + return type().symeig(*this, eigenvectors, upper); +} +inline std::tuple Tensor::eig(bool eigenvectors) const { + return type().eig(*this, eigenvectors); +} +inline std::tuple Tensor::svd(bool some) const { + return type().svd(*this, some); +} +inline Tensor Tensor::potrf(bool upper) const { + return type().potrf(*this, upper); +} +inline Tensor Tensor::potrs(const Tensor & input2, bool upper) 
const { + return type().potrs(*this, input2, upper); +} +inline Tensor Tensor::potri(bool upper) const { + return type().potri(*this, upper); +} +inline std::tuple Tensor::pstrf(bool upper, Scalar tol) const { + return type().pstrf(*this, upper, tol); +} +inline std::tuple Tensor::qr() const { + return type().qr(*this); +} +inline std::tuple Tensor::geqrf() const { + return type().geqrf(*this); +} +inline Tensor Tensor::orgqr(const Tensor & input2) const { + return type().orgqr(*this, input2); +} +inline Tensor Tensor::ormqr(const Tensor & input2, const Tensor & input3, bool left, bool transpose) const { + return type().ormqr(*this, input2, input3, left, transpose); +} +inline std::tuple Tensor::btrifact(bool pivot) const { + return type().btrifact(*this, pivot); +} +inline std::tuple Tensor::btrifact_with_info(bool pivot) const { + return type().btrifact_with_info(*this, pivot); +} +inline Tensor Tensor::btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const { + return type().btrisolve(*this, LU_data, LU_pivots); +} +inline Tensor & Tensor::random_(int64_t from, int64_t to, Generator * generator) { + return type().random_(*this, from, to, generator); +} +inline Tensor & Tensor::random_(int64_t to, Generator * generator) { + return type().random_(*this, to, generator); +} +inline Tensor & Tensor::random_(Generator * generator) { + return type().random_(*this, generator); +} +inline Tensor Tensor::multinomial(int64_t num_samples, bool replacement, Generator * generator) const { + return type().multinomial(*this, num_samples, replacement, generator); +} +inline Tensor & Tensor::uniform_(double from, double to, Generator * generator) { + return type().uniform_(*this, from, to, generator); +} +inline Tensor & Tensor::normal_(double mean, double std, Generator * generator) { + return type().normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::cauchy_(double median, double sigma, Generator * generator) { + return type().cauchy_(*this, median, sigma, generator); +} +inline Tensor & Tensor::log_normal_(double mean, double std, Generator * generator) { + return type().log_normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::exponential_(double lambd, Generator * generator) { + return type().exponential_(*this, lambd, generator); +} +inline Tensor & Tensor::geometric_(double p, Generator * generator) { + return type().geometric_(*this, p, generator); +} +inline Tensor Tensor::abs() const { + return type().abs(*this); +} +inline Tensor & Tensor::abs_() { + return type().abs_(*this); +} +inline Tensor Tensor::acos() const { + return type().acos(*this); +} +inline Tensor & Tensor::acos_() { + return type().acos_(*this); +} +inline Tensor Tensor::add(const Tensor & other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(const Tensor & other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::add(Scalar other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(Scalar other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::addmv(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const { + return type().addmv(*this, mat, vec, beta, alpha); +} +inline Tensor & Tensor::addmv_(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return type().addmv_(*this, mat, vec, beta, alpha); +} +inline Tensor Tensor::addr(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) 
const { + return type().addr(*this, vec1, vec2, beta, alpha); +} +inline Tensor & Tensor::addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return type().addr_(*this, vec1, vec2, beta, alpha); +} +inline Tensor Tensor::all(int64_t dim, bool keepdim) const { + return type().all(*this, dim, keepdim); +} +inline bool Tensor::allclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().allclose(*this, other, rtol, atol, equal_nan); +} +inline Tensor Tensor::any(int64_t dim, bool keepdim) const { + return type().any(*this, dim, keepdim); +} +inline Tensor Tensor::argmax(int64_t dim, bool keepdim) const { + return type().argmax(*this, dim, keepdim); +} +inline Tensor Tensor::argmax() const { + return type().argmax(*this); +} +inline Tensor Tensor::argmin(int64_t dim, bool keepdim) const { + return type().argmin(*this, dim, keepdim); +} +inline Tensor Tensor::argmin() const { + return type().argmin(*this); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride) const { + return type().as_strided(*this, size, stride); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride) { + return type().as_strided_(*this, size, stride); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride, int64_t storage_offset) const { + return type().as_strided(*this, size, stride, storage_offset); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride, int64_t storage_offset) { + return type().as_strided_(*this, size, stride, storage_offset); +} +inline Tensor Tensor::asin() const { + return type().asin(*this); +} +inline Tensor & Tensor::asin_() { + return type().asin_(*this); +} +inline Tensor Tensor::atan() const { + return type().atan(*this); +} +inline Tensor & Tensor::atan_() { + return type().atan_(*this); +} +inline Tensor Tensor::baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().baddbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().baddbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::bernoulli(const Tensor & p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} +inline Tensor Tensor::bernoulli(double p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} +inline Tensor Tensor::bernoulli() const { + return type().bernoulli(*this); +} +inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_(double p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_() { + return type().bernoulli_(*this); +} +inline Tensor Tensor::bincount(const Tensor & weights, int64_t minlength) const { + return type().bincount(*this, weights, minlength); +} +inline Tensor Tensor::bmm(const Tensor & mat2) const { + return type().bmm(*this, mat2); +} +inline Tensor Tensor::ceil() const { + return type().ceil(*this); +} +inline Tensor & Tensor::ceil_() { + return type().ceil_(*this); +} +inline std::vector Tensor::chunk(int64_t chunks, int64_t dim) const { + return type().chunk(*this, chunks, dim); +} +inline Tensor Tensor::clamp(Scalar min, Scalar max) const { + return type().clamp(*this, min, max); +} +inline Tensor & Tensor::clamp_(Scalar min, Scalar max) { + return type().clamp_(*this, min, max); +} +inline 
Tensor Tensor::clamp_max(Scalar max) const { + return type().clamp_max(*this, max); +} +inline Tensor & Tensor::clamp_max_(Scalar max) { + return type().clamp_max_(*this, max); +} +inline Tensor Tensor::clamp_min(Scalar min) const { + return type().clamp_min(*this, min); +} +inline Tensor & Tensor::clamp_min_(Scalar min) { + return type().clamp_min_(*this, min); +} +inline Tensor Tensor::cos() const { + return type().cos(*this); +} +inline Tensor & Tensor::cos_() { + return type().cos_(*this); +} +inline Tensor Tensor::cosh() const { + return type().cosh(*this); +} +inline Tensor & Tensor::cosh_() { + return type().cosh_(*this); +} +inline Tensor Tensor::cumsum(int64_t dim, ScalarType dtype) const { + return type().cumsum(*this, dim, dtype); +} +inline Tensor Tensor::cumsum(int64_t dim) const { + return type().cumsum(*this, dim); +} +inline Tensor Tensor::cumprod(int64_t dim, ScalarType dtype) const { + return type().cumprod(*this, dim, dtype); +} +inline Tensor Tensor::cumprod(int64_t dim) const { + return type().cumprod(*this, dim); +} +inline Tensor Tensor::det() const { + return type().det(*this); +} +inline Tensor Tensor::diagflat(int64_t offset) const { + return type().diagflat(*this, offset); +} +inline Tensor Tensor::diagonal(int64_t offset, int64_t dim1, int64_t dim2) const { + return type().diagonal(*this, offset, dim1, dim2); +} +inline Tensor Tensor::div(const Tensor & other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(const Tensor & other) { + return type().div_(*this, other); +} +inline Tensor Tensor::div(Scalar other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(Scalar other) { + return type().div_(*this, other); +} +inline Tensor Tensor::dot(const Tensor & tensor) const { + return type().dot(*this, tensor); +} +inline Tensor Tensor::erf() const { + return type().erf(*this); +} +inline Tensor & Tensor::erf_() { + return type().erf_(*this); +} +inline Tensor Tensor::erfc() const { + return type().erfc(*this); +} +inline Tensor & Tensor::erfc_() { + return type().erfc_(*this); +} +inline Tensor Tensor::exp() const { + return type().exp(*this); +} +inline Tensor & Tensor::exp_() { + return type().exp_(*this); +} +inline Tensor Tensor::expm1() const { + return type().expm1(*this); +} +inline Tensor & Tensor::expm1_() { + return type().expm1_(*this); +} +inline Tensor Tensor::expand(IntList size, bool implicit) const { + return type().expand(*this, size, implicit); +} +inline Tensor Tensor::expand_as(const Tensor & other) const { + return type().expand_as(*this, other); +} +inline Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim) const { + return type().flatten(*this, start_dim, end_dim); +} +inline Tensor & Tensor::fill_(Scalar value) { + return type().fill_(*this, value); +} +inline Tensor & Tensor::fill_(const Tensor & value) { + return type().fill_(*this, value); +} +inline Tensor Tensor::floor() const { + return type().floor(*this); +} +inline Tensor & Tensor::floor_() { + return type().floor_(*this); +} +inline Tensor Tensor::ger(const Tensor & vec2) const { + return type().ger(*this, vec2); +} +inline std::tuple Tensor::gesv(const Tensor & A) const { + return type().gesv(*this, A); +} +inline Tensor Tensor::fft(int64_t signal_ndim, bool normalized) const { + return type().fft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::ifft(int64_t signal_ndim, bool normalized) const { + return type().ifft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::rfft(int64_t signal_ndim, bool 
normalized, bool onesided) const { + return type().rfft(*this, signal_ndim, normalized, onesided); +} +inline Tensor Tensor::irfft(int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const { + return type().irfft(*this, signal_ndim, normalized, onesided, signal_sizes); +} +inline Tensor Tensor::index(TensorList indices) const { + return type().index(*this, indices); +} +inline Tensor & Tensor::index_copy_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_copy_(*this, dim, index, source); +} +inline Tensor Tensor::index_put(TensorList indices, const Tensor & values) const { + return type().index_put(*this, indices, values); +} +inline Tensor & Tensor::index_put_(TensorList indices, const Tensor & values) { + return type().index_put_(*this, indices, values); +} +inline Tensor Tensor::inverse() const { + return type().inverse(*this); +} +inline Tensor Tensor::isclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().isclose(*this, other, rtol, atol, equal_nan); +} +inline bool Tensor::is_cuda() const { + return type().is_cuda(*this); +} +inline bool Tensor::is_distributed() const { + return type().is_distributed(*this); +} +inline bool Tensor::is_floating_point() const { + return type().is_floating_point(*this); +} +inline bool Tensor::is_complex() const { + return type().is_complex(*this); +} +inline bool Tensor::is_nonzero() const { + return type().is_nonzero(*this); +} +inline bool Tensor::is_same_size(const Tensor & other) const { + return type().is_same_size(*this, other); +} +inline bool Tensor::is_signed() const { + return type().is_signed(*this); +} +inline bool Tensor::is_sparse() const { + return type().is_sparse(*this); +} +inline std::tuple Tensor::kthvalue(int64_t k, int64_t dim, bool keepdim) const { + return type().kthvalue(*this, k, dim, keepdim); +} +inline Tensor Tensor::log() const { + return type().log(*this); +} +inline Tensor & Tensor::log_() { + return type().log_(*this); +} +inline Tensor Tensor::log10() const { + return type().log10(*this); +} +inline Tensor & Tensor::log10_() { + return type().log10_(*this); +} +inline Tensor Tensor::log1p() const { + return type().log1p(*this); +} +inline Tensor & Tensor::log1p_() { + return type().log1p_(*this); +} +inline Tensor Tensor::log2() const { + return type().log2(*this); +} +inline Tensor & Tensor::log2_() { + return type().log2_(*this); +} +inline Tensor Tensor::logdet() const { + return type().logdet(*this); +} +inline Tensor Tensor::log_softmax(int64_t dim) const { + return type().log_softmax(*this, dim); +} +inline Tensor Tensor::logsumexp(int64_t dim, bool keepdim) const { + return type().logsumexp(*this, dim, keepdim); +} +inline Tensor Tensor::matmul(const Tensor & other) const { + return type().matmul(*this, other); +} +inline Tensor Tensor::matrix_power(int64_t n) const { + return type().matrix_power(*this, n); +} +inline std::tuple Tensor::max(int64_t dim, bool keepdim) const { + return type().max(*this, dim, keepdim); +} +inline Tensor Tensor::max_values(int64_t dim, bool keepdim) const { + return type().max_values(*this, dim, keepdim); +} +inline Tensor Tensor::mean(ScalarType dtype) const { + return type().mean(*this, dtype); +} +inline Tensor Tensor::mean() const { + return type().mean(*this); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().mean(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim) const { + return type().mean(*this, 
dim, keepdim); +} +inline Tensor Tensor::mean(int64_t dim, ScalarType dtype) const { + return type().mean(*this, dim, dtype); +} +inline std::tuple Tensor::median(int64_t dim, bool keepdim) const { + return type().median(*this, dim, keepdim); +} +inline std::tuple Tensor::min(int64_t dim, bool keepdim) const { + return type().min(*this, dim, keepdim); +} +inline Tensor Tensor::min_values(int64_t dim, bool keepdim) const { + return type().min_values(*this, dim, keepdim); +} +inline Tensor Tensor::mm(const Tensor & mat2) const { + return type().mm(*this, mat2); +} +inline std::tuple Tensor::mode(int64_t dim, bool keepdim) const { + return type().mode(*this, dim, keepdim); +} +inline Tensor Tensor::mul(const Tensor & other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(const Tensor & other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mul(Scalar other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(Scalar other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mv(const Tensor & vec) const { + return type().mv(*this, vec); +} +inline Tensor Tensor::mvlgamma(int64_t p) const { + return type().mvlgamma(*this, p); +} +inline Tensor & Tensor::mvlgamma_(int64_t p) { + return type().mvlgamma_(*this, p); +} +inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const { + return type().narrow(*this, dim, start, length); +} +inline Tensor Tensor::permute(IntList dims) const { + return type().permute(*this, dims); +} +inline Tensor Tensor::pin_memory() const { + return type().pin_memory(*this); +} +inline Tensor Tensor::pinverse(double rcond) const { + return type().pinverse(*this, rcond); +} +inline Tensor Tensor::repeat(IntList repeats) const { + return type().repeat(*this, repeats); +} +inline Tensor Tensor::reshape(IntList shape) const { + return type().reshape(*this, shape); +} +inline Tensor Tensor::reshape_as(const Tensor & other) const { + return type().reshape_as(*this, other); +} +inline Tensor Tensor::round() const { + return type().round(*this); +} +inline Tensor & Tensor::round_() { + return type().round_(*this); +} +inline Tensor Tensor::relu() const { + return type().relu(*this); +} +inline Tensor & Tensor::relu_() { + return type().relu_(*this); +} +inline Tensor Tensor::hardshrink(Scalar lambd) const { + return type().hardshrink(*this, lambd); +} +inline Tensor Tensor::hardshrink_backward(const Tensor & grad_out, Scalar lambd) const { + return type().hardshrink_backward(grad_out, *this, lambd); +} +inline Tensor Tensor::rsqrt() const { + return type().rsqrt(*this); +} +inline Tensor & Tensor::rsqrt_() { + return type().rsqrt_(*this); +} +inline Tensor Tensor::select(int64_t dim, int64_t index) const { + return type().select(*this, dim, index); +} +inline Tensor Tensor::sigmoid() const { + return type().sigmoid(*this); +} +inline Tensor & Tensor::sigmoid_() { + return type().sigmoid_(*this); +} +inline Tensor Tensor::sin() const { + return type().sin(*this); +} +inline Tensor & Tensor::sin_() { + return type().sin_(*this); +} +inline Tensor Tensor::sinh() const { + return type().sinh(*this); +} +inline Tensor & Tensor::sinh_() { + return type().sinh_(*this); +} +inline Tensor Tensor::detach() const { + return type().detach(*this); +} +inline Tensor & Tensor::detach_() { + return type().detach_(*this); +} +inline int64_t Tensor::size(int64_t dim) const { + return type().size(*this, dim); +} +inline Tensor Tensor::slice(int64_t dim, int64_t start, int64_t end, int64_t step) const 
{ + return type().slice(*this, dim, start, end, step); +} +inline std::tuple Tensor::slogdet() const { + return type().slogdet(*this); +} +inline Tensor Tensor::smm(const Tensor & mat2) const { + return type().smm(*this, mat2); +} +inline Tensor Tensor::softmax(int64_t dim) const { + return type().softmax(*this, dim); +} +inline std::vector Tensor::split(int64_t split_size, int64_t dim) const { + return type().split(*this, split_size, dim); +} +inline std::vector Tensor::split_with_sizes(IntList split_sizes, int64_t dim) const { + return type().split_with_sizes(*this, split_sizes, dim); +} +inline Tensor Tensor::squeeze() const { + return type().squeeze(*this); +} +inline Tensor Tensor::squeeze(int64_t dim) const { + return type().squeeze(*this, dim); +} +inline Tensor & Tensor::squeeze_() { + return type().squeeze_(*this); +} +inline Tensor & Tensor::squeeze_(int64_t dim) { + return type().squeeze_(*this, dim); +} +inline Tensor Tensor::sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().sspaddmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor Tensor::stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const { + return type().stft(*this, n_fft, hop_length, win_length, window, normalized, onesided); +} +inline int64_t Tensor::stride(int64_t dim) const { + return type().stride(*this, dim); +} +inline Tensor Tensor::sum(ScalarType dtype) const { + return type().sum(*this, dtype); +} +inline Tensor Tensor::sum() const { + return type().sum(*this); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim, ScalarType dtype) const { + return type().sum(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim) const { + return type().sum(*this, dim, keepdim); +} +inline Tensor Tensor::sum(IntList dim, ScalarType dtype) const { + return type().sum(*this, dim, dtype); +} +inline Tensor Tensor::sqrt() const { + return type().sqrt(*this); +} +inline Tensor & Tensor::sqrt_() { + return type().sqrt_(*this); +} +inline Tensor Tensor::std(bool unbiased) const { + return type().std(*this, unbiased); +} +inline Tensor Tensor::std(int64_t dim, bool unbiased, bool keepdim) const { + return type().std(*this, dim, unbiased, keepdim); +} +inline Tensor Tensor::prod(ScalarType dtype) const { + return type().prod(*this, dtype); +} +inline Tensor Tensor::prod() const { + return type().prod(*this); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().prod(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim) const { + return type().prod(*this, dim, keepdim); +} +inline Tensor Tensor::prod(int64_t dim, ScalarType dtype) const { + return type().prod(*this, dim, dtype); +} +inline Tensor Tensor::t() const { + return type().t(*this); +} +inline Tensor & Tensor::t_() { + return type().t_(*this); +} +inline Tensor Tensor::tan() const { + return type().tan(*this); +} +inline Tensor & Tensor::tan_() { + return type().tan_(*this); +} +inline Tensor Tensor::tanh() const { + return type().tanh(*this); +} +inline Tensor & Tensor::tanh_() { + return type().tanh_(*this); +} +inline Tensor Tensor::transpose(int64_t dim0, int64_t dim1) const { + return type().transpose(*this, dim0, dim1); +} +inline Tensor & Tensor::transpose_(int64_t dim0, int64_t dim1) { + return type().transpose_(*this, dim0, dim1); +} +inline Tensor Tensor::flip(IntList dims) const { + return type().flip(*this, dims); +} +inline Tensor 
Tensor::rot90(int64_t k, IntList dims) const { + return type().rot90(*this, k, dims); +} +inline Tensor Tensor::trunc() const { + return type().trunc(*this); +} +inline Tensor & Tensor::trunc_() { + return type().trunc_(*this); +} +inline Tensor Tensor::type_as(const Tensor & other) const { + return type().type_as(*this, other); +} +inline Tensor Tensor::unsqueeze(int64_t dim) const { + return type().unsqueeze(*this, dim); +} +inline Tensor & Tensor::unsqueeze_(int64_t dim) { + return type().unsqueeze_(*this, dim); +} +inline Tensor Tensor::var(bool unbiased) const { + return type().var(*this, unbiased); +} +inline Tensor Tensor::var(int64_t dim, bool unbiased, bool keepdim) const { + return type().var(*this, dim, unbiased, keepdim); +} +inline Tensor Tensor::view_as(const Tensor & other) const { + return type().view_as(*this, other); +} +inline Tensor Tensor::where(const Tensor & condition, const Tensor & other) const { + return type().where(condition, *this, other); +} +inline Tensor Tensor::norm(Scalar p) const { + return type().norm(*this, p); +} +inline Tensor Tensor::norm(Scalar p, int64_t dim, bool keepdim) const { + return type().norm(*this, p, dim, keepdim); +} +inline Tensor Tensor::clone() const { + return type().clone(*this); +} +inline Tensor & Tensor::resize_as_(const Tensor & the_template) { + return type().resize_as_(*this, the_template); +} +inline Tensor Tensor::pow(Scalar exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::zero_() { + return type().zero_(*this); +} +inline Tensor Tensor::sub(const Tensor & other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(const Tensor & other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::sub(Scalar other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(Scalar other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().addmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor & Tensor::addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return type().addmm_(*this, mat1, mat2, beta, alpha); +} +inline Tensor & Tensor::sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_(*this, size, sparseDims, denseDims); +} +inline Tensor & Tensor::sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_and_clear_(*this, size, sparseDims, denseDims); +} +inline Tensor Tensor::sparse_mask(SparseTensorRef mask) const { + return type().sparse_mask(*this, mask); +} +inline Tensor Tensor::to_dense() const { + return type().to_dense(*this); +} +inline int64_t Tensor::_sparseDims() const { + return type()._sparseDims(*this); +} +inline int64_t Tensor::_denseDims() const { + return type()._denseDims(*this); +} +inline int64_t Tensor::_nnz() const { + return type()._nnz(*this); +} +inline Tensor Tensor::coalesce() const { + return type().coalesce(*this); +} +inline bool Tensor::is_coalesced() const { + return type().is_coalesced(*this); +} +inline Tensor Tensor::_indices() const { + return type()._indices(*this); +} +inline Tensor Tensor::_values() const { + return type()._values(*this); +} +inline int64_t Tensor::numel() const { + return type().numel(*this); +} +inline std::vector Tensor::unbind(int64_t dim) const { + return 
type().unbind(*this, dim); +} +inline int64_t Tensor::get_device() const { + return type().get_device(*this); +} +inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) const { + return type().to(*this, device, dtype, non_blocking); +} +inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { + return type().to(*this, dtype, non_blocking); +} +inline Tensor Tensor::to(Device device, bool non_blocking) const { + return type().to(*this, device, non_blocking); +} +inline Tensor Tensor::to(const Tensor & other, bool non_blocking) const { + return type().to(*this, other, non_blocking); +} +inline Scalar Tensor::_local_scalar() const { + return type()._local_scalar(*this); +} + +#define DEFINE_CAST(T, name, _) \ + template <> \ + inline T* Tensor::data() const { \ + AT_CHECK( \ + type().scalarType() == ScalarType::name, \ + "expected scalar type ", \ + #name, \ + " but found ", \ + at::toString(type().scalarType())); \ + return static_cast(this->data_ptr()); \ + } \ + inline T* Tensor::to##name##Data() const { \ + return data(); \ + } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) +#undef DEFINE_CAST + +#define DEFINE_TO_C_TYPE(T,name,_) \ +inline T Tensor::toC##name () const { return _local_scalar().to##name (); } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) +#undef DEFINE_TO_C_TYPE + +} //namespace at diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h new file mode 100644 index 00000000000000..6284e7634bb755 --- /dev/null +++ b/aten/src/ATen/core/Type.h @@ -0,0 +1,647 @@ +#pragma once + +#include "ATen/core/ATenGeneral.h" +#include "ATen/core/Allocator.h" +#include "ATen/core/Deprecated.h" +#include "ATen/core/Generator.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Tensor.h" +#include "ATen/core/ArrayRef.h" +#include "ATen/core/Half.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/Reduction.h" +#include "ATen/core/TensorOptions.h" + +#include +#include +#include +#include +#include + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Allocator; +struct Generator; +struct Storage; + +static inline void noop_deleter(void*) {} + +enum class TypeID { + CPUByte, + CPUChar, + CPUDouble, + CPUFloat, + CPUInt, + CPULong, + CPUShort, + CPUHalf, + SparseCPUByte, + SparseCPUChar, + SparseCPUDouble, + SparseCPUFloat, + SparseCPUInt, + SparseCPULong, + SparseCPUShort, + CUDAByte, + CUDAChar, + CUDADouble, + CUDAFloat, + CUDAInt, + CUDALong, + CUDAShort, + CUDAHalf, + SparseCUDAByte, + SparseCUDAChar, + SparseCUDADouble, + SparseCUDAFloat, + SparseCUDAInt, + SparseCUDALong, + SparseCUDAShort, + CPUComplexFloat, + CPUComplexDouble, + CUDAComplexFloat, + CUDAComplexDouble, + Undefined, + NumOptions +}; + +struct AT_API Type { + explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} + + virtual ~Type() {} + virtual ScalarType scalarType() const = 0; + virtual Backend backend() const = 0; + Layout layout() const noexcept { return layout_from_backend(backend()); } + virtual bool is_cuda() const = 0; + virtual bool is_sparse() const = 0; + virtual bool is_distributed() const = 0; + bool is_variable() const noexcept { return is_variable_; } + bool is_undefined() const 
noexcept { return is_undefined_; } + virtual Allocator * allocator() const = 0; + virtual Device getDeviceFromPtr(void * data) const = 0; + virtual Storage storage(bool resizable = false) const = 0; + virtual Storage storage(size_t size, bool resizable = false) const = 0; + virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; + virtual Storage storageWithAllocator(int64_t size, Allocator* allocator) const = 0; + virtual std::unique_ptr generator() const = 0; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0; + virtual Storage unsafeStorageFromTH(void * th_pointer, bool retain) const = 0; + virtual const char * toString() const = 0; + virtual size_t elementSizeInBytes() const = 0; + virtual Type & toBackend(Backend b) const = 0; + virtual Type & toScalarType(ScalarType s) const = 0; + Type & toSparse() const { + return this->toBackend(at::toSparse(this->backend())); + } + Type & toDense() const { + return this->toBackend(at::toDense(this->backend())); + } + Type & cpu() const { + return this->toBackend(at::backendToCPU(this->backend())); + } + Type & cuda() const { + return this->toBackend(at::backendToCUDA(this->backend())); + } + // contiguous IDs for all types in the system + // for external dispatch + virtual TypeID ID() const = 0; + + // New-style TensorTypeId that supports open registration. + TensorTypeId type_id() const { return type_id_; } + + // NB: This will return DeviceType::CPU for Backend::SparseCPU + DeviceType device_type() const { + return backendToDeviceType(backend()); + } + + virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const = 0; + virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0; + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; + + virtual void backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const = 0; + virtual void set_data(Tensor & self, Tensor new_data) const = 0; + + virtual Tensor tensorFromBlob(void * data, IntList sizes, const std::function & deleter=noop_deleter) const = 0; + virtual Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter=noop_deleter) const = 0; + virtual Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const = 0; + virtual Tensor tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const = 0; + virtual Tensor scalarTensor(Scalar s) const = 0; + + bool operator==(const Type& other) const { + return this == &other; + } + bool operator!=(const Type& other) const { + return this != &other; + } + + /// Constructs the `TensorOptions` from a type and a `device_index`. 
+ TensorOptions options(int32_t device_index = -1) const { + TensorOptions r; + r.dtype(scalarType()); + r.device({backendToDeviceType(backend()), device_index}); + r.layout(layout()); + r.is_variable(is_variable()); + return r; + } + + operator TensorOptions() const { + return options(); + } + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) = 0; + virtual int64_t storage_offset(const Tensor & self) const = 0; + virtual Tensor & resize_(Tensor & self, IntList size) const = 0; + virtual Tensor & set_(Tensor & self, Storage source) const = 0; + virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride={}) const = 0; + virtual Tensor & set_(Tensor & self, const Tensor & source) const = 0; + virtual Tensor & set_(Tensor & self) const = 0; + virtual bool is_contiguous(const Tensor & self) const = 0; + virtual bool is_set_to(const Tensor & self, const Tensor & tensor) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & s_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor & masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor s_masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor nonzero(const Tensor & self) const = 0; + virtual Tensor contiguous(const Tensor & self) const = 0; + virtual Tensor view(const Tensor & self, IntList size) const = 0; + virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual Tensor take(const Tensor & self, const Tensor & index) const = 0; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate=false) const = 0; + virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const = 0; + virtual Tensor unfold(const Tensor & self, int64_t dimension, int64_t size, int64_t step) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & scatter_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual void* data_ptr(const Tensor & self) const = 0; + virtual bool equal(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __iand__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___iand__(Tensor & self, const Tensor & other) 
const = 0; + virtual Tensor & __iand__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ge_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ge_(Tensor & self, const Tensor & other) const = 0; + 
virtual Tensor & ge_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self) const = 0; + virtual Tensor s_max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self) const = 0; + virtual Tensor median(const Tensor & self) const = 0; + virtual std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false) const = 0; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const = 0; + virtual Tensor all(const Tensor & self) const = 0; + virtual Tensor any(const Tensor & self) const = 0; + virtual Tensor lgamma(const Tensor & self) const = 0; + virtual Tensor & lgamma_(Tensor & self) const = 0; + virtual Tensor digamma(const Tensor & self) const = 0; + virtual Tensor & digamma_(Tensor & self) const = 0; + virtual Tensor polygamma(int64_t n, const Tensor & self) const = 0; + virtual Tensor & polygamma_(Tensor & self, int64_t n) const = 0; + virtual Tensor & erfinv_(Tensor & self) const = 0; + virtual Tensor erfinv(const Tensor & self) const = 0; + virtual Tensor & frac_(Tensor & self) const = 0; + virtual Tensor frac(const Tensor & self) const = 0; + virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; + virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; + virtual Tensor reciprocal(const Tensor & self) const = 0; + virtual Tensor & reciprocal_(Tensor & self) const = 0; + virtual Tensor neg(const Tensor & self) const = 0; + virtual Tensor & neg_(Tensor & self) const = 0; + virtual Tensor s_atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & s_atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(Scalar base, const Tensor & self) const = 0; + virtual Tensor & pow_(Tensor & self, Scalar exponent) const = 0; + virtual Tensor & s_pow_(Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor & 
pow_(Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor s_lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const = 0; + virtual Tensor sign(const Tensor & self) const = 0; + virtual Tensor & sign_(Tensor & self) const = 0; + virtual Tensor trace(const Tensor & self) const = 0; + virtual Tensor fmod(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor tril(const Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor & tril_(Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor triu(const Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor & triu_(Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1) const = 0; + virtual Tensor diag(const Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual std::tuple gels(const Tensor & self, const Tensor & A) const = 0; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool 
transpose=false, bool unitriangular=false) const = 0; + virtual std::tuple symeig(const Tensor & self, bool eigenvectors=false, bool upper=true) const = 0; + virtual std::tuple eig(const Tensor & self, bool eigenvectors=false) const = 0; + virtual std::tuple svd(const Tensor & self, bool some=true) const = 0; + virtual Tensor potrf(const Tensor & self, bool upper=true) const = 0; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true) const = 0; + virtual Tensor potri(const Tensor & self, bool upper=true) const = 0; + virtual std::tuple pstrf(const Tensor & self, bool upper=true, Scalar tol=-1) const = 0; + virtual std::tuple qr(const Tensor & self) const = 0; + virtual std::tuple geqrf(const Tensor & self) const = 0; + virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const = 0; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const = 0; + virtual std::tuple btrifact(const Tensor & self, bool pivot=true) const = 0; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot=true) const = 0; + virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const = 0; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator=nullptr) const = 0; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator=nullptr) const = 0; + virtual Tensor & random_(Tensor & self, Generator * generator=nullptr) const = 0; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const = 0; + virtual Tensor & uniform_(Tensor & self, double from=0, double to=1, Generator * generator=nullptr) const = 0; + virtual Tensor & normal_(Tensor & self, double mean=0, double std=1, Generator * generator=nullptr) const = 0; + virtual Tensor & cauchy_(Tensor & self, double median=0, double sigma=1, Generator * generator=nullptr) const = 0; + virtual Tensor & log_normal_(Tensor & self, double mean=1, double std=2, Generator * generator=nullptr) const = 0; + virtual Tensor & exponential_(Tensor & self, double lambd=1, Generator * generator=nullptr) const = 0; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride={}) const = 0; + virtual Tensor tensor(IntList size, IntList stride) const = 0; + virtual Tensor abs(const Tensor & self) const = 0; + virtual Tensor & abs_(Tensor & self) const = 0; + virtual Tensor acos(const Tensor & self) const = 0; + virtual Tensor & acos_(Tensor & self) const = 0; + virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) 
const = 0; + virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; + virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step=1) const = 0); + AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0); + virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmax(const Tensor & self) const = 0; + virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmin(const Tensor & self) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor asin(const Tensor & self) const = 0; + virtual Tensor & asin_(Tensor & self) const = 0; + virtual Tensor atan(const Tensor & self) const = 0; + virtual Tensor & atan_(Tensor & self) const = 0; + virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; + virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor bernoulli(const Tensor & self) const = 0; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; + virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor & bernoulli_(Tensor & self) const = 0; + virtual Tensor bincount(const Tensor & self, const Tensor & weights={}, int64_t minlength=0) const = 0; + virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor ceil(const Tensor & self) const = 0; + virtual Tensor & ceil_(Tensor & self) const = 0; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0) const = 0; + virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor clamp_max(const Tensor & self, Scalar max) const = 0; + virtual Tensor & clamp_max_(Tensor & self, Scalar max) const = 0; + virtual Tensor clamp_min(const Tensor & self, Scalar min) const = 0; + virtual Tensor & clamp_min_(Tensor & self, Scalar min) const = 0; + virtual Tensor cos(const Tensor & self) const = 0; + virtual Tensor & cos_(Tensor & self) const = 0; + virtual Tensor cosh(const Tensor & self) const = 0; + virtual Tensor & cosh_(Tensor & self) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim) const = 0; + virtual Tensor det(const Tensor & self) const = 0; + 
virtual Tensor diagflat(const Tensor & self, int64_t offset=0) const = 0; + virtual Tensor diagonal(const Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const = 0; + virtual Tensor div(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & div_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor div(const Tensor & self, Scalar other) const = 0; + virtual Tensor & div_(Tensor & self, Scalar other) const = 0; + virtual Tensor dot(const Tensor & self, const Tensor & tensor) const = 0; + AT_DEPRECATED(virtual Tensor empty(IntList size) const = 0); + virtual Tensor erf(const Tensor & self) const = 0; + virtual Tensor & erf_(Tensor & self) const = 0; + virtual Tensor erfc(const Tensor & self) const = 0; + virtual Tensor & erfc_(Tensor & self) const = 0; + virtual Tensor exp(const Tensor & self) const = 0; + virtual Tensor & exp_(Tensor & self) const = 0; + virtual Tensor expm1(const Tensor & self) const = 0; + virtual Tensor & expm1_(Tensor & self) const = 0; + virtual Tensor expand(const Tensor & self, IntList size, bool implicit=false) const = 0; + virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m=-1) const = 0); + virtual Tensor flatten(const Tensor & self, int64_t start_dim=0, int64_t end_dim=-1) const = 0; + virtual Tensor & fill_(Tensor & self, Scalar value) const = 0; + virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0; + virtual Tensor floor(const Tensor & self) const = 0; + virtual Tensor & floor_(Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0); + virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0; + virtual std::tuple gesv(const Tensor & self, const Tensor & A) const = 0; + virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; + virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; + virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true) const = 0; + virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const = 0; + virtual Tensor index(const Tensor & self, TensorList indices) const = 0; + virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor inverse(const Tensor & self) const = 0; + virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; + virtual bool is_cuda(const Tensor & self) const = 0; + virtual bool is_distributed(const Tensor & self) const = 0; + virtual bool is_floating_point(const Tensor & self) const = 0; + virtual bool is_complex(const Tensor & self) const = 0; + virtual bool is_nonzero(const Tensor & self) const = 0; + virtual bool is_same_size(const Tensor & self, const Tensor & other) const = 0; + virtual bool is_signed(const Tensor & self) const = 0; + virtual bool is_sparse(const Tensor & self) const = 0; + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const = 0; + AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, 
int64_t steps=100) const = 0); + virtual Tensor log(const Tensor & self) const = 0; + virtual Tensor & log_(Tensor & self) const = 0; + virtual Tensor log10(const Tensor & self) const = 0; + virtual Tensor & log10_(Tensor & self) const = 0; + virtual Tensor log1p(const Tensor & self) const = 0; + virtual Tensor & log1p_(Tensor & self) const = 0; + virtual Tensor log2(const Tensor & self) const = 0; + virtual Tensor & log2_(Tensor & self) const = 0; + virtual Tensor logdet(const Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps=100) const = 0); + virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0; + virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor matrix_power(const Tensor & self, int64_t n) const = 0; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mean(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mm(const Tensor & self, const Tensor & mat2) const = 0; + virtual std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false) const = 0; + virtual Tensor mul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & mul_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor mul(const Tensor & self, Scalar other) const = 0; + virtual Tensor & mul_(Tensor & self, Scalar other) const = 0; + virtual Tensor mv(const Tensor & self, const Tensor & vec) const = 0; + virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; + virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; + virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; + AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0); + virtual Tensor permute(const Tensor & self, IntList dims) const = 0; + virtual Tensor pin_memory(const Tensor & self) const = 0; + virtual Tensor pinverse(const Tensor & self, double rcond=1e-15) const = 0; + AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step=1) const = 0); + virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0; + virtual Tensor reshape(const Tensor & self, IntList shape) const = 0; + 
virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor round(const Tensor & self) const = 0; + virtual Tensor & round_(Tensor & self) const = 0; + virtual Tensor relu(const Tensor & self) const = 0; + virtual Tensor & relu_(Tensor & self) const = 0; + virtual Tensor hardshrink(const Tensor & self, Scalar lambd=0.5) const = 0; + virtual Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, Scalar lambd) const = 0; + virtual Tensor rsqrt(const Tensor & self) const = 0; + virtual Tensor & rsqrt_(Tensor & self) const = 0; + virtual Tensor select(const Tensor & self, int64_t dim, int64_t index) const = 0; + virtual Tensor sigmoid(const Tensor & self) const = 0; + virtual Tensor & sigmoid_(Tensor & self) const = 0; + virtual Tensor sin(const Tensor & self) const = 0; + virtual Tensor & sin_(Tensor & self) const = 0; + virtual Tensor sinh(const Tensor & self) const = 0; + virtual Tensor & sinh_(Tensor & self) const = 0; + virtual Tensor detach(const Tensor & self) const = 0; + virtual Tensor & detach_(Tensor & self) const = 0; + virtual int64_t size(const Tensor & self, int64_t dim) const = 0; + virtual Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const = 0; + virtual std::tuple slogdet(const Tensor & self) const = 0; + virtual Tensor smm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor softmax(const Tensor & self, int64_t dim) const = 0; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0) const = 0; + virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim=0) const = 0; + virtual Tensor squeeze(const Tensor & self) const = 0; + virtual Tensor squeeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & squeeze_(Tensor & self) const = 0; + virtual Tensor & squeeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const = 0; + virtual int64_t stride(const Tensor & self, int64_t dim) const = 0; + virtual Tensor sum(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim=false) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, ScalarType dtype) const = 0; + virtual Tensor sqrt(const Tensor & self) const = 0; + virtual Tensor & sqrt_(Tensor & self) const = 0; + virtual Tensor std(const Tensor & self, bool unbiased=true) const = 0; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor prod(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor t(const Tensor & self) const = 0; + virtual Tensor & t_(Tensor & self) const = 0; + virtual Tensor tan(const Tensor & self) const 
= 0; + virtual Tensor & tan_(Tensor & self) const = 0; + virtual Tensor tanh(const Tensor & self) const = 0; + virtual Tensor & tanh_(Tensor & self) const = 0; + virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor flip(const Tensor & self, IntList dims) const = 0; + virtual Tensor rot90(const Tensor & self, int64_t k=1, IntList dims={0,1}) const = 0; + virtual Tensor trunc(const Tensor & self) const = 0; + virtual Tensor & trunc_(Tensor & self) const = 0; + virtual Tensor type_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & unsqueeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor var(const Tensor & self, bool unbiased=true) const = 0; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0); + virtual Tensor norm(const Tensor & self, Scalar p=2) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor clone(const Tensor & self) const = 0; + virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const = 0; + virtual Tensor pow(const Tensor & self, Scalar exponent) const = 0; + virtual Tensor & zero_(Tensor & self) const = 0; + virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor tensor() const = 0; + virtual Tensor tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor sparse_coo_tensor(IntList size) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor _native_sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor sparse_mask(const Tensor & self, SparseTensorRef mask) const = 0; + virtual Tensor to_dense(const Tensor & self) const = 0; + virtual int64_t _sparseDims(const Tensor & self) const = 0; + virtual int64_t _denseDims(const Tensor & self) const = 0; + virtual int64_t _nnz(const Tensor & self) 
const = 0; + virtual Tensor coalesce(const Tensor & self) const = 0; + virtual bool is_coalesced(const Tensor & self) const = 0; + virtual Tensor _indices(const Tensor & self) const = 0; + virtual Tensor _values(const Tensor & self) const = 0; + virtual int64_t numel(const Tensor & self) const = 0; + virtual std::vector unbind(const Tensor & self, int64_t dim=0) const = 0; + virtual int64_t get_device(const Tensor & self) const = 0; + virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking=false) const = 0; + virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking=false) const = 0; + virtual Tensor to(const Tensor & self, Device device, bool non_blocking=false) const = 0; + virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking=false) const = 0; + virtual Scalar _local_scalar(const Tensor & self) const = 0; +protected: + TensorTypeId type_id_; + bool is_variable_; + bool is_undefined_; + +}; + +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); +} + +} // namespace at diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 5497a752dc4fb4..f0827077827a25 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -1,5 +1,6 @@ import argparse import os +import filecmp import yaml from collections import OrderedDict @@ -40,19 +41,23 @@ parser.add_argument( '-d', '--install_dir', help='output directory', default='ATen') options = parser.parse_args() +core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None if options.install_dir is not None and not os.path.exists(options.install_dir): os.makedirs(options.install_dir) +if core_install_dir is not None and not os.path.exists(core_install_dir): + os.makedirs(core_install_dir) class FileManager(object): - def __init__(self): + def __init__(self, prefix=None): + self.install_dir = '{}/{}'.format(options.install_dir, prefix) if prefix else options.install_dir self.filenames = set() self.outputs_written = False self.undeclared_files = [] def will_write(self, filename): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if self.outputs_written: raise Exception("'will_write' can only be called before " + "the call to write_outputs, refactor so outputs are registered " + @@ -78,7 +83,7 @@ def write_outputs(self, filename): self.outputs_written = True def write(self, filename, s, env=None): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if isinstance(s, CodeTemplate): assert env is not None env['generated_comment'] = "@" + "generated by aten/src/ATen/gen.py" @@ -128,6 +133,7 @@ def check_all_files_written(self): context->registerType(Backend::${backend}, ScalarType::${scalar_type}, new ${type_name}()); """) +core_file_manager = FileManager('core_tmp') file_manager = FileManager() cuda_file_manager = FileManager() @@ -332,9 +338,11 @@ def iterate_types(): # so that the script runs quickly when we are just querying the # outputs def declare_outputs(): - files = ['Declarations.yaml', 'Type.h', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', - 
'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h', + core_files = ['Type.h', 'Tensor.h', 'TensorMethods.h'] + for f in core_files: + core_file_manager.will_write(f) + files = ['Declarations.yaml', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', + 'Functions.h', 'CPUCopy.cpp', 'NativeFunctions.h', 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: file_manager.will_write(f) @@ -401,7 +409,15 @@ def generate_outputs(): all_types.append(generate_storage_type_and_tensor( backend, density, scalar_type, declarations)) - file_manager.write('Type.h', TYPE_H, top_env) + core_files = { + 'Type.h': TYPE_H, + 'Tensor.h': TENSOR_H, + 'TensorMethods.h': TENSOR_METHODS_H + } + + for core_file, core_template_file in core_files.items(): + core_file_manager.write(core_file, core_template_file, top_env) + file_manager.write('TypeExtendedInterface.h', TYPE_EXTENDED_INTERFACE_H, top_env) file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) @@ -412,8 +428,6 @@ def generate_outputs(): cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) - file_manager.write('Tensor.h', TENSOR_H, top_env) - file_manager.write('TensorMethods.h', TENSOR_METHODS_H, top_env) file_manager.write('Functions.h', FUNCTIONS_H, top_env) file_manager.write('CPUCopy.cpp', copy_wrapper.create(all_types, 'CPU')) @@ -423,10 +437,25 @@ def generate_outputs(): file_manager.check_all_files_written() cuda_file_manager.check_all_files_written() + # check that generated files match source files + core_source_path = os.path.join(options.source_path, 'core') + match, mismatch, errors = filecmp.cmpfiles(core_install_dir, core_source_path, core_files.keys(), shallow=False) + if errors: + raise RuntimeError("Error while trying to compare source and generated files for {}. " + "Source directory: {}. Generated directory: {}." + .format(errors, core_source_path, core_install_dir)) + if mismatch: + file_component = '{}'.format(','.join(mismatch)) + if len(mismatch) > 1: + file_component = '{' + file_component + '}' + update_cmd = "cp {}/{} {}".format(core_install_dir, file_component, core_source_path) + raise RuntimeError("Source files: {} did not match generated files. 
To update the source files, " + "run \"{}\"".format(mismatch, update_cmd)) declare_outputs() if options.output_dependencies is not None: file_manager.write_outputs(options.output_dependencies) + core_file_manager.write_outputs(options.output_dependencies + "-core") cuda_file_manager.write_outputs(options.output_dependencies + "-cuda") else: generate_outputs() diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 82a2f00ff77bc7..c6355127734b1b 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index aab8acd86a6ba5..50f6bb83bb7edb 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/core/Scalar.h" diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index ec86db721b9f04..54fd80c5f13463 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -1,11 +1,9 @@ #pragma once -// ${generated_comment} - -#include "ATen/Tensor.h" +#include "ATen/core/Tensor.h" #include "ATen/core/Scalar.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Type.h" +#include "ATen/core/Type.h" #include "ATen/core/TensorOptions.h" namespace at { diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 183b2d4d498026..18fb393aeca759 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/ATenGeneral.h" #include "ATen/core/Allocator.h" #include "ATen/core/Deprecated.h" @@ -10,7 +8,7 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Tensor.h" +#include "ATen/core/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index ff838c58889e45..7d9a18eda18489 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -167,12 +167,20 @@ if (NOT BUILD_ATEN_MOBILE) file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*") + # these are files that are generated by the script and checked in -- the script checks + # that they are equivalent so it must be a dependency of the script + set(core_gen_checked_inputs + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Type.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Tensor.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/TensorMethods.h) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core_tmp) add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} COMMAND ${GEN_COMMAND} --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen - DEPENDS ${all_python} ${all_templates} ${cwrap_files}) + DEPENDS ${all_python} ${all_templates} ${cwrap_files} ${core_gen_checked_inputs}) # Generated headers used from a CUDA (.cu) file are # not tracked correctly in CMake. 
We make the libATen.so depend explicitly From a3036b3bb3b559c7f712b31d5f79b366e58cc003 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Wed, 12 Sep 2018 13:46:47 -0700 Subject: [PATCH 086/237] Fused weightnorm for ATen (#10842) Summary: This PR contains a C++ implementation of weight norm. The user-side exposure of weight norm through torch.nn.utils.weight_norm is unchanged. If running on the GPU, and the norm is requested over the first or last dimension of the weight tensor, the forward pass is carried out using the fused kernels I wrote for our Fairseq GTC hero run, which offer superior performance to primitive ops and superior numerical stability when running in FP16. In the common case that the backward pass is not itself constructing a graph (ie not attempting to set up double backward) the backward pass will be carried out using another fused kernel. If the backward pass is constructing a graph, an alternate code path is taken, which does the math using differentiable primitive ops. In this way, the implementation allows double backward, even if the fused kernel was used in forward (although in this case, you don't benefit from the performance and stability of the fused backward kernel). If running on the CPU, or if norming over an interior dim, the forward pass is carried out using double-differentiable primitive ops. Figuring out how to generate all the right plumbing for this was tricky, but it was a fun experience learning how the autogenerator works and how the graph is constructed. Thanks to colesbury for useful guidance on this front. I do have a few lingering questions: - Should I unify my return statements (ie by default-constructing Tensors outside if blocks and using operator= within)? - What is the significance of `non_blocking` when calling e.g. `auto norms = saved_norms.to(saved_g.type().scalarType(), non_blocking=True/False);`? I am currently omitting `non_blocking`, so it defaults to False, but I didn't see any associated synchronizes on the timeline, so I'm wondering what it means. - Is there an "official" mapping from at::ScalarTypes to corresponding accumulate types, as there are for the PODs + Half in [AccumulateType.h](https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/AccumulateType.h)? I looked for an equivalent mapping for ScalarTypes, didn't find one, and ended up rigging it myself (` at::ScalarType AccType = g.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float : g.type().scalarType();`). - Are sparse tensors a concern? Should I include another check for sparse tensors in the `_weight_norm` entry point, and send those along the fallback CPU path as well? 
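For reference (not part of this diff), the unchanged user-side API can be exercised as in the minimal sketch below; whether the fused CUDA kernels or the differentiable primitive-op fallback is used is decided internally from the device and the requested `dim`:

```python
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm

# Reparameterizes `weight` into `weight_g` (magnitude) and `weight_v` (direction).
# On a CUDA tensor with dim=0 (or the last dim), forward should take the fused path.
layer = weight_norm(nn.Linear(20, 40), name='weight', dim=0)
if torch.cuda.is_available():
    layer = layer.cuda()

x = torch.randn(8, 20, device=layer.weight_v.device)
out = layer(x)
# A plain backward (create_graph=False) should use the fused backward kernel on GPU;
# backward with create_graph=True falls back to the differentiable primitive ops.
out.sum().backward()
```
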
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10842 Differential Revision: D9735531 Pulled By: ezyang fbshipit-source-id: 24431d46532cf5503876b3bd450d5ca775b3eaee --- aten/src/ATen/native/WeightNorm.cpp | 117 +++++ aten/src/ATen/native/cuda/WeightNorm.cu | 502 +++++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 21 + tools/autograd/derivatives.yaml | 6 + torch/nn/utils/weight_norm.py | 19 +- torch/onnx/symbolic.py | 5 + 6 files changed, 654 insertions(+), 16 deletions(-) create mode 100644 aten/src/ATen/native/WeightNorm.cpp create mode 100644 aten/src/ATen/native/cuda/WeightNorm.cu diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp new file mode 100644 index 00000000000000..1627b4c2596e12 --- /dev/null +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -0,0 +1,117 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace at { +namespace native { + +// Staying faithful to the Python for now for clarity, look for optimizations later +// (e.g., single return statement for RVO) +Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) +{ + // I assume tensor.contiguous(), view(), norm(), etc. here will dispatch through VariableType. + if (dim == -1) { + return v.norm(pow); + } else if (dim == 0) { + std::vector output_size(v.dim(), 1); + output_size[0] = v.size(0); + return v.contiguous().view({v.size(0), -1}).norm(pow, 1).view(output_size); + } else if (dim == v.dim() - 1) { + std::vector output_size(v.dim(), 1); + output_size[v.dim() - 1] = v.size(v.dim() - 1); + return v.contiguous().view({-1, v.size(v.dim() - 1)}).norm(pow, 0).view(output_size); + } else { + // To consider: at::native::norm_except_dim is probably fine as well, + // and would avoid an additional dynamic dispatch. + return at::norm_except_dim(v.transpose(0, dim), pow, 0).transpose(0, dim); // optimize? + } +} + +Tensor _weight_norm + (const Tensor & v_in, + const Tensor & g_in, + int64_t dim) +{ + + AT_CHECK( + v_in.device() == g_in.device(), + "weight_norm: expected v_in and g_in to be on the same device, but v_in is " + "on ", v_in.device(), " and g_in is on ", g_in.device()); + + auto v = v_in.contiguous(); + auto g = g_in.contiguous(); + + bool can_use_fused = v.type().is_cuda() && (dim == 0 || dim == v.dim() - 1); + + if (can_use_fused) { + // weight_norm does not have a derivative defined for it, so this will route back through + // VariableType.cpp, and construct a WeightNormFusedBackward object in the autograd graph. + return std::get<0>(at::_weight_norm_cuda_interface(v, g, dim)); + } else { + // Double-differentiable primitive ops + // at::native::norm_except_dim would probably be fine as well. + return v*(g/at::norm_except_dim(v, 2, dim)); + } +} + +// Differentiable backward path, an alternative to weight_norm_cuda_backward, to be used +// when backward is itself creating a graph. +// The GradMode::is_enabled() check must be performed within Functions.cpp; that's why we +// define a separate function here, instead of inlining it in weight_norm_cuda_backward. +std::tuple _weight_norm_differentiable_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // In Functions.cpp, the HardshrinkBackward object supplies "grad.contiguous()" + // as the first argument, so grad_w should be contiguous here. 
+ // All these checks should succeed: + AT_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + + int64_t last_dim = saved_v.dim() - 1; + int64_t last_size = saved_v.size(last_dim); + + // Like weight_norm_fused_backward, weight_norm_differentiable_backward should only ever be called + // through a WeightNormFusedBackward object, so we expect that dim == 0 || dim == saved_v.size(-1) + AT_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); + + // saved_g and saved_norms are already shaped to broadcast over the correct dimensions + + // ...but saved_norms might be Float when saved_g and saved_v are half. + // To consider: saved_norms.to(..., True /*non_blocking*/); + auto norms = saved_norms.to(saved_g.type().scalarType()); + + std::vector bcast_size(saved_v.dim(), 1); + + // Analytic backward path using differentiable primitive ops + if (dim == 0) { + bcast_size[0] = saved_v.size(0); + auto per_dim_sums = (grad_w*saved_v).view({saved_v.size(0), -1}).sum(1).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } else { // dim == last_dim + bcast_size[last_dim] = last_size; + auto per_dim_sums = (grad_w*saved_v).view({-1, last_size}).sum(0).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu new file mode 100644 index 00000000000000..67d8f39e2de71d --- /dev/null +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -0,0 +1,502 @@ +#include "ATen/ATen.h" +#include "ATen/AccumulateType.h" +#include "ATen/TensorUtils.h" +#include "ATen/core/Error.h" + +#include "ATen/cuda/CUDAContext.h" +#include +#include + +namespace at { +namespace native { +namespace { + +// Block size for weight_norm_*_first_dim_kernel. +// Currently, kernels are non-persistent. +// Dialing up the block size to, say 1024, can improve performance by +// increase the amount of cache available per block, which can improve cache hit rate. +// However, this is less efficient for short rows. 256 is pretty versatile. +// May be worth implementing heuristics later. +#define BLOCK 256 + +// Block size for weight_norm_*_last_dim_kernel. +// This is tricker than the first_dim case because we must make blocks +// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses. +// Since output-element parallelism is along the fast dimension, this reduces the number of +// blocks we can launch by 16X. +#define TILE_W 16 +// Somewhat versatile strategy: max out intra-block parallelism by extending +// blocks across the slow dimension up to the hardware-max block size of 1024. +#define TILE_H 64 + +template +__device__ __forceinline__ void reduce_block_into_lanes + (T *x, + T val, + int lanes, // lanes is intended to be <= 32. + ReduceOp reduceOp) +{ + int tid = threadIdx.x + threadIdx.y*blockDim.x; + int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32. 
+ + if(blockSize >= 64) + { + x[tid] = val; + __syncthreads(); + } + + #pragma unroll + for(int i = (blockSize >> 1); i >= 64; i >>= 1) + { + if(tid < i) + x[tid] = reduceOp(x[tid], x[tid+i]); + __syncthreads(); + } + + if(tid < 32) + { + T final; + if(blockSize >= 64) + final = reduceOp(x[tid], x[tid+32]); + else + final = val; + // __SYNCWARP(); + + #pragma unroll + for(int i = 16; i >= lanes; i >>= 1) + final = reduceOp(final, WARP_SHFL_DOWN(final, i)); + + if(tid < lanes) + x[tid] = final; // EpilogueOp + } + + // Make sure the smem result is visible to all warps. + __syncthreads(); +} + +template + +__global__ void weight_norm_fwd_first_dim_kernel + (scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int rowSize) +{ + // We are norming each slowest-dim row of the tensor separately. + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + result = sqrtf(result); + + if(tid == 0) + norms[row] = result; + + // Broadcast load, could use shared memory instead. + accscalar_t g_this_row = scalar_cast(g[row]); + + accscalar_t rnorm = 1.f/result; // for consistency with backward kernel + + // Write data to output + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + w[i+rowStart] = scalar_cast(g_this_row*val_f*rnorm); + } +} + +template + +__global__ void weight_norm_fwd_last_dim_kernel +( + scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int fast_dim_size, + const int slower_dims_size +) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* alloc = (accscalar_t*)buf; + accscalar_t* s = &alloc[0]; + accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y]; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + + // Better to pass an EpilogueOp to reduce_block_into_lanes? 
+ if(threadIdx.y == 0) + { + accscalar_t result = s[threadIdx.x]; + accscalar_t norm_this_col = sqrtf(result); + norms[fast_dim_location] = norm_this_col; + rnorms_this_block[threadIdx.x] = 1.f/norm_this_col; + } + + __syncthreads(); + + accscalar_t g_this_col = scalar_cast(g[fast_dim_location]); + accscalar_t rnorm = rnorms_this_block[threadIdx.x]; + + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + w[currentIdx] = scalar_cast(g_this_col*val_f*rnorm); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +template + +__global__ void weight_norm_bwd_first_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int rowSize) +{ + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t grad_wi = scalar_cast(grad_w[i+rowStart]); + accscalar_t saved_vi = scalar_cast(saved_v[i+rowStart]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + // Could choose to save reciprocal of norm instead I suppose, but norms is probably + // more handy to keep around. + // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[row]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(tid == 0) + grad_g[row] = scalar_cast(result*rnorm); + + // Broadcast load, could use shared memory instead. + accscalar_t g_this_row = scalar_cast(saved_g[row]); + + // Write v gradients. We are reusing values that were loaded earlier, so there + // is an optimization opportunity here (store values persistently). 
+ for(int j = tid; j < rowSize; j += stride ) + { + accscalar_t grad_wj = scalar_cast(grad_w[j+rowStart]); + accscalar_t saved_vj = scalar_cast(saved_v[j+rowStart]); + accscalar_t grad_vj = g_this_row*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[j+rowStart] = scalar_cast(grad_vj); + } +} + +template + +__global__ void weight_norm_bwd_last_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int fast_dim_size, + const int slower_dims_size) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wi = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vi = scalar_cast(saved_v[currentIdx]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + accscalar_t result = s[threadIdx.x]; + + // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[fast_dim_location]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(threadIdx.y == 0) + grad_g[fast_dim_location] = scalar_cast(result*rnorm); + + // Entire block pulls these values, could use shared memory instead. + accscalar_t g_this_col = scalar_cast(saved_g[fast_dim_location]); + + // Write v gradients. + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wj = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vj = scalar_cast(saved_v[currentIdx]); + accscalar_t grad_vj = g_this_col*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[currentIdx] = scalar_cast(grad_vj); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +} // anonymous namespace + +std::tuple weight_norm_cuda + (const Tensor & v, + const Tensor & g, + int64_t dim) +{ + auto w = at::empty_like(v); + + // weight_norm_fused does have a derivative defined in derivatives.yaml, therefore, VariableType.cpp + // sends the unpacked g.data() as the argument. In other words, we expect "g" is a bare Tensor here. + + // norms is only needed to stash for backward. + // g.type().scalarType() may be at::ScalarType::Double, Float, or Half. + // If Half, stash norms as float. + at::ScalarType AccType = g.type().scalarType() == at::ScalarType::Half ? + at::ScalarType::Float : g.type().scalarType(); + // Will this create norms on the same device as g, regardless of what the thread's default + // current device is? I believe so, because Type::* functions are DeviceGuard()ed. 
+ auto norms = g.type().toScalarType(AccType).tensor(g.sizes(), g.strides()); + + const int ndims = v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_first_dim_kernel + <<>> + (w.data(), + norms.data(), + v.data(), + g.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size + int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= v.size(i); + + int fast_dim_size = v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (w.data(), + norms.data(), + v.data(), + g.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{w, norms}; +} + +std::tuple weight_norm_cuda_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // These checks should always succeed, because weight_norm_fused_backward should only + // ever be recorded in the autograd graph via weight_norm, which passes contiguous v and g. + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + AT_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") + + auto grad_v = at::empty_like(saved_v); + auto grad_g = at::empty_like(saved_g); + + const int ndims = saved_v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= saved_v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_first_dim_kernel + <<>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array. 
+ int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= saved_v.size(i); + + int fast_dim_size = saved_v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{grad_v, grad_g}; +} + +#undef BLOCK +#undef TILE_W +#undef TILE_H + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index cdd7a2655473f1..af58e45ebe0ebd 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1746,6 +1746,27 @@ CPU: _s_where_cpu CUDA: _s_where_cuda +- func: norm_except_dim(Tensor v, int64_t pow=2, int64_t dim=0) -> Tensor + variants: function + +# VariableType::_weight_norm does not want to be given a gap in the autograd graph, +# so we don't define "dispatch" variants for it. +- func: _weight_norm(Tensor v, Tensor g, int64_t dim=0) -> Tensor + variants: function + +- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int64_t dim=0) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda + +- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda_backward + +- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + - func: zeros(IntList size, TensorOptions options={}) -> Tensor - func: zeros_out(Tensor result, IntList size) -> Tensor diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 8accdb97427b27..caec9575ef9cca 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -762,6 +762,12 @@ self: where(condition, grad, zeros_like(grad)) other: where(condition, zeros_like(grad), grad) +# weight_norm_cuda_interface_backward does not have an explicitly defined derivative, so if we do happen +# to be running backward with create_graph=True, fall back to a backward function that uses +# differentiable ops. +- name: _weight_norm_cuda_interface(Tensor v, Tensor g, int64_t dim) + v, g: "GradMode::is_enabled() ? 
_weight_norm_differentiable_backward(grad.contiguous(), v, g, result1, dim) : _weight_norm_cuda_interface_backward(grad.contiguous(), v, g, result1, dim)" + - name: zero_(Tensor self) self: zeros_like(grad) diff --git a/torch/nn/utils/weight_norm.py b/torch/nn/utils/weight_norm.py index abc8b63cb105b9..ddefa3d6f5f544 100644 --- a/torch/nn/utils/weight_norm.py +++ b/torch/nn/utils/weight_norm.py @@ -2,20 +2,7 @@ Weight Normalization from https://arxiv.org/abs/1602.07868 """ from torch.nn.parameter import Parameter - - -def _norm(p, dim): - """Computes the norm over all dimensions except dim""" - if dim is None: - return p.norm() - elif dim == 0: - output_size = (p.size(0),) + (1,) * (p.dim() - 1) - return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) - elif dim == p.dim() - 1: - output_size = (1,) * (p.dim() - 1) + (p.size(-1),) - return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) - else: - return _norm(p.transpose(0, dim), 0).transpose(0, dim) +from torch import _weight_norm, norm_except_dim class WeightNorm(object): @@ -26,7 +13,7 @@ def __init__(self, name, dim): def compute_weight(self, module): g = getattr(module, self.name + '_g') v = getattr(module, self.name + '_v') - return v * (g / _norm(v, self.dim)) + return _weight_norm(v, g, self.dim) @staticmethod def apply(module, name, dim): @@ -38,7 +25,7 @@ def apply(module, name, dim): del module._parameters[name] # add g and v as new parameters and express w as g/||v|| * v - module.register_parameter(name + '_g', Parameter(_norm(weight, dim).data)) + module.register_parameter(name + '_g', Parameter(norm_except_dim(weight, 2, dim).data)) module.register_parameter(name + '_v', Parameter(weight.data)) setattr(module, name, fn.compute_weight(module)) diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 53d4f769a4e56f..d9c5964109a0c4 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -721,6 +721,11 @@ def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) +@parse_args('v', 'v', 'i') +def _weight_norm(graph, v, g, dim): + return graph.op("ATen", v, g, dim_i=dim, operator_s="_weight_norm") + + @parse_args('v', 't', 't', 't') def elu(g, input, alpha, scale, input_scale): if scale and scale != 1.: From 504126e7050d9c29f4d1ef5bac13399fe50f2730 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 12 Sep 2018 13:57:55 -0700 Subject: [PATCH 087/237] Documentation for debugging JIT Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11540 Differential Revision: D9798647 Pulled By: jamesr66a fbshipit-source-id: 968a4af22c735a848fa27cbadaed9b7023ba8276 --- docs/source/jit.rst | 248 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 244 insertions(+), 4 deletions(-) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 8bc4ce6e336fc9..005c09fa693624 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -535,13 +535,253 @@ Python-defined Constants Debugging ~~~~~~~~~ -Print things +Disable JIT for Debugging + If you want to disable all JIT modes (tracing and scripting) so you can + debug your program in raw Python, you can use the ``PYTORCH_JIT`` environment + variable. ``PYTORCH_JIT`` can be used to globally disable the + JIT by setting its value to ``0``. 
Given an example script:: -Use ``USE_PYTHON=0`` to debug in normal python mode + @torch.jit.script + def scripted_fn(x : torch.Tensor): + for i in range(12): + x = x + x + return x + + + def fn(x): + x = torch.neg(x) + import pdb; pdb.set_trace() + return scripted_fn(x) + + traced_fn = torch.jit.trace(fn, (torch.rand(4, 5),)) + + traced_fn(torch.rand(3, 4)) -Look at the graph + Debugging this script with PDB works except for when we invoke the @script + function. We can globally disable JIT, so that we can call the @script + function as a normal python function and not compile it. If the above script + is called ``disable_jit_example.py``, we can invoke it like so:: -Pay attention to tracer warnings + $ PYTORCH_JIT=0 python disable_jit_example.py + + and we will be able to step into the @script function as a normal Python + function. + + +Interpreting Graphs + TorchScript uses a static single assignment (SSA) intermediate representation + (IR) to represent computation. The instructions in this format consist of + ATen (the C++ backend of PyTorch) operators and other primitive operators, + including control flow operators for loops and conditionals. As an example:: + + @torch.jit.script + def foo(len): + # type: (int) -> torch.Tensor + rv = torch.zeros(3, 4) + for i in range(len): + if i < 10: + rv = rv - 1.0 + else: + rv = rv + 1.0 + return rv + + print(foo.graph) + + A ``ScriptModule`` with a single ``forward`` method will have an attribute + ``graph``, which you can use to inspect the IR representing the computation. + If the ScriptModule has more than one method, you will need to access + ``.graph`` on the method itself and not the module. We can inspect the + graph of a method named ``bar`` on a ScriptModule by accessing ``.bar.graph``. + + The example script above produces the graph:: + + graph(%len : int) { + %13 : float = prim::Constant[value=1]() + %10 : int = prim::Constant[value=10]() + %2 : int = prim::Constant[value=4]() + %1 : int = prim::Constant[value=3]() + %3 : int[] = prim::ListConstruct(%1, %2) + %4 : int = prim::Constant[value=6]() + %5 : int = prim::Constant[value=0]() + %6 : int[] = prim::Constant[value=[0, -1]]() + %rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6) + %8 : int = prim::Constant[value=1]() + %rv : Dynamic = prim::Loop(%len, %8, %rv.1) + block0(%i : int, %12 : Dynamic) { + %11 : int = aten::lt(%i, %10) + %rv.4 : Dynamic = prim::If(%11) + block0() { + %14 : int = prim::Constant[value=1]() + %rv.2 : Dynamic = aten::sub(%12, %13, %14) + -> (%rv.2) + } + block1() { + %16 : int = prim::Constant[value=1]() + %rv.3 : Dynamic = aten::add(%12, %13, %16) + -> (%rv.3) + } + %19 : int = prim::Constant[value=1]() + -> (%19, %rv.4) + } + return (%rv); + } + + Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for + example. ``%rv.1 : Dynamic`` means we assign the output to a (unique) + value named ``rv.1``, and that value is of ``Dynamic`` type, i.e. we do + not know its concrete shape. ``aten::zeros`` is the operator (equivalent + to ``torch.zeros``) and the input list ``(%3, %4, %5, %6)`` specifies which + values in scope should be passed as inputs. The schema for built-in functions + like ``aten::zeros`` can be found at `Builtin Functions`_. + + Notice that operators can also have associated ``blocks``, namely the + ``prim::Loop`` and ``prim::If`` operators. In the graph print-out, these + operators are formatted to reflect their equivalent source code forms + to facilitate easy debugging. 
+ + Graphs can be inspected as shown to confirm that the computation described + by a ``ScriptModule`` is correct, in both automated and manual fashion, as + described below. + + +Tracing Edge Cases + There are some edge cases that exist where the trace of a given Python + function/module will not be representative of the underlying code. These + cases can include: + + * Tracing of control flow that is dependent on inputs (e.g. tensor shapes) + * Tracing of in-place operations of tensor views (e.g. indexing on the + left-hand side of an assignment) + + Note that these cases may in fact be traceable in the future. + + +Automatic Trace Checking + One way to automatically catch many errors in traces is by using ``check_inputs`` + on the ``torch.jit.trace()`` API. ``check_inputs`` takes a list of tuples + of inputs that will be used to re-trace the computation and verify the + results. For example:: + + def loop_in_traced_fn(x): + result = x[0] + for i in range(x.size(0)): + result = result * x[i] + return result + + inputs = (torch.rand(3, 4, 5),) + check_inputs = [(torch.rand(4, 5, 6),), (torch.rand(2, 3, 4),)] + + traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs) + + Gives us the following diagnostic information:: + + ERROR: Graphs differed across invocations! + Graph diff: + graph(%0 : Dynamic) { + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=0]() + %3 : Dynamic = aten::select(%0, %1, %2) + %4 : int = prim::Constant[value=0]() + %5 : int = prim::Constant[value=0]() + %6 : Dynamic = aten::select(%0, %4, %5) + %7 : Dynamic = aten::mul(%3, %6) + %8 : int = prim::Constant[value=0]() + %9 : int = prim::Constant[value=1]() + %10 : Dynamic = aten::select(%0, %8, %9) + %11 : Dynamic = aten::mul(%7, %10) + %12 : int = prim::Constant[value=0]() + %13 : int = prim::Constant[value=2]() + %14 : Dynamic = aten::select(%0, %12, %13) + %15 : Dynamic = aten::mul(%11, %14) + + %16 : int = prim::Constant[value=0]() + + %17 : int = prim::Constant[value=3]() + + %18 : Dynamic = aten::select(%0, %16, %17) + + %19 : Dynamic = aten::mul(%15, %18) + - return (%15); + ? ^ + + return (%19); + ? ^ + } + + + This message indicates to us that the computation differed between when + we first traced it and when we traced it with the ``check_inputs``. Indeed, + the loop within the body of ``loop_in_traced_fn`` depends on the shape + of the input ``x``, and thus when we try another ``x`` with a different + shape, the trace differs. 
+ + In this case, data-dependent control flow like this can be captured using + script instead:: + + def fn(x): + result = x[0] + for i in range(x.size(0)): + result = result * x[i] + return result + + inputs = (torch.rand(3, 4, 5),) + check_inputs = [(torch.rand(4, 5, 6),), (torch.rand(2, 3, 4),)] + + scripted_fn = torch.jit.script(fn) + print(scripted_fn.graph) + + for input_tuple in [inputs] + check_inputs: + torch.testing.assert_allclose(fn(*input_tuple), scripted_fn(*input_tuple)) + + + Which produces:: + + graph(%x : Dynamic) { + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=0]() + %result.1 : Dynamic = aten::select(%x, %2, %1) + %4 : int = aten::size(%x, %1) + %5 : int = prim::Constant[value=1]() + %result : Dynamic = prim::Loop(%4, %5, %result.1) + block0(%i : int, %7 : Dynamic) { + %9 : int = prim::Constant[value=0]() + %10 : Dynamic = aten::select(%x, %9, %i) + %result.2 : Dynamic = aten::mul(%7, %10) + %12 : int = prim::Constant[value=1]() + -> (%12, %result.2) + } + return (%result); + } + + +Tracer Warnings + The tracer produces warnings for several problematic patterns in traced + computation. As an example, take a trace of a function that contains an + in-place assignment on a slice (a view) of a Tensor:: + + def fill_row_zero(x): + x[0] = torch.rand(*x.shape[1:2]) + return x + + traced = torch.jit.trace(fill_row_zero, (torch.rand(3, 4),)) + print(traced.graph) + + + Produces several warnings and a graph which simply returns the input:: + + fill_row_zero.py:4: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator copy_ (possibly due to an assignment). This might cause the trace to be incorrect, because all other views that also reference this data will not not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe. + x[0] = torch.rand(*x.shape[1:2]) + fill_row_zero.py:6: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + Not within tolerance rtol=1e-05 atol=1e-05 at input[0, 1] (0.09115803241729736 vs. 0.6782537698745728) and 3 other locations (33.00%) + traced = torch.jit.trace(fill_row_zero, (torch.rand(3, 4),)) + graph(%0 : Float(3, 4)) { + return (%0); + } + + We can fix this by modifying the code to not use the in-place update, but + rather build up the result tensor out-of-place with `torch.cat`:: + + def fill_row_zero(x): + x = torch.cat((torch.rand(1, *x.shape[1:2]), x[1:2]), dim=0) + return x + + traced = torch.jit.trace(fill_row_zero, (torch.rand(3, 4),)) + print(traced.graph) Builtin Functions From f0a440007e9e671d7ccf3123f6f209e74586ff8e Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 13:59:50 -0700 Subject: [PATCH 088/237] Explicitly set locale on docs build. (#11595) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11595 Differential Revision: D9798567 Pulled By: ezyang fbshipit-source-id: ac05458347e181960a07cacae1dfc68d2837451f --- .jenkins/pytorch/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 7d491e0fccb7ef..2dc64157c5d00d 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -118,7 +118,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here pip install -r requirements.txt || true - make html + LC_ALL=C make html popd fi From 958ba4e91383e00697e59862acf0f101b25a66c9 Mon Sep 17 00:00:00 2001 From: Lingyi Liu Date: Wed, 12 Sep 2018 14:24:16 -0700 Subject: [PATCH 089/237] Aibench for asr decoder Summary: as title Reviewed By: sf-wind Differential Revision: D9738021 fbshipit-source-id: 98f570484bca6486ad99207732efd534ec7e3251 --- modules/observers/perf_observer.cc | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc index ed391a3e3ff5dc..93ae7868e7f622 100644 --- a/modules/observers/perf_observer.cc +++ b/modules/observers/perf_observer.cc @@ -1,11 +1,42 @@ #include "observers/perf_observer.h" #include "observers/observer_config.h" +#if !CAFFE2_MOBILE +#include "caffe2/core/flags.h" +#include "observers/net_observer_reporter_print.h" +#endif #include #include "caffe2/core/common.h" #include "caffe2/core/init.h" #include "caffe2/core/operator.h" +#if !CAFFE2_MOBILE +CAFFE2_DEFINE_int64( + aiBench_netInitSampleRate, + 0, + "One in N sampling rate for net delay"); + +CAFFE2_DEFINE_int64( + aiBench_netFollowupSampleRate, + 0, + "One in N sampling rate for net delay"); + +CAFFE2_DEFINE_int64( + aiBench_netFollowupSampleCount, + 0, + "control the following c logs"); + +CAFFE2_DEFINE_int64( + aiBench_operatorNetSampleRatio, + 0, + "One in N sampling rate for operator delay"); + +CAFFE2_DEFINE_int64( + aiBench_skipIters, + 0, + "skip the first N iterations of the net run"); +#endif + namespace caffe2 { namespace { @@ -13,6 +44,20 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) { AddGlobalNetObserverCreator([](NetBase* subject) { return caffe2::make_unique(subject); }); + +#if !CAFFE2_MOBILE + // for aibench usage + caffe2::ObserverConfig::setReporter( + caffe2::make_unique()); + + caffe2::ObserverConfig::initSampleRate( + FLAGS_aiBench_netInitSampleRate, + FLAGS_aiBench_netFollowupSampleRate, + FLAGS_aiBench_netFollowupSampleCount, + FLAGS_aiBench_operatorNetSampleRatio, + FLAGS_aiBench_skipIters); +#endif + return true; } } // namespace From d4e05f4e1e276055cd3d3e1a2a1e186e6c6405ee Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Wed, 12 Sep 2018 14:39:23 -0700 Subject: [PATCH 090/237] Move function deletion from the stack to the heap. (#11534) Summary: This eliminates the need for any heuristics regarding stack size limits. 
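The approach taken in the patch below can be summarized by this simplified sketch, written against a hypothetical `Node` type rather than the actual autograd `Function`/`Edge` classes: instead of letting `shared_ptr` destructors recurse through the graph, children for which we hold the last reference are collected into a heap-allocated worklist and freed from a loop.

```cpp
#include <memory>
#include <vector>

// Hypothetical stand-in for an autograd node that owns its successors.
struct Node {
  std::vector<std::shared_ptr<Node>> next;
};

// Iterative deletion: gather children whose only remaining owner is the node
// being destroyed, free the current node, then drain the worklist without
// growing the call stack.
void delete_node(Node* node) {
  std::vector<std::shared_ptr<Node>> stack;
  for (auto& child : node->next) {
    if (child.use_count() == 1) {
      stack.emplace_back(std::move(child));
    }
  }
  delete node;

  while (!stack.empty()) {
    auto curr = std::move(stack.back());
    stack.pop_back();
    for (auto& child : curr->next) {
      if (child.use_count() == 1) {
        stack.emplace_back(std::move(child));
      }
    }
    // `curr` is destroyed at the end of this iteration; children that were
    // moved into the worklist are freed later by the loop, so destruction
    // never recurses deeply no matter how long the chain is.
  }
}
```
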
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11534 Differential Revision: D9779866 Pulled By: resistor fbshipit-source-id: 96753eead7904bbdc2869fb01f7bd42141032347 --- torch/csrc/autograd/function.cpp | 122 ++++++++++--------------------- torch/csrc/autograd/function.h | 2 - torch/csrc/autograd/init.cpp | 5 -- 3 files changed, 40 insertions(+), 89 deletions(-) diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index ba33a6f116bac2..6090b520bd36ab 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -36,95 +36,53 @@ AnomalyMetadata* Function::metadata() noexcept { return anomaly_metadata_.get(); } -/* - * Fix for #5534: prevent stack overflow on deletion of deep computation graph - * - * Sometimes one can end up with a very big computation graph of Functions - * and Edges. Each std::shared_ptr contains a list of Edge, and - * each Edge contains a std::shared_ptr. Deleting a - * std::shared_ptr can trigger the recursive deletion of other - * std::shared_ptr's: this can stack overflow if the graph - * is deep enough. Here is an example of such a graph: - * - * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr - * - * The solution here is to use a custom deleter with each - * std::shared_ptr. The custom deleter keeps track of how many - * nested deleters it is in. When this number exceeds the maximum allowed - * depth, the Function* to be deleted are accumulated in a per-thread - * delete queue and handled by one of the deleters. - * - * Note that these custom deleters are NOT necessary for deleting PyFunction. - * This is because a THPFunction Python object owns a PyFunction that is in a - * computation graph. When Python objects get recursively destroyed, they - * are also queued into a delete list. This happens very early for them - * (at 50 deleters): https://github.com/python/cpython/blob/f320be77ffb73e3b9e7fc98c37b8df3975d84b40/Include/object.h#L1024-L1063 - * so we don't need to worry about them. - */ - -thread_local std::deque deleteFunctionQueue; -thread_local size_t deleteFunctionRecursionDepth = 0; - -/* - * If this number is set too high, a deep computation graph can still - * stack overflow. The procedure for setting this number was to - * 1) find the smallest value that would not guard against stack overflows - * on various machines - * 2) Take the minimum of all such values and subtract some leeway because - * the memory of these stack frames will probably grow as time passes. - * Testing on a few machines machines, the magic numbers were: - * - Mac OSX (Macbook Pro 15) : ~60000 - * - A beefy Ubuntu 16.04 box : ~15000 - * - Windows AWS instance (g3.4xlarge): variable. My two attempts at different - * times have gotten the following numbers: ~8300, 3669 - */ -#ifdef _WIN32 -size_t deleteFunctionMaxRecursionDepth = 3000; -#else -size_t deleteFunctionMaxRecursionDepth = 10000; -#endif - -struct RecursionDepthCounter { - public: - explicit RecursionDepthCounter() { - ++deleteFunctionRecursionDepth; - } - ~RecursionDepthCounter() { - --deleteFunctionRecursionDepth; - } - - size_t value() { - return deleteFunctionRecursionDepth; +static void gatherFunctions(Function* func, + std::vector>& stack) { + for (auto& edge : func->next_edges()) { + if (edge.function.use_count() == 1) { + stack.emplace_back(std::move(edge.function)); + } } -}; +} /* - * Note that the custom deleter deletes in BFS style. Without using - * the custom deleter, the computation graph is deleted in a DFS style. 
- * The BFS deletion is valid (and safe) because if a shared_ptr - * 's reference count hits 0, nothing else will access it. - */ + * Fix for #5534: prevent stack overflow on deletion of deep computation graph + * + * Sometimes one can end up with a very big computation graph of Functions + * and Edges. Each std::shared_ptr contains a list of Edge, and + * each Edge contains a std::shared_ptr. Deleting a + * std::shared_ptr can trigger the recursive deletion of other + * std::shared_ptr's: this can stack overflow if the graph + * is deep enough. Here is an example of such a graph: + * + * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr + * + * The solution here is to detect when we are decrementing away the last + * reference to a Function, and when doing so to buffer up the Function's + * that will be recursively decremented. We can then decrement (and free) + * the original Function without causing a recursive cascade, before + * draining the buffer applying the same behavior. This is, in effect, + * converting recursion to a loop, using a heap buffer in place of the + * recursive call stack. + */ void deleteFunction(Function* function) { - RecursionDepthCounter recursion_depth; + // To avoid stack overflow on large computational graphs, + // we need to track reference decrementing and freeing + // on the heap. + std::vector> stack; + gatherFunctions(function, stack); + delete function; - if (recursion_depth.value() > deleteFunctionMaxRecursionDepth) { - deleteFunctionQueue.push_back(function); - return; - } + while (!stack.empty()) { + auto& curr_func = stack.back(); - delete function; + if (curr_func.use_count() == 1) { + // If this is the last reference, gather function references + // that will be recursively decremented. + gatherFunctions(curr_func.get(), stack); + } - if (deleteFunctionQueue.empty()) { - return; - } - if (recursion_depth.value() != deleteFunctionMaxRecursionDepth) { - AT_ERROR("Only one deleter per thread should be able to process " - "the delete queue. Please open an issue."); - } - while (!deleteFunctionQueue.empty()) { - auto queued_function = deleteFunctionQueue.front(); - deleteFunctionQueue.pop_front(); - delete queued_function; + stack.pop_back(); } } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index dfe8683197f8ee..b4c90b1489a261 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -33,8 +33,6 @@ using edge_list = std::vector; using saved_variable_list = std::vector; using IndexRange = std::pair; -TORCH_API extern size_t deleteFunctionMaxRecursionDepth; - // Custom deleter to prevent stack overflows. void deleteFunction(Function* function); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index b306426025e025..0be1cc2600a2f0 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -50,11 +50,6 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) }); m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); - /// TODO: Replace this ASAP with a better solution for deep autograd graphs! 
- m.def("_unsafe_set_delete_function_max_recursion_depth", [](size_t value) { - torch::autograd::deleteFunctionMaxRecursionDepth = value; - }); - Py_RETURN_TRUE; } From 02c4cd3c8a3a4234ffbddde8d1214f0cfb209086 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 12 Sep 2018 14:45:52 -0700 Subject: [PATCH 091/237] Skip flaky distributed tests (#11594) Summary: context: https://github.com/pytorch/pytorch/issues/11582 cc pietern The controller you requested could not be found. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11594 Differential Revision: D9798871 Pulled By: SsnL fbshipit-source-id: 9f9e1871c7fd9505ca898865eb8068fab4d3416d --- test/test_distributed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_distributed.py b/test/test_distributed.py index abf3663966700b..6dd0d984e842d3 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -408,6 +408,7 @@ def test_broadcast(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -642,6 +643,7 @@ def test_all_reduce_sum(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_all_reduce_sum_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -930,6 +932,7 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1190,6 +1193,7 @@ def test_DistributedDataParallelCPU(self): "Only Nccl & Gloo backend support DistributedDataParallel") @skip_if_no_cuda_distributed @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_DistributedDataParallel(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() From b663b7ce7ede61e48cf1d985f534bf028f94f343 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 15:14:54 -0700 Subject: [PATCH 092/237] Update ROCm Docker image with latest AMD debians (#11507) Summary: Building at https://ci.pytorch.org/jenkins/job/caffe2-docker-trigger/194/ Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11507 Differential Revision: D9772474 Pulled By: ezyang fbshipit-source-id: ab00f05744547dc7ec9f97511e2c8495ac282fac --- docker/caffe2/jenkins/common/install_rocm.sh | 29 ++++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh index c69d857118b2d7..82692d0acdb9b6 100644 --- a/docker/caffe2/jenkins/common/install_rocm.sh +++ b/docker/caffe2/jenkins/common/install_rocm.sh @@ -60,25 +60,30 @@ install_rocrand() { # Install rocSPARSE/hipSPARSE that will be released soon - can co-exist w/ hcSPARSE which will be removed soon install_hipsparse() { mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.1.0.deb -o /opt/rocm/debians/rocsparse.deb - curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.1.0.deb -o /opt/rocm/debians/hipsparse.deb + curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.2.114-Linux.deb -o /opt/rocm/debians/rocsparse.deb + curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.2.55-Linux.deb -o /opt/rocm/debians/hipsparse.deb dpkg -i /opt/rocm/debians/rocsparse.deb dpkg -i /opt/rocm/debians/hipsparse.deb } # Install custom hcc containing two compiler fixes relevant to PyTorch install_customhcc() { + HIP_VERSION="1.5.18354" mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-1.2.18272-Linux.deb - curl https://s3.amazonaws.com/ossci-linux/hip_base-1.5.18276.deb -o /opt/rocm/debians/hip_base-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_doc-1.5.18276.deb -o /opt/rocm/debians/hip_doc-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_samples-1.5.18276.deb -o /opt/rocm/debians/hip_samples-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_hcc-1.5.18276.deb -o /opt/rocm/debians/hip_hcc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hcc-1.2.18272-Linux.deb - dpkg -i /opt/rocm/debians/hip_base-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_doc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_samples-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_hcc-1.5.18276.deb + curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-Linux.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_base-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_base.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_doc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_doc.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_samples-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_samples.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_hcc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_hcc.deb + dpkg -i /opt/rocm/debians/hcc-Linux.deb + dpkg -i /opt/rocm/debians/hip_base.deb + dpkg -i /opt/rocm/debians/hip_doc.deb + dpkg -i /opt/rocm/debians/hip_samples.deb + dpkg -i /opt/rocm/debians/hip_hcc.deb + + if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake + fi } # Install Python packages depending on the base OS From ac94889939573d00ba10394766f036dc90ae5a21 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 12 Sep 2018 15:16:05 -0700 Subject: [PATCH 093/237] Add jit doc entry to sidebar (#11598) Summary: cc zdevito apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11598 Differential Revision: D9801230 Pulled By: SsnL fbshipit-source-id: f0c8d2468b64a50c3c437667d462722dcd2682d1 
--- docs/source/index.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 7732366d38a61c..31c6c3f743da06 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,8 +32,8 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. optim torch.autograd torch.distributed - torch.distributed.deprecated torch.distributions + torch.jit torch.multiprocessing bottleneck checkpoint @@ -43,6 +43,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. ffi model_zoo onnx + torch.distributed.deprecated torch.legacy .. toctree:: From c81406c5148bbbf2b0a4e3ef92636fe70fa2d6a6 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 15:39:27 -0700 Subject: [PATCH 094/237] Document Any (#11580) Summary: Documents the `AnyModule` class in the C++ API. Also changed the API to be friendlier by default. Calling `AnyModule::forward` used to return an `AnyModule::Value` which you had to call `.get()` on to cast to a concrete type. I changed the name of that `forward` method to `any_forward` and instead made `forward` templated on a `ReturnType` template parameter which you can supply to do the `.get` cast for you automatically. I default this parameter to `torch::Tensor` so that it can often be omitted. So where you used to have to write ```cpp any_module.forward(...).get(); any_module.forward(...).get(); ``` you now write ```cpp any_module.forward(...); any_module.forward(...); ``` ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11580 Differential Revision: D9798626 Pulled By: goldsborough fbshipit-source-id: 060b4ea28facaffc417f53b80b846a9dff9acb73 --- test/cpp/api/any.cpp | 28 +++-- torch/csrc/api/include/torch/nn/modules/any.h | 105 ++++++++++++++++-- .../api/include/torch/nn/modules/sequential.h | 6 +- 3 files changed, 117 insertions(+), 22 deletions(-) diff --git a/test/cpp/api/any.cpp b/test/cpp/api/any.cpp index ab044b84dc056d..9368d4dd5a4983 100644 --- a/test/cpp/api/any.cpp +++ b/test/cpp/api/any.cpp @@ -22,8 +22,9 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE(any.forward().get() == 123); + REQUIRE(any.forward() == 123); } + SECTION("int(int)") { struct M : torch::nn::Module { int forward(int x) { @@ -31,8 +32,9 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE(any.forward(5).get() == 5); + REQUIRE(any.forward(5) == 5); } + SECTION("const char*(const char*)") { struct M : torch::nn::Module { const char* forward(const char* x) { @@ -40,7 +42,7 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE(any.forward("hello").get() == std::string("hello")); + REQUIRE(any.forward("hello") == std::string("hello")); } SECTION("string(int, const double)") { @@ -51,7 +53,7 @@ TEST_CASE("any-module") { }; AnyModule any(M{}); int x = 4; - REQUIRE(any.forward(x, 3.14).get() == std::string("7")); + REQUIRE(any.forward(x, 3.14) == std::string("7")); } SECTION("Tensor(string, const string&, string&&)") { @@ -66,8 +68,8 @@ TEST_CASE("any-module") { }; AnyModule any(M{}); REQUIRE( - any.forward(std::string("a"), std::string("ab"), std::string("abc")) - .get() + any.forward( + std::string("a"), std::string("ab"), std::string("abc")) .sum() .toCInt() == 6); } @@ -181,7 +183,7 @@ TEST_CASE("any-module") { any.forward(5), StartsWith("Cannot call forward() on an empty AnyModule")); } - SECTION("can move assign differentm modules") { + SECTION("can move assign different modules") { struct M : 
torch::nn::Module { std::string forward(int x) { return std::to_string(x); @@ -196,10 +198,10 @@ TEST_CASE("any-module") { REQUIRE(any.is_empty()); any = std::make_shared(); REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5).get() == "5"); + REQUIRE(any.forward(5) == "5"); any = std::make_shared(); REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5.0f).get() == 8); + REQUIRE(any.forward(5.0f) == 8); } SECTION("constructs from ModuleHolder") { struct MImpl : torch::nn::Module { @@ -218,6 +220,10 @@ TEST_CASE("any-module") { AnyModule any(M{5}); REQUIRE(any.get().value == 5); REQUIRE(any.get()->value == 5); + + AnyModule module(Linear(3, 4)); + std::shared_ptr ptr = module.ptr(); + Linear linear(module.get()); } SECTION("converts autograd::Variable to torch::Tensor correctly") { struct M : torch::nn::Module { @@ -232,12 +238,10 @@ TEST_CASE("any-module") { AnyModule any(M{}); REQUIRE( any.forward(torch::autograd::Variable(torch::ones(5))) - .get() .sum() .toCFloat() == 5); // at::Tensors that are not variables work too. - REQUIRE( - any.forward(at::ones(5)).get().sum().toCFloat() == 5); + REQUIRE(any.forward(at::ones(5)).sum().toCFloat() == 5); } } } diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 920aea6aab3a54..626149469b5035 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -21,10 +21,87 @@ namespace torch { namespace nn { -/// A class to store a type erased module, whose `forward()` method can be -/// invoked, with dynamic type checking. An `AnyModule` has an empty state, into -/// which it is default constructed. `is_empty()` can be used to query whether -/// the `AnyModule` is empty. +/// Stores a type erased `Module`. +/// +/// The PyTorch C++ API does not impose an interface on the signature of +/// `forward()` in `Module` subclasses. This gives you complete freedom to +/// design your `forward()` methods to your liking. However, this also means +/// there is no unified base type you could store in order to call `forward()` +/// polymorphically for any module. This is where the `AnyModule` comes in. +/// Instead of inheritance, it relies on type erasure for polymorphism. +/// +/// An `AnyModule` can store any `nn::Module` subclass that provides a +/// `forward()` method. This `forward()` may accept any types and return any +/// type. Once stored in an `AnyModule`, you can invoke the underlying module's +/// `forward()` by calling `AnyModule::forward()` with the arguments you would +/// supply to the stored module (though see one important limitation below). +/// Example: +/// +/// \rst +/// .. code-block:: +/// struct GenericTrainer { +/// torch::nn::AnyModule module; +/// +/// void train(torch::Tensor input) { +/// module.forward(input); +/// } +/// }; +/// +/// GenericTrainer trainer1{torch::nn::Linear(3, 4)}; +/// GenericTrainer trainer2{torch::nn::Conv2d(3, 4, 2)}; +/// \endrst +/// +/// As `AnyModule` erases the static type of the stored module (and its +/// `forward()` method) to achieve polymorphism, type checking of arguments is +/// moved to runtime. That is, passing an argument with an incorrect type to an +/// `AnyModule` will compile, but throw an exception at runtime: +/// +/// \rst +/// .. code-block:: +/// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); +/// // Linear takes a tensor as input, but we are passing an integer. +/// // This will compile, but throw a `torch::Error` exception at runtime. 
+/// module.forward(123); +/// \endrst +/// +/// \rst +/// .. attention:: +/// One noteworthy limitation of `AnyModule` is that its `forward()` method +/// does not support implicit conversion of argument types. For example, if +/// the stored module's `forward()` method accepts a `float` and you call +/// `any_module.forward(3.4)` (where `3.4` is a `double`), this will throw +/// an exception. +/// \endrst +/// +/// The return type of the `AnyModule`'s `forward()` method is controlled via +/// the first template argument to `AnyModule::forward()`. It defaults to +/// `torch::Tensor`. To change it, you can write `any_module.forward()`, +/// for example. +/// +/// \rst +/// .. code-block:: +/// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); +/// auto output = module.forward(torch::ones({2, 3})); +/// +/// struct IntModule { +/// int forward(int x) { return x; } +/// }; +/// torch::nn::AnyModule module(IntModule{}); +/// int output = module.forward(5); +/// \endrst +/// +/// The only other method an `AnyModule` provides access to on the stored +/// module is `clone()`. However, you may acquire a handle on the module via +/// `.ptr()`, which returns a `shared_ptr`. Further, if you know +/// the concrete type of the stored module, you can get a concrete handle to it +/// using `.get()` where `T` is the concrete module type. +/// +/// \rst +/// .. code-block:: +/// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); +/// std::shared_ptr ptr = module.ptr(); +/// torch::nn::Linear linear(module.get()); +/// \endrst class AnyModule { public: /// A type-erased value. @@ -69,7 +146,13 @@ class AnyModule { /// returns the return value as an `Value`. Use this method when chaining /// `AnyModule`s in a loop. template - Value forward(ArgumentTypes&&... arguments); + Value any_forward(ArgumentTypes&&... arguments); + + /// Invokes `forward()` on the contained module with the given arguments, and + /// casts the returned `Value` to the supplied `ReturnType` (which defaults to + /// `torch::Tensor`). + template + ReturnType forward(ArgumentTypes&&... arguments); /// Attempts to cast the underlying module to the given module type. Throws an /// exception if the types do not match. @@ -358,7 +441,7 @@ AnyModule& AnyModule::operator=(std::shared_ptr module) { } template -AnyModule::Value AnyModule::forward(ArgumentTypes&&... arguments) { +AnyModule::Value AnyModule::any_forward(ArgumentTypes&&... arguments) { AT_CHECK(!is_empty(), "Cannot call forward() on an empty AnyModule"); std::vector values; values.reserve(sizeof...(ArgumentTypes)); @@ -368,6 +451,12 @@ AnyModule::Value AnyModule::forward(ArgumentTypes&&... arguments) { return content_->forward(std::move(values)); } +template +ReturnType AnyModule::forward(ArgumentTypes&&... arguments) { + return any_forward(std::forward(arguments)...) + .template get(); +} + template T& AnyModule::get() { AT_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); @@ -393,9 +482,9 @@ inline std::shared_ptr AnyModule::ptr() const { template std::shared_ptr AnyModule::ptr() const { AT_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); - /// Call get() but discard the value, just to do the type checking. + // Call get() but discard the value, just to do the type checking. 
get_(); - return std::static_pointer_cast(ptr()); + return std::dynamic_pointer_cast(ptr()); } inline const std::type_info& AnyModule::type_info() const { diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index 9f3f7a0b983ab9..384afd1b7814bb 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -57,10 +58,11 @@ class SequentialImpl : public Cloneable { AT_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); auto iterator = modules_.begin(); - auto input = iterator->forward(std::forward(arguments)...); + auto input = + iterator->any_forward(std::forward(arguments)...); for (++iterator; iterator != modules_.end(); ++iterator) { - input = iterator->forward(std::move(input)); + input = iterator->any_forward(std::move(input)); } // Check the return value and give a nice error message if the requsted From eb7a298489f5e755846c7fb6af1bc6ae338265f1 Mon Sep 17 00:00:00 2001 From: Xiaodong Wang Date: Wed, 12 Sep 2018 15:53:29 -0700 Subject: [PATCH 095/237] Add resnext model to OSS (#11468) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11468 Add resnext model into OSS Caffe 2 repo. Reviewed By: orionr, kuttas Differential Revision: D9506000 fbshipit-source-id: 236005d5d7dbeb8c2864014b1eea03810618d8e8 --- caffe2/python/examples/resnet50_trainer.py | 91 +++++-- caffe2/python/models/resnet.py | 264 ++++++++++++++------- 2 files changed, 253 insertions(+), 102 deletions(-) diff --git a/caffe2/python/examples/resnet50_trainer.py b/caffe2/python/examples/resnet50_trainer.py index e1716857bb16b0..05b753b8fd397e 100644 --- a/caffe2/python/examples/resnet50_trainer.py +++ b/caffe2/python/examples/resnet50_trainer.py @@ -22,8 +22,10 @@ from caffe2.python.predictor_constants import predictor_constants as predictor_constants ''' -Parallelized multi-GPU distributed trainer for Resnet 50. Can be used to train -on imagenet data, for example. +Parallelized multi-GPU distributed trainer for Resne(X)t. +Can be used to train on imagenet data, for example. +The default parameters can train a standard Resnet-50 (1x64d), and parameters +can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d). To run the trainer in single-machine multi-gpu mode by setting num_shards = 1. @@ -39,14 +41,23 @@ ''' logging.basicConfig() -log = logging.getLogger("resnet50_trainer") +log = logging.getLogger("ResNe(X)t_trainer") log.setLevel(logging.DEBUG) dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops') dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops') -def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): +def AddImageInput( + model, + reader, + batch_size, + img_size, + dtype, + is_test, + mean_per_channel=None, + std_per_channel=None, +): ''' The image input operator loads image and label data from the reader and applies transformations to the images (random cropping, mirroring, ...). 
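# A minimal usage sketch of the expanded signature above, assuming a
# `test_model` and `reader` already exist as elsewhere in this script;
# the numeric values are illustrative only (when given, mean_per_channel
# takes precedence over the scalar mean):
AddImageInput(
    test_model,
    reader,
    batch_size=32,
    img_size=227,
    dtype='float',
    is_test=True,
    mean_per_channel=[104.0, 117.0, 124.0],  # illustrative per-channel means (0-255 scale)
    std_per_channel=[58.0, 57.0, 57.0],      # illustrative per-channel stds (0-255 scale)
)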
@@ -58,6 +69,9 @@ def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): output_type=dtype, use_gpu_transform=True if model._device_type == 1 else False, use_caffe_datum=True, + mean_per_channel=mean_per_channel, + std_per_channel=std_per_channel, + # mean_per_channel takes precedence over mean mean=128., std=128., scale=256, @@ -166,6 +180,7 @@ def RunEpoch( # TODO: add loading from checkpoint log.info("Starting epoch {}/{}".format(epoch, args.num_epochs)) epoch_iters = int(args.epoch_size / total_batch_size / num_shards) + test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards) for i in range(epoch_iters): # This timeout is required (temporarily) since CUDA-NCCL # operators might deadlock when synchronizing between GPUs. @@ -194,19 +209,25 @@ def RunEpoch( data_parallel_model.GetLearningRateBlobNames(train_model)[0] ) test_accuracy = 0 - if (test_model is not None): + test_accuracy_top5 = 0 + if test_model is not None: # Run 100 iters of testing ntests = 0 - for _ in range(0, 100): + for _ in range(test_epoch_iters): workspace.RunNet(test_model.net.Proto().name) for g in test_model._devices: test_accuracy += np.asscalar(workspace.FetchBlob( "{}_{}".format(test_model._device_prefix, g) + '/accuracy' )) + test_accuracy_top5 += np.asscalar(workspace.FetchBlob( + "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5' + )) ntests += 1 test_accuracy /= ntests + test_accuracy_top5 /= ntests else: test_accuracy = (-1) + test_accuracy_top5 = (-1) explog.log( input_count=num_images, @@ -216,7 +237,8 @@ def RunEpoch( 'loss': loss, 'learning_rate': learning_rate, 'epoch': epoch, - 'test_accuracy': test_accuracy, + 'top1_test_accuracy': test_accuracy, + 'top5_test_accuracy': test_accuracy_top5, } ) assert loss < 40, "Exploded gradients :(" @@ -243,6 +265,17 @@ def Train(args): total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" + # Verify valid image mean/std per channel + if args.image_mean_per_channel: + assert \ + len(args.image_mean_per_channel) == args.num_channels, \ + "The number of channels of image mean doesn't match input" + + if args.image_std_per_channel: + assert \ + len(args.image_std_per_channel) == args.num_channels, \ + "The number of channels of image std doesn't match input" + # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) @@ -262,7 +295,7 @@ def Train(args): 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( - name="resnet50", arg_scope=train_arg_scope + name='resnext' + str(args.num_layers), arg_scope=train_arg_scope ) num_shards = args.num_shards @@ -324,7 +357,7 @@ def Train(args): rendezvous = None # Model building functions - def create_resnet50_model_ops(model, loss_scale): + def create_resnext_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) @@ -333,11 +366,14 @@ def create_resnet50_model_ops(model, loss_scale): BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): - pred = resnet.create_resnet50( + pred = resnet.create_resnext( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, + num_layers=args.num_layers, + num_groups=args.resnext_num_groups, + num_width_per_group=args.resnext_width_per_group, no_bias=True, no_loss=True, ) @@ -348,7 +384,8 @@ def 
create_resnet50_model_ops(model, loss_scale): softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) - brew.accuracy(model, [softmax, "label"], "accuracy") + brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) + brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def add_optimizer(model): @@ -408,6 +445,8 @@ def add_image_input(model): img_size=args.image_size, dtype=args.dtype, is_test=False, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) def add_post_sync_ops(model): @@ -423,7 +462,7 @@ def add_post_sync_ops(model): data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, - forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, @@ -449,7 +488,9 @@ def add_post_sync_ops(model): 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( - name="resnet50_test", arg_scope=test_arg_scope, init_params=False + name='resnext' + str(args.num_layers) + "_test", + arg_scope=test_arg_scope, + init_params=False, ) test_reader = test_model.CreateDB( @@ -466,12 +507,14 @@ def test_input_fn(model): img_size=args.image_size, dtype=args.dtype, is_test=True, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, - forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, @@ -497,7 +540,8 @@ def test_input_fn(model): else: log.warning("The format of load_model_path doesn't match!") - expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( + expname = "resnext_%d_gpu%d_b%d_L%d_lr%.2f_v2" % ( + args.num_layers, args.num_gpus, total_batch_size, args.num_labels, @@ -534,12 +578,24 @@ def test_input_fn(model): def main(): # TODO: use argv parser = argparse.ArgumentParser( - description="Caffe2: Resnet-50 training" + description="Caffe2: ResNe(X)t training" ) parser.add_argument("--train_data", type=str, default=None, required=True, help="Path to training data (or 'null' to simulate)") + parser.add_argument("--num_layers", type=int, default=50, + help="The number of layers in ResNe(X)t model") + parser.add_argument("--resnext_num_groups", type=int, default=1, + help="The cardinality of resnext") + parser.add_argument("--resnext_width_per_group", type=int, default=64, + help="The cardinality of resnext") parser.add_argument("--test_data", type=str, default=None, help="Path to test data") + parser.add_argument("--image_mean_per_channel", type=float, nargs='+', + help="The per channel mean for the images") + parser.add_argument("--image_std_per_channel", type=float, nargs='+', + help="The per channel standard deviation for the images") + parser.add_argument("--test_epoch_size", type=int, default=50000, + help="Number of test images") parser.add_argument("--db_type", type=str, default="lmdb", help="Database type (such as lmdb or leveldb)") parser.add_argument("--gpus", type=str, @@ -576,7 +632,7 @@ def main(): help="Port of Redis server (for rendezvous)") parser.add_argument("--file_store_path", type=str, default="/tmp", help="Path to directory to use for rendezvous") - parser.add_argument("--save_model_name", type=str, default="resnet50_model", + 
parser.add_argument("--save_model_name", type=str, default="resnext_model", help="Save the trained model to a given name") parser.add_argument("--load_model_path", type=str, default=None, help="Load previously saved model to continue training") @@ -598,6 +654,7 @@ def main(): Train(args) + if __name__ == '__main__': workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) main() diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 60e00ed1a1ae24..7c6c6dc27fe10a 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -6,9 +6,12 @@ from __future__ import print_function from caffe2.python import brew +import logging + ''' -Utility for creating ResNets -See "Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +Utility for creating ResNe(X)t +"Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +"Aggregated Residual Transformations for Deep Neural Networks" by Xie et. al. 2016 ''' @@ -17,16 +20,33 @@ class ResNetBuilder(): Helper class for constructing residual blocks. ''' - def __init__(self, model, prev_blob, no_bias, is_test, spatial_bn_mom=0.9): + def __init__( + self, + model, + prev_blob, + no_bias, + is_test, + bn_epsilon=1e-5, + bn_momentum=0.9, + ): self.model = model self.comp_count = 0 self.comp_idx = 0 self.prev_blob = prev_blob self.is_test = is_test - self.spatial_bn_mom = spatial_bn_mom + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum self.no_bias = 1 if no_bias else 0 - def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): + def add_conv( + self, + in_filters, + out_filters, + kernel, + stride=1, + group=1, + pad=0, + ): self.comp_idx += 1 self.prev_blob = brew.conv( self.model, @@ -37,6 +57,7 @@ def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): weight_init=("MSRAFill", {}), kernel=kernel, stride=stride, + group=group, pad=pad, no_bias=self.no_bias, ) @@ -56,8 +77,8 @@ def add_spatial_bn(self, num_filters): self.prev_blob, 'comp_%d_spatbn_%d' % (self.comp_count, self.comp_idx), num_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) return self.prev_blob @@ -71,7 +92,8 @@ def add_bottleneck( input_filters, # num of feature maps from preceding layer base_filters, # num of filters internally in the component output_filters, # num of feature maps to output - down_sampling=False, + stride=1, + group=1, spatial_batch_norm=True, ): self.comp_idx = 0 @@ -82,7 +104,7 @@ def add_bottleneck( input_filters, base_filters, kernel=1, - stride=1 + stride=1, ) if spatial_batch_norm: @@ -95,8 +117,9 @@ def add_bottleneck( base_filters, base_filters, kernel=3, - stride=(1 if down_sampling is False else 2), - pad=1 + stride=stride, + group=group, + pad=1, ) if spatial_batch_norm: @@ -109,9 +132,10 @@ def add_bottleneck( last_conv = self.add_spatial_bn(output_filters) # Summation with input signal (shortcut) - # If we need to increase dimensions (feature maps), need to - # do a projection for the short cut - if (output_filters > input_filters): + # When the number of feature maps mismatch between the input + # and output (this usually happens when the residual stage + # changes), we need to do a projection for the short cut + if output_filters != input_filters: shortcut_blob = brew.conv( self.model, shortcut_blob, @@ -120,7 +144,7 @@ def add_bottleneck( output_filters, weight_init=("MSRAFill", {}), kernel=1, - stride=(1 if down_sampling is False else 2), + 
stride=stride, no_bias=self.no_bias, ) if spatial_batch_norm: @@ -129,8 +153,8 @@ def add_bottleneck( shortcut_blob, 'shortcut_projection_%d_spatbn' % self.comp_count, output_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) @@ -144,6 +168,8 @@ def add_bottleneck( # Keep track of number of high level components if this ResNetBuilder self.comp_count += 1 + return output_filters + def add_simple_block( self, input_filters, @@ -205,28 +231,102 @@ def add_simple_block( self.comp_count += 1 +def create_resnet_32x32( + model, data, num_input_channels, num_groups, num_labels, is_test=False +): + ''' + Create residual net for smaller images (sec 4.2 of He et. al (2015)) + num_groups = 'n' in the paper + ''' + # conv1 + maxpool + brew.conv( + model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 + ) + brew.spatial_bn( + model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test + ) + brew.relu(model, 'conv1_spatbn', 'relu1') + + # Number of blocks as described in sec 4.2 + filters = [16, 32, 64] + + builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) + prev_filters = 16 + for groupidx in range(0, 3): + for blockidx in range(0, 2 * num_groups): + builder.add_simple_block( + prev_filters if blockidx == 0 else filters[groupidx], + filters[groupidx], + down_sampling=(True if blockidx == 0 and + groupidx > 0 else False)) + prev_filters = filters[groupidx] + + # Final layers + brew.average_pool( + model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + ) + brew.fc(model, 'final_avg', 'last_out', 64, num_labels) + softmax = brew.softmax(model, 'last_out', 'softmax') + return softmax + + +RESNEXT_BLOCK_CONFIG = { + 18: (2, 2, 2, 2), + 34: (3, 4, 6, 3), + 50: (3, 4, 6, 3), + 101: (3, 4, 23, 3), + 152: (3, 8, 36, 3), + 200: (3, 24, 36, 3), +} + +RESNEXT_STRIDES = [1, 2, 2, 2] + +logging.basicConfig() +log = logging.getLogger("resnext_builder") +log.setLevel(logging.DEBUG) + + # The conv1 and final_avg kernel/stride args provide a basic mechanism for # adapting resnet50 for different sizes of input images. 
-def create_resnet50( +def create_resnext( model, data, num_input_channels, num_labels, + num_layers, + num_groups, + num_width_per_group, label=None, is_test=False, no_loss=False, - no_bias=0, + no_bias=1, conv1_kernel=7, conv1_stride=2, final_avg_kernel=7, + log=None, + bn_epsilon=1e-5, + bn_momentum=0.9, ): + if num_layers not in RESNEXT_BLOCK_CONFIG: + log.error("{}-layer is invalid for resnext config".format(num_layers)) + + num_blocks = RESNEXT_BLOCK_CONFIG[num_layers] + strides = RESNEXT_STRIDES + num_filters = [64, 256, 512, 1024, 2048] + + if num_layers in [18, 34]: + num_filters = [64, 64, 128, 256, 512] + + # the number of features before the last FC layer + num_features = num_filters[-1] + # conv1 + maxpool - brew.conv( + conv_blob = brew.conv( model, data, 'conv1', num_input_channels, - 64, + num_filters[0], weight_init=("MSRAFill", {}), kernel=conv1_kernel, stride=conv1_stride, @@ -234,41 +334,40 @@ def create_resnet50( no_bias=no_bias ) - brew.spatial_bn( + bn_blob = brew.spatial_bn( model, - 'conv1', + conv_blob, 'conv1_spatbn_relu', - 64, - epsilon=1e-3, - momentum=0.1, + num_filters[0], + epsilon=bn_epsilon, + momentum=bn_momentum, is_test=is_test ) - brew.relu(model, 'conv1_spatbn_relu', 'conv1_spatbn_relu') - brew.max_pool(model, 'conv1_spatbn_relu', 'pool1', kernel=3, stride=2) + relu_blob = brew.relu(model, bn_blob, bn_blob) + max_pool = brew.max_pool(model, relu_blob, 'pool1', kernel=3, stride=2, pad=1) # Residual blocks... - builder = ResNetBuilder(model, 'pool1', no_bias=no_bias, - is_test=is_test, spatial_bn_mom=0.1) - - # conv2_x (ref Table 1 in He et al. (2015)) - builder.add_bottleneck(64, 64, 256) - builder.add_bottleneck(256, 64, 256) - builder.add_bottleneck(256, 64, 256) - - # conv3_x - builder.add_bottleneck(256, 128, 512, down_sampling=True) - for _ in range(1, 4): - builder.add_bottleneck(512, 128, 512) - - # conv4_x - builder.add_bottleneck(512, 256, 1024, down_sampling=True) - for _ in range(1, 6): - builder.add_bottleneck(1024, 256, 1024) + builder = ResNetBuilder(model, max_pool, no_bias=no_bias, + is_test=is_test, bn_epsilon=1e-5, bn_momentum=0.9) + + inner_dim = num_groups * num_width_per_group + + # 4 different kinds of residual blocks + for residual_idx in range(4): + residual_num = num_blocks[residual_idx] + residual_stride = strides[residual_idx] + dim_in = num_filters[residual_idx] + + for blk_idx in range(residual_num): + dim_in = builder.add_bottleneck( + dim_in, + inner_dim, + num_filters[residual_idx + 1], # dim out + stride=residual_stride if blk_idx == 0 else 1, + group=num_groups, + ) - # conv5_x - builder.add_bottleneck(1024, 512, 2048, down_sampling=True) - builder.add_bottleneck(2048, 512, 2048) - builder.add_bottleneck(2048, 512, 2048) + inner_dim *= 2 # Final layers final_avg = brew.average_pool( @@ -282,7 +381,7 @@ def create_resnet50( # Final dimension of the "image" is reduced to 7x7 last_out = brew.fc( - model, final_avg, 'last_out_L{}'.format(num_labels), 2048, num_labels + model, final_avg, 'last_out_L{}'.format(num_labels), num_features, num_labels ) if no_loss: @@ -301,40 +400,35 @@ def create_resnet50( return brew.softmax(model, last_out, "softmax") -def create_resnet_32x32( - model, data, num_input_channels, num_groups, num_labels, is_test=False +# The conv1 and final_avg kernel/stride args provide a basic mechanism for +# adapting resnet50 for different sizes of input images. 
+def create_resnet50( + model, + data, + num_input_channels, + num_labels, + label=None, + is_test=False, + no_loss=False, + no_bias=0, + conv1_kernel=7, + conv1_stride=2, + final_avg_kernel=7, ): - ''' - Create residual net for smaller images (sec 4.2 of He et. al (2015)) - num_groups = 'n' in the paper - ''' - # conv1 + maxpool - brew.conv( - model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 - ) - brew.spatial_bn( - model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test - ) - brew.relu(model, 'conv1_spatbn', 'relu1') - - # Number of blocks as described in sec 4.2 - filters = [16, 32, 64] - - builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) - prev_filters = 16 - for groupidx in range(0, 3): - for blockidx in range(0, 2 * num_groups): - builder.add_simple_block( - prev_filters if blockidx == 0 else filters[groupidx], - filters[groupidx], - down_sampling=(True if blockidx == 0 and - groupidx > 0 else False)) - prev_filters = filters[groupidx] - - # Final layers - brew.average_pool( - model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + # resnet50 is a special case for ResNeXt50-1x64d + return create_resnext( + model, + data, + num_input_channels, + num_labels, + num_layers=50, + num_groups=1, + num_width_per_group=64, + label=label, + is_test=is_test, + no_loss=no_loss, + no_bias=no_bias, + conv1_kernel=conv1_kernel, + conv1_stride=conv1_stride, + final_avg_kernel=final_avg_kernel, ) - brew.fc(model, 'final_avg', 'last_out', 64, num_labels) - softmax = brew.softmax(model, 'last_out', 'softmax') - return softmax From 316c1679408f99d97d516a8e3b5d7964bd4e78ea Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Sep 2018 16:10:24 -0700 Subject: [PATCH 096/237] Add checking of nullptrs in GetTensorInfo (#11587) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11587 To help debug the issue in T33295362, we add some checks in the function. Possible crashing site in `GetTensorInfo` 1. tc is nullptr, which is checked. 2. tc->capacity_nbytes() hits nullptr, this is unlikely because storage is not a pointer and compute of capacity_nbytes doesn't involve pointers. It's numel * itermsize(). 3. tc->ExtractDeviceOption hits nullpt. One possibility raw_data() is nullptr because tc->ExtractDeviceOption will use that. This is checked. 4. Tensor itself which is not a reference. This is also checked. 
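
Concretely, the guards added below boil down to a few defensive assertions around those sites:

```cpp
// In GetTensorInfo (tensor.cc): guard the tensor pointer, its impl, and its storage.
const Tensor* tc = static_cast<const Tensor*>(c);
CHECK(tc);
CHECK(tc->unsafeGetTensorImpl());
CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImp());

// In CUDAStaticContext::ExtractDeviceOption (context_gpu.h): guard the raw data pointer.
CAFFE_ENFORCE(data, "data cannot be nullptr");
```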
Reviewed By: salexspb Differential Revision: D9793484 fbshipit-source-id: 3fc72746fc310a23ae45553bbe0d269a4b9edb72 --- caffe2/core/context_gpu.h | 1 + caffe2/core/storage.h | 5 +++++ caffe2/core/tensor.cc | 3 +++ 3 files changed, 9 insertions(+) diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 3090ca57aedc31..cca1b8be6500a8 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -403,6 +403,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { } void ExtractDeviceOption(DeviceOption* device, const void* data) override { + CAFFE_ENFORCE(data, "data cannot be nullptr"); device->set_device_type(TypeToProto(GetDeviceType())); device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 974985625df6b1..12bff9af69ad03 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -214,6 +214,11 @@ class CAFFE2_API Storage { storage_impl_->reset(); } + // For debugging purpose only, please don't call it + StorageImpl* unsafeGetStorageImp() const { + return storage_impl_.get(); + } + template inline bool IsType() const { return storage_impl_->IsType(); diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 1659e6ba252bab..a24f8cb67b7347 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -94,6 +94,9 @@ vector GetTensorInfo( size_t* capacity, DeviceOption* device) { const Tensor* tc = static_cast(c); + CHECK(tc); + CHECK(tc->unsafeGetTensorImpl()); + CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImp()); *capacity = tc->capacity_nbytes(); tc->ExtractDeviceOption(device); return tc->dims(); From 12efef166aff92742a0816c6f62dca225a5a4c2e Mon Sep 17 00:00:00 2001 From: Eli Amesefe Date: Wed, 12 Sep 2018 16:18:10 -0700 Subject: [PATCH 097/237] Split out copy_op from utility_ops (#11470) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11470 In order to reduce build sizes, we are identifying files that can be split up into smaller units, allowing us to only include the ops we need. Reviewed By: orionr, ajtulloch Differential Revision: D9725819 fbshipit-source-id: def1074a33dffe99bd6a7e6e48aa9e5be3d04a6a --- caffe2/operators/copy_op.cc | 198 ++++++++++++++++++++++++++++++++ caffe2/operators/copy_op.cu | 48 ++++++++ caffe2/operators/copy_op.h | 38 ++++++ caffe2/operators/utility_ops.cc | 191 ------------------------------ caffe2/operators/utility_ops.cu | 43 ------- caffe2/operators/utility_ops.h | 27 ----- 6 files changed, 284 insertions(+), 261 deletions(-) create mode 100644 caffe2/operators/copy_op.cc create mode 100644 caffe2/operators/copy_op.cu create mode 100644 caffe2/operators/copy_op.h diff --git a/caffe2/operators/copy_op.cc b/caffe2/operators/copy_op.cc new file mode 100644 index 00000000000000..582e31475780bf --- /dev/null +++ b/caffe2/operators/copy_op.cc @@ -0,0 +1,198 @@ +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +// From CPU, copy it to whatever the current context +REGISTER_CPU_OPERATOR( + CopyFromCPUInput, + CopyOp); +REGISTER_CPU_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +REGISTER_CPU_OPERATOR(Copy, CopyOp); + +OPERATOR_SCHEMA(Copy) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .SetDoc(R"DOC( +Copy input tensor into output, potentially across devices. + +Github Links: + +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.cc +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.h + + +
+ + Example + +**Code** + +``` + +workspace.ResetWorkspace() + +op = core.CreateOperator( + "Copy", + ["input"], + ["output"] +) + +workspace.FeedBlob("input", np.random.rand(3,3)) +print("input:", workspace.FetchBlob("input")) +workspace.RunOperatorOnce(op) +print("output:", workspace.FetchBlob("output")) + +``` + +**Result** + +``` + +input: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] +output: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] + +``` + +
+ +)DOC") + .Input(0, "input", "(*Tensor*): input tensor to copy") + .Output(0, "output", "(*Tensor*): copy of input tensor"); + +OPERATOR_SCHEMA(CopyGPUToCPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyGPUToCPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cuda_option); + vector out_dev(def.output_size(), cpu_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for GPU to CPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyCPUToGPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyCPUToGPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), cuda_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for CPU to GPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyFromCPUInput) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + auto op_device = + def.has_device_option() ? def.device_option() : DeviceOption(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), op_device); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Take a CPU input tensor and copy it to an output in the current +Context (GPU or CPU). This may involves cross-device MemCpy. 
+)DOC") + .Input(0, "input", "The input CPU tensor.") + .Output(0, "output", "either a TensorCUDA or a TensorCPU"); + +OPERATOR_SCHEMA(CopyOnDeviceLike) + .NumInputs(2) + .NumOutputs(1) + .SetDoc("Copy input tensor into output to the specific device.") + .Input(0, "input", "The input tensor.") + .Input(1, "dst", "Tensor, on which device the copy will be performed.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +struct GetCopyGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + return SingleGradientDef( + "CopyOnDeviceLike", + "", + vector{GO(0), I(0)}, + vector{GI(0)}); + } +}; +REGISTER_GRADIENT(Copy, GetCopyGradient); + +struct GetGPUToCPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); + +struct GetCPUToGPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.cu b/caffe2/operators/copy_op.cu new file mode 100644 index 00000000000000..e833e720e556f3 --- /dev/null +++ b/caffe2/operators/copy_op.cu @@ -0,0 +1,48 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +template <> +class CopyOnDeviceLikeOp + : public Operator { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + USE_OPERATOR_FUNCTIONS(CUDAContext); + + bool RunOnDevice() override { + auto& input = Input(0); + auto* output = OperatorBase::Output(0, CUDA); + CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); + output->ResizeLike(input); + context.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +// From CPU, copy it to whatever the current context +REGISTER_CUDA_OPERATOR( + CopyFromCPUInput, + CopyOp); + +// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, +// since gpu code will be involved. +REGISTER_CUDA_OPERATOR( + CopyGPUToCPU, + CopyOp); +REGISTER_CUDA_OPERATOR( + CopyCPUToGPU, + CopyOp); +// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe +// involving different GPUs. 
+REGISTER_CUDA_OPERATOR(Copy, CopyOp); + +REGISTER_CUDA_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.h b/caffe2/operators/copy_op.h new file mode 100644 index 00000000000000..11e8e15fbcf005 --- /dev/null +++ b/caffe2/operators/copy_op.h @@ -0,0 +1,38 @@ +#ifndef CAFFE2_OPERATORS_COPY_OP_H_ +#define CAFFE2_OPERATORS_COPY_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template +class CopyOp : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + USE_SIMPLE_CTOR_DTOR(CopyOp) + + bool RunOnDevice() override { + auto& input = this->template Input(0, SrcContext::GetDeviceType()); + auto* output = + this->template Output(0, DstContext::GetDeviceType()); + output->ResizeLike(input); + this->context_.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +template +class CopyOnDeviceLikeOp : public CopyOp { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : CopyOp(operator_def, ws) {} +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_COPY_OP_H_ diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 6c287c37f3d6c9..b370b5ecb438cc 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -52,14 +52,6 @@ REGISTER_CPU_OPERATOR( ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); -// From CPU, copy it to whatever the current context -REGISTER_CPU_OPERATOR( - CopyFromCPUInput, - CopyOp); -REGISTER_CPU_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); -REGISTER_CPU_OPERATOR(Copy, CopyOp); REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp); REGISTER_CPU_OPERATOR(HasElements, HasElementsOp); REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp); @@ -379,133 +371,6 @@ Currently only works on CPU because of access to INDICES. "Update slices, with shape len(INDICES) + shape(X_0)[1:]") .Output(0, "DATA", "Has to be exactly the same tensor as the input 0"); -OPERATOR_SCHEMA(Copy) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .SetDoc(R"DOC( -Copy input tensor into output, potentially across devices. - -Github Links: - -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.cc -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.h - - -
- - Example - -**Code** - -``` - -workspace.ResetWorkspace() - -op = core.CreateOperator( - "Copy", - ["input"], - ["output"] -) - -workspace.FeedBlob("input", np.random.rand(3,3)) -print("input:", workspace.FetchBlob("input")) -workspace.RunOperatorOnce(op) -print("output:", workspace.FetchBlob("output")) - -``` - -**Result** - -``` - -input: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] -output: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] - -``` - -
- -)DOC") - .Input(0, "input", "(*Tensor*): input tensor to copy") - .Output(0, "output", "(*Tensor*): copy of input tensor"); - -OPERATOR_SCHEMA(CopyGPUToCPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyGPUToCPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cuda_option); - vector out_dev(def.output_size(), cpu_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for GPU to CPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyCPUToGPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyCPUToGPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), cuda_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for CPU to GPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyFromCPUInput) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - auto op_device = - def.has_device_option() ? def.device_option() : DeviceOption(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), op_device); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Take a CPU input tensor and copy it to an output in the current -Context (GPU or CPU). This may involves cross-device MemCpy. 
-)DOC") - .Input(0, "input", "The input CPU tensor.") - .Output(0, "output", "either a TensorCUDA or a TensorCPU"); - -OPERATOR_SCHEMA(CopyOnDeviceLike) - .NumInputs(2) - .NumOutputs(1) - .SetDoc("Copy input tensor into output to the specific device.") - .Input(0, "input", "The input tensor.") - .Input(1, "dst", "Tensor, on which device the copy will be performed.") - .Output(0, "output", "Tensor that will contain a copy of the input."); OPERATOR_SCHEMA(HasElements) .NumInputs(1) @@ -937,62 +802,6 @@ struct GetFlattenToVecGradient : public GradientMakerBase { }; REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient); -struct GetCopyGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "CopyOnDeviceLike", - "", - vector{GO(0), I(0)}, - vector{GI(0)}); - } -}; -REGISTER_GRADIENT(Copy, GetCopyGradient); - -struct GetGPUToCPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); - -struct GetCPUToGPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); - SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds); SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths); SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges); diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index e771c4ee36e1cc..8272bbcad3e55c 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -36,28 +36,6 @@ bool SumOp::RunOnDevice() { return false; } -template <> -class CopyOnDeviceLikeOp - : public Operator { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_FUNCTIONS(CUDAContext); - - bool RunOnDevice() override { - auto& input = Input(0); - auto* output = OperatorBase::Output(0, CUDA); - CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); - output->ResizeLike(input); - context.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - REGISTER_CUDA_OPERATOR(Print, PrintOp); REGISTER_CUDA_OPERATOR(Flatten, FlattenOp); REGISTER_CUDA_OPERATOR(FlattenToVec, FlattenToVecOp); @@ -66,27 +44,6 @@ REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp); REGISTER_CUDA_OPERATOR(Sum, SumOp); REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp); -// From CPU, copy it to whatever the current context -REGISTER_CUDA_OPERATOR( - CopyFromCPUInput, - CopyOp); - -// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, -// since gpu code will be involved. 
-REGISTER_CUDA_OPERATOR( - CopyGPUToCPU, - CopyOp); -REGISTER_CUDA_OPERATOR( - CopyCPUToGPU, - CopyOp); -// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe -// involving different GPUs. -REGISTER_CUDA_OPERATOR(Copy, CopyOp); - -REGISTER_CUDA_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); - REGISTER_CUDA_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp); CAFFE_KNOWN_TYPE(const float*); diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index c661ec2f5c55ec..9a615f53e25394 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -698,33 +698,6 @@ class ScatterAssignOp : public Operator { INPUT_TAGS(DATA, INDICES, SLICES); }; -template -class CopyOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_SIMPLE_CTOR_DTOR(CopyOp); - - bool RunOnDevice() override { - auto& input = this->template Input(0, SrcContext::GetDeviceType()); - auto* output = - this->template Output(0, DstContext::GetDeviceType()); - output->ResizeLike(input); - this->context_.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - -template -class CopyOnDeviceLikeOp : public CopyOp { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : CopyOp(operator_def, ws) {} -}; - template class LengthsToSegmentIdsOp : public Operator { public: From 130d55a5f4470690885668bbe1df10a2183638ac Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 16:40:30 -0700 Subject: [PATCH 098/237] Allow building the C++ API without cereal (#11498) Summary: I am working on unifying the C++ extensions and C++ API, and one constraint for this is that we will want to be able to build the C++ API without cereal, since we won't want to ship it with the Python `torch` package. For this I introduce a `TORCH_WITH_CEREAL` option to CMake. If on, the C++ API will be built with cereal and thus serialization support. If off, serialization functions will throw exceptions, but the library will otherwise still compile the same. __This option is on by default, so for regular C++ API users nothing will change__. However, from C++ extensions, we'll be able to turn it off. This effectively means we won't be searching for any cereal headers from C++ API headers, which wouldn't be installed in the Python package. ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11498 Differential Revision: D9784803 Pulled By: goldsborough fbshipit-source-id: 5d0a1f2501993012d28cf3d730f45932b483abc4 --- CMakeLists.txt | 1 + cmake/Summary.cmake | 1 + tools/build_libtorch.py | 13 +++-- tools/build_pytorch_libs.sh | 5 ++ torch/CMakeLists.txt | 24 +++++--- torch/csrc/api/include/torch/optim/adagrad.h | 15 ++++- torch/csrc/api/include/torch/optim/adam.h | 16 +++-- torch/csrc/api/include/torch/optim/lbfgs.h | 9 +-- torch/csrc/api/include/torch/optim/rmsprop.h | 16 +++-- torch/csrc/api/include/torch/optim/sgd.h | 15 ++++- torch/csrc/api/include/torch/serialization.h | 61 +++++++++++--------- 11 files changed, 116 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60d69ef2d9bc68..827121b1fc5931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,6 +124,7 @@ cmake_dependent_option( cmake_dependent_option( USE_GLOO_IBVERBS "Use Gloo IB verbs for distributed. Only available if USE_GLOO is on." 
OFF "USE_GLOO" OFF) +option(TORCH_USE_CEREAL "Build the C++ API with Cereal for serialization support" OFF) # Used when building Caffe2 through setup.py option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 16d18ac7634d0d..a314e4d348b708 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -125,6 +125,7 @@ function (caffe2_print_configuration_summary) message(STATUS " USE_GLOO : ${USE_GLOO}") message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}") endif() + message(STATUS " TORCH_USE_CEREAL : ${TORCH_USE_CEREAL}") message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index 8428282ab7b9f8..df29875666063e 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -9,7 +9,8 @@ if __name__ == '__main__': # Placeholder for future interface. For now just gives a nice -h. parser = argparse.ArgumentParser(description='Build libtorch') - args = parser.parse_args() + parser.add_argument('--use-cereal', action='store_true') + options = parser.parse_args() os.environ['BUILD_TORCH'] = 'ON' os.environ['BUILD_TEST'] = 'ON' @@ -19,11 +20,13 @@ tools_path = os.path.dirname(os.path.abspath(__file__)) build_pytorch_libs = os.path.join(tools_path, 'build_pytorch_libs.sh') - command = '{} --use-nnpack '.format(build_pytorch_libs) + command = [build_pytorch_libs, '--use-nnpack'] if USE_CUDA: - command += '--use-cuda ' - command += 'caffe2' + command.append('--use-cuda') + if options.use_cereal: + command.append('--use-cereal') + command.append('caffe2') sys.stdout.flush() sys.stderr.flush() - subprocess.check_call(shlex.split(command), universal_newlines=True) + subprocess.check_call(command, universal_newlines=True) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 66c8e6d65cf0b0..79dfc948ea07b2 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -22,6 +22,7 @@ USE_NNPACK=0 USE_MKLDNN=0 USE_GLOO_IBVERBS=0 CAFFE2_STATIC_LINK_CUDA=0 +TORCH_USE_CEREAL=0 RERUN_CMAKE=1 while [[ $# -gt 0 ]]; do case "$1" in @@ -46,6 +47,9 @@ while [[ $# -gt 0 ]]; do --cuda-static-link) CAFFE2_STATIC_LINK_CUDA=1 ;; + --use-cereal) + TORCH_USE_CEREAL=1 + ;; *) break ;; @@ -190,6 +194,7 @@ function build() { -DTHCUNN_SO_VERSION=1 \ -DTHD_SO_VERSION=1 \ -DUSE_CUDA=$USE_CUDA \ + -DTORCH_USE_CEREAL=$TORCH_USE_CEREAL \ -DBUILD_EXAMPLES=OFF \ -DBUILD_TEST=$BUILD_TEST \ -DNO_NNPACK=$((1-$USE_NNPACK)) \ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index b0f07a7ef3a73b..427270d2a89fa1 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -211,7 +211,6 @@ if (NOT NO_API AND NOT USE_ROCM) ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp ) - endif() @@ -302,6 +301,13 @@ if (NOT NO_API AND NOT USE_ROCM) target_include_directories(torch PUBLIC ${TORCH_SRC_DIR}/csrc/api ${TORCH_SRC_DIR}/csrc/api/include) + + if (TORCH_USE_CEREAL) + target_compile_definitions(torch PUBLIC TORCH_USE_CEREAL) + # SYSTEM headers are included with -isystem and thus do not trigger warnings. 
+ target_include_directories(torch SYSTEM PUBLIC + "${TORCH_ROOT}/third_party/cereal/include") # For cereal/ + endif() endif() if(USE_CUDA) @@ -369,10 +375,6 @@ target_include_directories(torch PRIVATE ${ATen_CPU_INCLUDE}) target_include_directories(torch PUBLIC ${TORCH_SRC_DIR}/csrc) -# SYSTEM headers are included with -isystem and thus do not trigger warnings. -target_include_directories(torch SYSTEM PUBLIC - "${TORCH_ROOT}/third_party/cereal/include") # For cereal/ - set_target_properties(torch PROPERTIES VERSION 1 SOVERSION 1) if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") @@ -407,7 +409,7 @@ endif() if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") - add_executable(test_api + set(TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/any.cpp ${TORCH_API_TEST_DIR}/cursor.cpp ${TORCH_API_TEST_DIR}/integration.cpp @@ -419,15 +421,19 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) ${TORCH_API_TEST_DIR}/parallel.cpp ${TORCH_API_TEST_DIR}/rnn.cpp ${TORCH_API_TEST_DIR}/sequential.cpp - ${TORCH_API_TEST_DIR}/serialization.cpp ${TORCH_API_TEST_DIR}/static.cpp ${TORCH_API_TEST_DIR}/tensor_cuda.cpp ${TORCH_API_TEST_DIR}/tensor.cpp ${TORCH_API_TEST_DIR}/jit.cpp - # Temporary until ATen tests are built with Caffe2 ${TORCH_API_TEST_DIR}/tensor_options.cpp ${TORCH_API_TEST_DIR}/tensor_options_cuda.cpp - ) + ) + + if (TORCH_USE_CEREAL) + list(APPEND TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/serialization.cpp) + endif() + + add_executable(test_api ${TORCH_API_TEST_SOURCES}) target_include_directories(test_api PUBLIC diff --git a/torch/csrc/api/include/torch/optim/adagrad.h b/torch/csrc/api/include/torch/optim/adagrad.h index ced6a68a83f53b..203ce24a33cb14 100644 --- a/torch/csrc/api/include/torch/optim/adagrad.h +++ b/torch/csrc/api/include/torch/optim/adagrad.h @@ -2,13 +2,11 @@ #include #include +#include #include #include -#include -#include - #include #include @@ -37,12 +35,16 @@ class Adagrad : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(sum_)); ar(CEREAL_NVP(step_)); +#endif // defined(TORCH_USE_CEREAL) } private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) Adagrad() : options(0) {} std::vector sum_; @@ -50,3 +52,10 @@ class Adagrad : public Optimizer { }; } // namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::Adagrad); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::Adagrad); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/optim/adam.h b/torch/csrc/api/include/torch/optim/adam.h index 87d00e6a6676a7..514715b6b0c181 100644 --- a/torch/csrc/api/include/torch/optim/adam.h +++ b/torch/csrc/api/include/torch/optim/adam.h @@ -3,12 +3,10 @@ #include #include #include +#include #include -#include -#include - #include #include @@ -36,16 +34,20 @@ class Adam : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(step_buffers_), CEREAL_NVP(exp_average_buffers_), CEREAL_NVP(exp_average_sq_buffers_), CEREAL_NVP(max_exp_average_sq_buffers_)); +#endif // defined(TORCH_USE_CEREAL) } AdamOptions options; private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) Adam() : options(0) {} std::vector step_buffers_; @@ -53,6 +55,12 @@ class Adam : public Optimizer { std::vector exp_average_sq_buffers_; std::vector max_exp_average_sq_buffers_; }; - } 
// namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::Adam); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::Adam); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/optim/lbfgs.h b/torch/csrc/api/include/torch/optim/lbfgs.h index 9ea8065f000872..eec8c8fdc415d0 100644 --- a/torch/csrc/api/include/torch/optim/lbfgs.h +++ b/torch/csrc/api/include/torch/optim/lbfgs.h @@ -2,12 +2,10 @@ #include #include +#include #include -#include -#include - #include #include #include @@ -41,6 +39,7 @@ class LBFGS : public LossClosureOptimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(d)); ar(CEREAL_NVP(t)); ar(CEREAL_NVP(H_diag)); @@ -48,10 +47,13 @@ class LBFGS : public LossClosureOptimizer { ar(CEREAL_NVP(prev_loss)); ar(CEREAL_NVP(old_dirs)); ar(CEREAL_NVP(old_stps)); +#endif // defined(TORCH_USE_CEREAL) } private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) LBFGS() : options(0) {} Tensor gather_flat_grad(); @@ -69,6 +71,5 @@ class LBFGS : public LossClosureOptimizer { int64_t func_evals{0}; int64_t state_n_iter{0}; }; - } // namespace optim } // namespace torch diff --git a/torch/csrc/api/include/torch/optim/rmsprop.h b/torch/csrc/api/include/torch/optim/rmsprop.h index f7f1ddb96a0971..4a84331f92810c 100644 --- a/torch/csrc/api/include/torch/optim/rmsprop.h +++ b/torch/csrc/api/include/torch/optim/rmsprop.h @@ -2,12 +2,10 @@ #include #include +#include #include -#include -#include - #include #include #include @@ -41,19 +39,29 @@ class RMSprop : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(square_average_buffers_)); ar(CEREAL_NVP(momentum_buffers_)); ar(CEREAL_NVP(grad_average_buffers_)); +#endif // defined(TORCH_USE_CEREAL) } private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) RMSprop() : options(0) {} std::vector square_average_buffers_; std::vector momentum_buffers_; std::vector grad_average_buffers_; }; - } // namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::RMSprop); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::RMSprop); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/optim/sgd.h b/torch/csrc/api/include/torch/optim/sgd.h index 8dc283b8b68da8..345d0343c1ba85 100644 --- a/torch/csrc/api/include/torch/optim/sgd.h +++ b/torch/csrc/api/include/torch/optim/sgd.h @@ -3,13 +3,11 @@ #include #include #include +#include #include #include -#include -#include - #include #include #include @@ -37,13 +35,17 @@ class SGD : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(momentum_buffers_)); +#endif // defined(TORCH_USE_CEREAL) } SGDOptions options; private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) SGD() : options(0) {} std::vector momentum_buffers_; @@ -52,3 +54,10 @@ class SGD : public Optimizer { }; } // namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::SGD); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::SGD); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/serialization.h b/torch/csrc/api/include/torch/serialization.h index 
1f6be330090d31..a85c027ce5917a 100644 --- a/torch/csrc/api/include/torch/serialization.h +++ b/torch/csrc/api/include/torch/serialization.h @@ -2,45 +2,69 @@ #include -#include #include #include +#if defined(TORCH_USE_CEREAL) +#include +#include +#include + #include "cereal/archives/binary.hpp" -#include "cereal/types/polymorphic.hpp" #include "cereal/types/string.hpp" #include "cereal/types/unordered_map.hpp" #include "cereal/types/vector.hpp" +#endif // defined(TORCH_USE_CEREAL) namespace torch { - // Some convenience functions for saving and loading template void save(std::ostream& stream, T const& obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryOutputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void load(std::istream& stream, T& obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryInputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void save(std::ostream& stream, T const* obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryOutputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void load(std::istream& stream, T* obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryInputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void save(std::string const& path, T const& obj) { std::ofstream os(path, std::ios::binary); torch::save(os, obj); } + template void load(std::string const& path, T& obj) { std::ifstream is(path, std::ios::binary); @@ -74,8 +98,7 @@ inline int32_t scalarTypeId(torch::Dtype type) { case torch::Dtype::Undefined: return 8; default: - throw std::runtime_error( - "Unknown scalar type: " + std::to_string(static_cast(type))); + AT_ERROR("Unknown scalar type: ", static_cast(type)); } } @@ -100,7 +123,7 @@ inline torch::Dtype scalarTypeFromId(int32_t id) { case 8: return torch::Dtype::Undefined; default: - throw std::runtime_error("Unknown scalar type id: " + std::to_string(id)); + AT_ERROR("Unknown scalar type id: ", id); } } @@ -117,8 +140,7 @@ inline int32_t backendId(at::Backend backend) { case at::Backend::Undefined: return 4; default: - throw std::runtime_error( - "Unknown backend: " + std::to_string(static_cast(backend))); + AT_ERROR("Unknown backend: ", static_cast(backend)); } } @@ -135,33 +157,15 @@ inline at::Backend backendFromId(int32_t id) { case 4: return at::Backend::Undefined; default: - throw std::runtime_error("Unknown backend id: " + std::to_string(id)); + AT_ERROR("Unknown backend id: ", id); } } } // namespace detail } // namespace torch -// This is super ugly and I don't know how to simplify it -CEREAL_REGISTER_TYPE(torch::optim::SGD); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::SGD); -CEREAL_REGISTER_TYPE(torch::optim::Adagrad); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::Adagrad); -CEREAL_REGISTER_TYPE(torch::optim::RMSprop); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::RMSprop); -CEREAL_REGISTER_TYPE(torch::optim::Adam); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::Adam); - +#if defined(TORCH_USE_CEREAL) namespace cereal { - namespace agimpl { template @@ -269,3 +273,4 @@ void load(Archive& archive, torch::Tensor& tensor) { } } } // namespace cereal +#endif // 
defined(TORCH_USE_CEREAL) From 5b2efcf4258a7a80fcfd59e2aa54306c4da40568 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 16:41:58 -0700 Subject: [PATCH 099/237] Document the Conv module (#11566) Summary: Document the C++ API conv module. No code changes. ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11566 Differential Revision: D9793665 Pulled By: goldsborough fbshipit-source-id: 5f7f0605f952fadc62ffbcb8eca4183d4142c451 --- .../csrc/api/include/torch/nn/modules/conv.h | 81 ++++++++++++++++++- torch/csrc/api/include/torch/nn/modules/rnn.h | 12 +-- torch/csrc/api/src/nn/modules/conv.cpp | 3 +- 3 files changed, 88 insertions(+), 8 deletions(-) diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 3f562eab1c5a52..c6e6a2392dbe36 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -10,6 +10,8 @@ namespace torch { namespace nn { + +/// Options for a `D`-dimensional convolution module. template struct ConvOptions { ConvOptions( @@ -17,18 +19,59 @@ struct ConvOptions { int64_t output_channels, ExpandingArray kernel_size); + /// The number of channels the input volumes will have. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(int64_t, input_channels); + + /// The number of output channels the convolution should produce. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(int64_t, output_channels); + + /// The kernel size to use. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, kernel_size); + + /// The stride of the convolution. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, stride) = 1; + + /// The padding to add to the input volumes. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, padding) = 0; + + /// The kernel dilation. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, dilation) = 1; + + /// For transpose convolutions, the padding to add to output volumes. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, output_padding) = 0; + + /// If true, convolutions will be transpose convolutions (a.k.a. + /// deconvolutions). + /// Changing this parameter after construction __has no effect__. TORCH_ARG(bool, transposed) = false; + + /// Whether to add a bias after individual applications of the kernel. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(bool, with_bias) = true; + + /// The number of convolution groups. + /// This parameter __can__ be changed after construction. TORCH_ARG(int64_t, groups) = 1; }; +/// Base class for all (dimension-specialized) convolution modules. 
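A minimal usage sketch of the options documented above, not part of the patch itself: Conv2dOptions takes the three required arguments, and the remaining parameters are set through the chained setters that TORCH_ARG generates. The channel counts, kernel size, and input shape below are made-up values for illustration.

    // Illustrative only: a 3->16 channel, 3x3 convolution with stride 2,
    // padding 1, and no bias, using the options documented in this header.
    #include <torch/torch.h>

    int main() {
      torch::nn::Conv2d conv(
          torch::nn::Conv2dOptions(3, 16, 3).stride(2).padding(1).with_bias(false));
      torch::Tensor input = torch::randn({8, 3, 64, 64});
      torch::Tensor output = conv->forward(input);
      // With a 3x3 kernel, stride 2, and padding 1, each 64x64 plane maps to 32x32.
      return 0;
    }

The same builder pattern applies to Conv1dOptions and Conv3dOptions, where the ExpandingArray parameters accept either a single number or a list of 1 or 3 numbers.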
template class ConvImpl : public torch::nn::Cloneable { public: @@ -36,44 +79,80 @@ class ConvImpl : public torch::nn::Cloneable { int64_t input_channels, int64_t output_channels, ExpandingArray kernel_size) - : ConvImpl(ConvOptions(input_channels, output_channels, kernel_size)) {} + : ConvImpl(ConvOptions(input_channels, output_channels, kernel_size)) { + } explicit ConvImpl(ConvOptions options); void reset() override; + /// The options with which this `Module` was constructed. ConvOptions options; + + /// The learned kernel (or "weight"). Tensor weight; + + /// The learned bias. Only defined if the `with_bias` option was true. Tensor bias; }; // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Conv1d ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// Applies convolution over a 1-D input. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv1d to learn about +/// the exact behavior of this module. class Conv1dImpl : public ConvImpl<1, Conv1dImpl> { public: using ConvImpl<1, Conv1dImpl>::ConvImpl; Tensor forward(Tensor input); }; + +/// `ConvOptions` specialized for 1-D convolution. using Conv1dOptions = ConvOptions<1>; + +/// A `ModuleHolder` subclass for `Conv1dImpl`. +/// See the documentation for `Conv1dImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Conv1d); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Conv2d ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// Applies convolution over a 2-D input. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv2d to learn about +/// the exact behavior of this module. class Conv2dImpl : public ConvImpl<2, Conv2dImpl> { public: using ConvImpl<2, Conv2dImpl>::ConvImpl; Tensor forward(Tensor input); }; + +/// `ConvOptions` specialized for 2-D convolution. using Conv2dOptions = ConvOptions<2>; + +/// A `ModuleHolder` subclass for `Conv2dImpl`. +/// See the documentation for `Conv2dImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Conv2d); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Conv3d ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// Applies convolution over a 3-D input. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv3d to learn about +/// the exact behavior of this module. class Conv3dImpl : public ConvImpl<3, Conv3dImpl> { public: using ConvImpl<3, Conv3dImpl>::ConvImpl; Tensor forward(Tensor input); }; + +/// `ConvOptions` specialized for 3-D convolution. using Conv3dOptions = ConvOptions<3>; + +/// A `ModuleHolder` subclass for `Conv3dImpl`. +/// See the documentation for `Conv3dImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Conv3d); } // namespace nn diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h index 326e9e267a9f32..e287b79cacd18a 100644 --- a/torch/csrc/api/include/torch/nn/modules/rnn.h +++ b/torch/csrc/api/include/torch/nn/modules/rnn.h @@ -170,8 +170,8 @@ struct RNNOptions { }; /// A multi-layer Elman RNN module with Tanh or ReLU activation. -/// See https://pytorch.org/docs/master/nn.html#torch.nn.RNN for more -/// documenation. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.RNN to learn about the +/// exact behavior of this module. 
class RNNImpl : public detail::RNNImplBase { public: RNNImpl(int64_t input_size, int64_t hidden_size) @@ -198,8 +198,8 @@ TORCH_MODULE(RNN); using LSTMOptions = detail::RNNOptionsBase; /// A multi-layer long-short-term-memory (LSTM) module. -/// See https://pytorch.org/docs/master/nn.html#torch.nn.LSTM for more -/// documenation. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.LSTM to learn about the +/// exact behavior of this module. class LSTMImpl : public detail::RNNImplBase { public: LSTMImpl(int64_t input_size, int64_t hidden_size) @@ -224,8 +224,8 @@ TORCH_MODULE(LSTM); using GRUOptions = detail::RNNOptionsBase; /// A multi-layer gated recurrent unit (GRU) module. -/// See https://pytorch.org/docs/master/nn.html#torch.nn.GRU for more -/// documenation. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.GRU to learn about the +/// exact behavior of this module. class GRUImpl : public detail::RNNImplBase { public: GRUImpl(int64_t input_size, int64_t hidden_size) diff --git a/torch/csrc/api/src/nn/modules/conv.cpp b/torch/csrc/api/src/nn/modules/conv.cpp index 06ed1d10ea3170..30a25714de4277 100644 --- a/torch/csrc/api/src/nn/modules/conv.cpp +++ b/torch/csrc/api/src/nn/modules/conv.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace torch { @@ -61,7 +62,7 @@ void ConvImpl::reset() { options.input_channels_, std::multiplies{}); const auto stdv = 1.0 / std::sqrt(number_of_features); - NoGradGuard no_grad;; + NoGradGuard no_grad; for (auto& p : this->parameters()) { p->uniform_(-stdv, stdv); } From def44c96fd7bda4cc4a9574e6f8466eea6606fb4 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 16:46:27 -0700 Subject: [PATCH 100/237] Revert D9779866: [pytorch][PR] Move function deletion from the stack to the heap. Differential Revision: D9779866 Original commit changeset: 96753eead790 fbshipit-source-id: 959deeb63318d48f4c563e10e70ef6ec7fabd3b4 --- torch/csrc/autograd/function.cpp | 122 +++++++++++++++++++++---------- torch/csrc/autograd/function.h | 2 + torch/csrc/autograd/init.cpp | 5 ++ 3 files changed, 89 insertions(+), 40 deletions(-) diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index 6090b520bd36ab..ba33a6f116bac2 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -36,53 +36,95 @@ AnomalyMetadata* Function::metadata() noexcept { return anomaly_metadata_.get(); } -static void gatherFunctions(Function* func, - std::vector>& stack) { - for (auto& edge : func->next_edges()) { - if (edge.function.use_count() == 1) { - stack.emplace_back(std::move(edge.function)); - } +/* + * Fix for #5534: prevent stack overflow on deletion of deep computation graph + * + * Sometimes one can end up with a very big computation graph of Functions + * and Edges. Each std::shared_ptr contains a list of Edge, and + * each Edge contains a std::shared_ptr. Deleting a + * std::shared_ptr can trigger the recursive deletion of other + * std::shared_ptr's: this can stack overflow if the graph + * is deep enough. Here is an example of such a graph: + * + * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr + * + * The solution here is to use a custom deleter with each + * std::shared_ptr. The custom deleter keeps track of how many + * nested deleters it is in. When this number exceeds the maximum allowed + * depth, the Function* to be deleted are accumulated in a per-thread + * delete queue and handled by one of the deleters. 
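To make the scheme described in this comment concrete, here is a minimal standalone sketch of the same idea, not the reverted PyTorch code itself: a thread-local depth counter bounds how deep the custom deleter may recurse, and nodes past the limit are parked in a per-thread queue that is drained iteratively. The names Node, delete_node, and kMaxDepth are illustrative, the depth limit is deliberately tiny, and unlike the code below the drain happens in the outermost frame rather than at the maximum depth.

    #include <deque>
    #include <memory>

    struct Node;
    void delete_node(Node* n);

    // Each Node owns its successor through a shared_ptr that carries the
    // custom deleter, mirroring the Function -> Edge -> Function chains.
    struct Node {
      std::shared_ptr<Node> next;
    };

    thread_local std::deque<Node*> delete_queue;
    thread_local size_t delete_depth = 0;
    constexpr size_t kMaxDepth = 8;  // deliberately tiny for illustration

    void delete_node(Node* n) {
      ++delete_depth;
      if (delete_depth > kMaxDepth) {
        // Too deep: defer this node instead of recursing further.
        delete_queue.push_back(n);
        --delete_depth;
        return;
      }
      delete n;  // may re-enter delete_node through n->next
      if (delete_depth == 1) {
        // The outermost frame drains whatever the nested deleters deferred;
        // each pop can refill the queue, but recursion stays bounded.
        while (!delete_queue.empty()) {
          Node* deferred = delete_queue.front();
          delete_queue.pop_front();
          delete deferred;
        }
      }
      --delete_depth;
    }

    int main() {
      // Build a chain far deeper than any safe native recursion, then tear it
      // down; with a default deleter this teardown would overflow the stack.
      std::shared_ptr<Node> head(new Node(), delete_node);
      for (int i = 0; i < 200000; ++i) {
        std::shared_ptr<Node> fresh(new Node(), delete_node);
        fresh->next = head;
        head = fresh;
      }
      head.reset();
      return 0;
    }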
+ * + * Note that these custom deleters are NOT necessary for deleting PyFunction. + * This is because a THPFunction Python object owns a PyFunction that is in a + * computation graph. When Python objects get recursively destroyed, they + * are also queued into a delete list. This happens very early for them + * (at 50 deleters): https://github.com/python/cpython/blob/f320be77ffb73e3b9e7fc98c37b8df3975d84b40/Include/object.h#L1024-L1063 + * so we don't need to worry about them. + */ + +thread_local std::deque deleteFunctionQueue; +thread_local size_t deleteFunctionRecursionDepth = 0; + +/* + * If this number is set too high, a deep computation graph can still + * stack overflow. The procedure for setting this number was to + * 1) find the smallest value that would not guard against stack overflows + * on various machines + * 2) Take the minimum of all such values and subtract some leeway because + * the memory of these stack frames will probably grow as time passes. + * Testing on a few machines machines, the magic numbers were: + * - Mac OSX (Macbook Pro 15) : ~60000 + * - A beefy Ubuntu 16.04 box : ~15000 + * - Windows AWS instance (g3.4xlarge): variable. My two attempts at different + * times have gotten the following numbers: ~8300, 3669 + */ +#ifdef _WIN32 +size_t deleteFunctionMaxRecursionDepth = 3000; +#else +size_t deleteFunctionMaxRecursionDepth = 10000; +#endif + +struct RecursionDepthCounter { + public: + explicit RecursionDepthCounter() { + ++deleteFunctionRecursionDepth; + } + ~RecursionDepthCounter() { + --deleteFunctionRecursionDepth; } -} + + size_t value() { + return deleteFunctionRecursionDepth; + } +}; /* - * Fix for #5534: prevent stack overflow on deletion of deep computation graph - * - * Sometimes one can end up with a very big computation graph of Functions - * and Edges. Each std::shared_ptr contains a list of Edge, and - * each Edge contains a std::shared_ptr. Deleting a - * std::shared_ptr can trigger the recursive deletion of other - * std::shared_ptr's: this can stack overflow if the graph - * is deep enough. Here is an example of such a graph: - * - * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr - * - * The solution here is to detect when we are decrementing away the last - * reference to a Function, and when doing so to buffer up the Function's - * that will be recursively decremented. We can then decrement (and free) - * the original Function without causing a recursive cascade, before - * draining the buffer applying the same behavior. This is, in effect, - * converting recursion to a loop, using a heap buffer in place of the - * recursive call stack. - */ + * Note that the custom deleter deletes in BFS style. Without using + * the custom deleter, the computation graph is deleted in a DFS style. + * The BFS deletion is valid (and safe) because if a shared_ptr + * 's reference count hits 0, nothing else will access it. + */ void deleteFunction(Function* function) { - // To avoid stack overflow on large computational graphs, - // we need to track reference decrementing and freeing - // on the heap. - std::vector> stack; - gatherFunctions(function, stack); - delete function; + RecursionDepthCounter recursion_depth; - while (!stack.empty()) { - auto& curr_func = stack.back(); + if (recursion_depth.value() > deleteFunctionMaxRecursionDepth) { + deleteFunctionQueue.push_back(function); + return; + } - if (curr_func.use_count() == 1) { - // If this is the last reference, gather function references - // that will be recursively decremented. 
- gatherFunctions(curr_func.get(), stack); - } + delete function; - stack.pop_back(); + if (deleteFunctionQueue.empty()) { + return; + } + if (recursion_depth.value() != deleteFunctionMaxRecursionDepth) { + AT_ERROR("Only one deleter per thread should be able to process " + "the delete queue. Please open an issue."); + } + while (!deleteFunctionQueue.empty()) { + auto queued_function = deleteFunctionQueue.front(); + deleteFunctionQueue.pop_front(); + delete queued_function; } } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b4c90b1489a261..dfe8683197f8ee 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -33,6 +33,8 @@ using edge_list = std::vector; using saved_variable_list = std::vector; using IndexRange = std::pair; +TORCH_API extern size_t deleteFunctionMaxRecursionDepth; + // Custom deleter to prevent stack overflows. void deleteFunction(Function* function); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 0be1cc2600a2f0..b306426025e025 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -50,6 +50,11 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) }); m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); + /// TODO: Replace this ASAP with a better solution for deep autograd graphs! + m.def("_unsafe_set_delete_function_max_recursion_depth", [](size_t value) { + torch::autograd::deleteFunctionMaxRecursionDepth = value; + }); + Py_RETURN_TRUE; } From 776a9992e15b896627b5703646e579339117280d Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Wed, 12 Sep 2018 16:47:00 -0700 Subject: [PATCH 101/237] topk test fix, hgemm integration (#11593) Summary: After discussions in #11584 , new PR for just the test skip and hgemm integration. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11593 Differential Revision: D9798527 Pulled By: ezyang fbshipit-source-id: e2ef5609676571caef2f8e6844909fe3a11d8b3e --- aten/src/THC/THCBlas.cu | 8 ++++++++ test/test_torch.py | 1 + 2 files changed, 9 insertions(+) diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index bb9f7b92bd691a..20b13d82b9a152 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -285,6 +285,13 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); +#ifdef __HIP_PLATFORM_HCC__ + THCublasCheck(rocblas_hgemm(handle, opa, opb, i_m, i_n, i_k, + reinterpret_cast(&alpha), reinterpret_cast(a), i_lda, + reinterpret_cast(b), i_ldb, reinterpret_cast(&beta), + reinterpret_cast(c), i_ldc)); +#else + // Simulated Hgemm float fAlpha = THC_half2float(alpha); float fBeta = THC_half2float(beta); @@ -314,6 +321,7 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 a, CUDA_R_16F, i_lda, b, CUDA_R_16F, i_ldb, &fBeta, c, CUDA_R_16F, i_ldc)); } +#endif #endif return; } diff --git a/test/test_torch.py b/test/test_torch.py index ea0d69d7b0e265..dce509cb7c5b0e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3513,6 +3513,7 @@ def test_topk_arguments(self): self.assertRaises(TypeError, lambda: q.topk(4, True)) @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + @skipIfRocm def test_topk_noncontiguous_gpu(self): t = torch.randn(20, device="cuda")[::2] top1, idx1 = t.topk(5) From 7f7cda99cd83c0356d7e2cd919a505f4a8e3ddfd Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Wed, 12 Sep 2018 16:53:16 -0700 Subject: [PATCH 102/237] Optimize order_swich_ops on GPU (#11404) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11404 Optimize order_swich_ops on GPU Reviewed By: houseroad Differential Revision: D9728642 fbshipit-source-id: 74ff62268856fb1613fa61eb214bed6ec6716632 --- caffe2/operators/order_switch_ops.cu | 130 ++++++++------ caffe2/operators/order_switch_ops_cudnn.cc | 160 ++++++++++++++++++ .../python/operator_test/order_switch_test.py | 19 ++- 3 files changed, 249 insertions(+), 60 deletions(-) create mode 100644 caffe2/operators/order_switch_ops_cudnn.cc diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu index 27a71a69141098..c213b7cc848d0d 100644 --- a/caffe2/operators/order_switch_ops.cu +++ b/caffe2/operators/order_switch_ops.cu @@ -1,91 +1,115 @@ #include "caffe2/operators/order_switch_ops.h" + #include "caffe2/core/context_gpu.h" +#include "caffe2/utils/fixed_divisor.h" namespace caffe2 { -__global__ void NHWC2NCHWKernel( - const int N, - const int HW, - const int C, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * HW * C) { - const int c = i % C; - const int hw = i / C % HW; - const int n = i / C / HW; - Y[(n * C + c) * HW + hw] = X[i]; +template +__global__ void NHWC2NCHWCUDAKernel( + const int size, + const FixedDivisor C, + const FixedDivisor HxW, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + HxW.DivMod(i, &c, &hxw); + C.DivMod(c, &n, &c); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * HxW.d() + hxw) * C.d() + c); +#else + Y[i] = X[(n * HxW.d() + hxw) * C.d() + c]; +#endif } } -__global__ void NCHW2NHWCKernel( - const int N, - const int C, - const int HW, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * C * HW) { 
- const int hw = i % HW; - const int c = i / HW % C; - const int n = i / C / HW; - Y[(n * HW + hw) * C + c] = X[i]; +template +__global__ void NCHW2NHWCCUDAKernel( + const int size, + const FixedDivisor C, + const FixedDivisor HxW, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + C.DivMod(i, &hxw, &c); + HxW.DivMod(hxw, &n, &hxw); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * C.d() + c) * HxW.d() + hxw); +#else + Y[i] = X[(n * C.d() + c) * HxW.d() + hxw]; +#endif } } template <> bool NHWC2NCHWOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(ndim, 3); - const int N = X.dim32(0), C = X.dim32(ndim - 1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(ndim, 3); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); vector Y_dims(ndim); Y_dims[0] = N; Y_dims[1] = C; - size_t image_size = 1; - for (auto i = 2; i < ndim; ++i) { + int HxW = 1; + for (int i = 2; i < ndim; ++i) { Y_dims[i] = X.dim32(i - 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y->Resize(Y_dims); - - NHWC2NCHWKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, image_size, C, X.data(), Y->template mutable_data()); + const int size = X.size(); + NHWC2NCHWCUDAKernel + <<>>( + size, + FixedDivisor(C), + FixedDivisor(HxW), + X.data(), + Y->template mutable_data()); return true; } template <> bool NCHW2NHWCOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(X.ndim(), 3); - const int N = X.dim32(0), C = X.dim32(1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(X.ndim(), 3); + const int N = X.dim32(0); + const int C = X.dim32(1); vector Y_dims(ndim); Y_dims[0] = N; - size_t image_size = 1; + int HxW = 1; for (auto i = 1; i < ndim - 1; ++i) { Y_dims[i] = X.dim32(i + 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y_dims[ndim - 1] = C; Y->Resize(Y_dims); - - NCHW2NHWCKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, C, image_size, X.data(), Y->template mutable_data()); + const int size = X.size(); + NCHW2NHWCCUDAKernel + <<>>( + size, + FixedDivisor(C), + FixedDivisor(HxW), + X.data(), + Y->template mutable_data()); return true; } - REGISTER_CUDA_OPERATOR(NHWC2NCHW, NHWC2NCHWOp); REGISTER_CUDA_OPERATOR(NCHW2NHWC, NCHW2NHWCOp); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/order_switch_ops_cudnn.cc b/caffe2/operators/order_switch_ops_cudnn.cc new file mode 100644 index 00000000000000..4cb0034e7ee60d --- /dev/null +++ b/caffe2/operators/order_switch_ops_cudnn.cc @@ -0,0 +1,160 @@ +#include "caffe2/operators/order_switch_ops.h" + +#include +#include +#include + +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/cudnn_wrappers.h" +#include "caffe2/core/types.h" + +namespace caffe2 { + +namespace { + +class CuDNNOrderSwithOpBase : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + + CuDNNOrderSwithOpBase(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), cudnn_wrapper_(&context_) { + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_)); + } + + virtual ~CuDNNOrderSwithOpBase() { + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_)); + } + + protected: + void SetTensorDescriptor( + const cudnnDataType_t data_type, + const 
StorageOrder order, + const std::vector& data_dims, + cudnnTensorDescriptor_t data_desc) const { + const int ndim = data_dims.size(); + const int N = data_dims[0]; + const int C = order == StorageOrder::NCHW ? data_dims[1] : data_dims.back(); + if (ndim == 3) { + const int H = 1; + const int W = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else if (ndim == 4) { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + const auto l_iter = order == StorageOrder::NCHW ? data_dims.cbegin() + 4 + : data_dims.cbegin() + 3; + const auto r_iter = + order == StorageOrder::NCHW ? data_dims.cend() : data_dims.cend() - 1; + const int D = std::accumulate(l_iter, r_iter, 1, std::multiplies()); + const std::array dims = {N, C, H, W, D}; + const std::array strides = order == StorageOrder::NCHW + ? std::array{C * H * W * D, H * W * D, W * D, D, 1} + : std::array{C * H * W * D, 1, W * D * C, D * C, C}; + CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( + data_desc, data_type, 5, dims.data(), strides.data())); + } + } + + CuDNNWrapper cudnn_wrapper_; + cudnnTensorDescriptor_t X_desc_; + cudnnTensorDescriptor_t Y_desc_; + + std::vector cached_X_dims_; +}; + +class CuDNNNHWC2NCHWOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNHWC2NCHWOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[1] = C; + std::copy(X_dims.cbegin() + 1, X_dims.cend() - 1, Y_dims.begin() + 2); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NCHW, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +class CuDNNNCHW2NHWCOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNCHW2NHWCOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[ndim - 1] = C; + std::copy(X_dims.cbegin() + 2, X_dims.cend(), Y_dims.begin() + 1); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + 
cudnnTypeWrapper::type, StorageOrder::NCHW, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +} // namespace + +REGISTER_CUDNN_OPERATOR(NHWC2NCHW, CuDNNNHWC2NCHWOp); +REGISTER_CUDNN_OPERATOR(NCHW2NHWC, CuDNNNCHW2NHWCOp); + +} // namespace caffe2 diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index d54ac26c55fb36..5d3fd0e60f5e08 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,14 +1,17 @@ from __future__ import absolute_import, division, print_function, unicode_literals import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st + from caffe2.python import core from hypothesis import given class OrderSwitchOpsTest(hu.HypothesisTestCase): - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nchw2nhwc(self, X, gc, dc): - op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nchw2nhwc(self, X, engine, gc, dc): + op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], engine=engine) def nchw2nhwc_ref(X): X_reshaped = X.transpose((0,) + tuple(range(2, X.ndim)) + (1,)) @@ -18,12 +21,14 @@ def nchw2nhwc_ref(X): self.assertGradientChecks(gc, op, [X], 0, [0]) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nhwc2nchw(self, X, gc, dc): - op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nhwc2nchw(self, X, engine, gc, dc): + op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], engine=engine) def nhwc2nchw_ref(X): - X_reshaped = X.transpose((0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) + X_reshaped = X.transpose( + (0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) return (X_reshaped,) self.assertReferenceChecks(gc, op, [X], nhwc2nchw_ref) From e2cd627cce1e4cc2c6af8d5911a9d2f18adede7e Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 17:27:33 -0700 Subject: [PATCH 103/237] Temporarily disable docs build. (#11608) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11608 Differential Revision: D9803369 Pulled By: ezyang fbshipit-source-id: a206d6137e8e729f702189c926ec898444d1dc53 --- .jenkins/pytorch/build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 2dc64157c5d00d..3159ee1ad629e6 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -117,8 +117,9 @@ fi if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here - pip install -r requirements.txt || true - LC_ALL=C make html + # TODO: Reenable doc build + #pip install -r requirements.txt || true + #LC_ALL=C make html popd fi From daa379ffd7e684c4db02914918dcb298849ed600 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 17:30:48 -0700 Subject: [PATCH 104/237] Disable flaky test ObserverTest.TestMultipleNetBase (#11596) Summary: Tracked in https://github.com/pytorch/pytorch/issues/9137 Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11596 Differential Revision: D9803256 Pulled By: ezyang fbshipit-source-id: 973393203ed8343a3a0feef36d34e561d9f653c4 --- caffe2/core/observer_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/caffe2/core/observer_test.cc b/caffe2/core/observer_test.cc index f4f4e81a3cb45c..fa8aee6d818366 100644 --- a/caffe2/core/observer_test.cc +++ b/caffe2/core/observer_test.cc @@ -151,6 +151,9 @@ TEST(ObserverTest, TestDAGNetBase) { EXPECT_EQ(1212, count_after - count_before); } +#if 0 +// This test intermittently segfaults, +// see https://github.com/pytorch/pytorch/issues/9137 TEST(ObserverTest, TestMultipleNetBase) { Workspace ws; ws.CreateBlob("in"); @@ -176,4 +179,5 @@ TEST(ObserverTest, TestMultipleNetBase) { EXPECT_EQ(net.get()->NumObservers(), prev_num); } +#endif } // namespace caffe2 From f00f99ebcce966c8b349b2cc5798f389dbead4ad Mon Sep 17 00:00:00 2001 From: Roy Li Date: Wed, 12 Sep 2018 17:35:32 -0700 Subject: [PATCH 105/237] use at::Half in THC (#11322) Summary: - use Half instead of half in THC - clean up TH_float2half, TH_half2float, etc. 
conversions Pull Request resolved: https://github.com/pytorch/pytorch/pull/11322 Differential Revision: D9799553 Pulled By: li-roy fbshipit-source-id: 9aa3e003bff73d9df6224a393f3ec0624b1f44ed --- aten/src/ATen/core/Half-inl.h | 8 + aten/src/ATen/cuda/NumericLimits.cuh | 3 +- aten/src/ATen/function_wrapper.py | 6 - aten/src/ATen/gen.py | 4 +- aten/src/TH/CMakeLists.txt | 1 - aten/src/TH/THDiskFile.cpp | 6 +- aten/src/TH/THGenerateHalfType.h | 4 +- aten/src/TH/THHalf.cpp | 30 -- aten/src/TH/THHalf.h | 10 +- aten/src/TH/THMemoryFile.cpp | 6 +- aten/src/TH/generic/THStorageCopy.cpp | 48 +-- aten/src/TH/generic/THTensorCopy.cpp | 36 +-- aten/src/THC/CMakeLists.txt | 1 - aten/src/THC/THCAtomics.cuh | 18 +- aten/src/THC/THCBlas.cu | 24 +- aten/src/THC/THCBlas.h | 10 +- aten/src/THC/THCGeneral.cpp | 33 +-- aten/src/THC/THCGenerateHalfType.h | 4 +- aten/src/THC/THCHalf.h | 18 -- aten/src/THC/THCNumerics.cuh | 276 ++++-------------- aten/src/THC/THCStorage.cpp | 2 +- aten/src/THC/THCStorage.cu | 2 +- aten/src/THC/THCStorageCopy.cu | 2 +- aten/src/THC/THCStorageCopy.h | 2 +- aten/src/THC/THCTensorCopy.cu | 2 +- aten/src/THC/THCTensorCopy.h | 2 +- aten/src/THC/THCTensorIndex.cu | 2 +- aten/src/THC/THCTensorMathPairwise.cu | 113 +------ aten/src/THC/THCTensorMathPointwise.cuh | 119 +------- aten/src/THC/THCTensorMode.cuh | 20 +- aten/src/THC/THCTensorRandom.cu | 20 +- aten/src/THC/THCTensorTopK.cuh | 12 +- aten/src/THC/THCTensorTypeUtils.cuh | 54 +--- aten/src/THC/generic/THCTensorCopy.cu | 2 +- aten/src/THC/generic/THCTensorMathBlas.cu | 5 +- aten/src/THC/generic/THCTensorMathPairwise.cu | 10 - aten/src/THCUNN/Abs.cu | 2 +- aten/src/THCUNN/AbsCriterion.cu | 2 +- aten/src/THCUNN/BCECriterion.cu | 2 +- aten/src/THCUNN/BatchNormalization.cu | 2 +- aten/src/THCUNN/ClassNLLCriterion.cu | 2 +- aten/src/THCUNN/Col2Im.cu | 2 +- aten/src/THCUNN/DistKLDivCriterion.cu | 2 +- aten/src/THCUNN/ELU.cu | 2 +- aten/src/THCUNN/GatedLinearUnit.cu | 2 +- aten/src/THCUNN/HardTanh.cu | 2 +- aten/src/THCUNN/Im2Col.cu | 2 +- aten/src/THCUNN/IndexLinear.cu | 2 +- aten/src/THCUNN/L1Cost.cu | 2 +- aten/src/THCUNN/LeakyReLU.cu | 2 +- aten/src/THCUNN/LogSigmoid.cu | 2 +- aten/src/THCUNN/LookupTable.cu | 2 +- aten/src/THCUNN/LookupTableBag.cu | 2 +- aten/src/THCUNN/MSECriterion.cu | 2 +- aten/src/THCUNN/MarginCriterion.cu | 2 +- aten/src/THCUNN/MultiLabelMarginCriterion.cu | 2 +- aten/src/THCUNN/MultiMarginCriterion.cu | 2 +- aten/src/THCUNN/PReLU.cu | 8 +- aten/src/THCUNN/RReLU.cu | 6 +- aten/src/THCUNN/Sigmoid.cu | 2 +- aten/src/THCUNN/SmoothL1Criterion.cu | 2 +- aten/src/THCUNN/SoftMarginCriterion.cu | 2 +- aten/src/THCUNN/SoftPlus.cu | 2 +- aten/src/THCUNN/SoftShrink.cu | 2 +- aten/src/THCUNN/SparseLinear.cu | 2 +- .../THCUNN/SpatialAdaptiveAveragePooling.cu | 2 +- aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu | 2 +- aten/src/THCUNN/SpatialAveragePooling.cu | 2 +- aten/src/THCUNN/SpatialClassNLLCriterion.cu | 2 +- aten/src/THCUNN/SpatialConvolutionLocal.cu | 2 +- aten/src/THCUNN/SpatialConvolutionMM.cu | 2 +- aten/src/THCUNN/SpatialCrossMapLRN.cu | 2 +- aten/src/THCUNN/SpatialDilatedConvolution.cu | 2 +- aten/src/THCUNN/SpatialDilatedMaxPooling.cu | 2 +- .../src/THCUNN/SpatialFractionalMaxPooling.cu | 2 +- aten/src/THCUNN/SpatialFullConvolution.cu | 2 +- .../THCUNN/SpatialFullDilatedConvolution.cu | 2 +- aten/src/THCUNN/SpatialReflectionPadding.cu | 2 +- aten/src/THCUNN/SpatialReplicationPadding.cu | 2 +- aten/src/THCUNN/SpatialSubSampling.cu | 2 +- aten/src/THCUNN/SpatialUpSamplingBilinear.cu | 2 +- 
aten/src/THCUNN/SpatialUpSamplingNearest.cu | 2 +- aten/src/THCUNN/Sqrt.cu | 2 +- aten/src/THCUNN/Square.cu | 2 +- aten/src/THCUNN/THCHalfAutoNumerics.cuh | 229 ++------------- aten/src/THCUNN/Tanh.cu | 2 +- aten/src/THCUNN/TemporalConvolution.cu | 2 +- aten/src/THCUNN/TemporalMaxPooling.cu | 2 +- aten/src/THCUNN/TemporalReflectionPadding.cu | 2 +- aten/src/THCUNN/TemporalReplicationPadding.cu | 2 +- aten/src/THCUNN/TemporalRowConvolution.cu | 2 +- aten/src/THCUNN/TemporalUpSamplingLinear.cu | 2 +- aten/src/THCUNN/TemporalUpSamplingNearest.cu | 2 +- aten/src/THCUNN/Threshold.cu | 2 +- .../VolumetricAdaptiveAveragePooling.cu | 2 +- .../THCUNN/VolumetricAdaptiveMaxPooling.cu | 2 +- aten/src/THCUNN/VolumetricAveragePooling.cu | 2 +- aten/src/THCUNN/VolumetricConvolution.cu | 2 +- .../THCUNN/VolumetricDilatedConvolution.cu | 2 +- .../src/THCUNN/VolumetricDilatedMaxPooling.cu | 2 +- .../THCUNN/VolumetricFractionalMaxPooling.cu | 2 +- aten/src/THCUNN/VolumetricFullConvolution.cu | 2 +- .../VolumetricFullDilatedConvolution.cu | 2 +- aten/src/THCUNN/VolumetricMaxUnpooling.cu | 2 +- .../THCUNN/VolumetricReplicationPadding.cu | 2 +- .../src/THCUNN/VolumetricUpSamplingNearest.cu | 2 +- .../THCUNN/VolumetricUpSamplingTrilinear.cu | 2 +- tools/amd_build/disabled_features.yaml | 7 - torch/csrc/cuda/THCP.h | 2 +- torch/csrc/generic/utils.h | 2 +- torch/csrc/utils.h | 9 +- 111 files changed, 271 insertions(+), 1043 deletions(-) delete mode 100644 aten/src/TH/THHalf.cpp delete mode 100644 aten/src/THC/THCHalf.h diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h index a1786d0bb9db6e..42633508e4b714 100644 --- a/aten/src/ATen/core/Half-inl.h +++ b/aten/src/ATen/core/Half-inl.h @@ -43,6 +43,14 @@ inline AT_HOSTDEVICE Half::operator __half() const { } #endif +// CUDA intrinsics + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) +inline __device__ Half __ldg(const Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#endif + /// Arithmetic inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { diff --git a/aten/src/ATen/cuda/NumericLimits.cuh b/aten/src/ATen/cuda/NumericLimits.cuh index 325cbce737dd51..981bf8c1c34efa 100644 --- a/aten/src/ATen/cuda/NumericLimits.cuh +++ b/aten/src/ATen/cuda/NumericLimits.cuh @@ -3,6 +3,7 @@ #include #include #include +#include // NumericLimits.cuh is a holder for numeric limits definitions of commonly used // types. This header is very specific to ROCm HIP and may be removed in the future. 
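The reason the explicit conversion helpers can be deleted in this patch is that at::Half converts implicitly to and from float, so plain assignments and static_casts, as used in the hunks that follow, are sufficient. A small host-side sketch, not part of the patch, with arbitrary values:

    // Assumes only what Half-inl.h above provides: construction from float,
    // conversion back to float, and arithmetic carried out through float.
    #include <ATen/ATen.h>
    #include <cstdio>

    int main() {
      float f = 0.1f;
      at::Half h = f;                  // replaces TH_float2half(f)
      float round_tripped = h;         // replaces TH_half2float(h)
      at::Half doubled = h + h;        // operator+ computes in float
      std::printf("%f %f\n", round_tripped, static_cast<float>(doubled));
      return 0;
    }

The device-side code in the later hunks relies on the same conversions, which is what lets THC_float2half and THC_half2float disappear as well.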
@@ -101,4 +102,4 @@ struct numeric_limits { static inline __host__ __device__ double upper_bound() { return inf; } }; -} // namespace at \ No newline at end of file +} // namespace at diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 287b7895008cc2..266f9645a18527 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -202,8 +202,6 @@ def TypedDict(name, attrs, total=True): # type: ignore CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") -HALF_CONVERSION = CodeTemplate("convert(${value})") - class NYIError(Exception): """Indicates we don't support this declaration yet""" @@ -1202,8 +1200,6 @@ def create_derived(backend_type_env, declarations): is_cuda = 'CUDA' in backend_type_env['Backend'] - real_is_half = backend_type_env['ScalarName'] == 'Half' - def replace_with_null(argument): # type: (THFormal) -> bool return (argument['type'] == 'THGenerator*' and @@ -1230,8 +1226,6 @@ def get_argument(argument, option): elif requires_checked_cast(argument): checked_use = CHECKED_USE.get( argument['type'], '{}_').format(argument['name']) - if real_is_half and argument['type'] == 'real': - checked_use = HALF_CONVERSION.substitute(value=checked_use) if nullable_argument(argument): checked_use = CHECKED_USE_NULLABLE.substitute( env={}, arg_name=argument['name'], usage=checked_use) diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index f0827077827a25..76ed7c74a88e07 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -162,7 +162,7 @@ def check_all_files_written(self): ('Int', 'int', 'Long', 'int32_t', False), ('Long', 'int64_t', 'Long', 'int64_t', False), ('Short', 'int16_t', 'Long', 'int16_t', False), - ('Half', 'Half', 'Double', 'THHalf', True), + ('Half', 'Half', 'Double', 'at::Half', True), ] # shared environment for non-derived base classes Type.h Tensor.h Storage.h @@ -292,7 +292,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['AS_REAL'] = 'convert' + env['AS_REAL'] = 'convert' declarations, definitions = function_wrapper.create_derived( env, declarations) diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 9fe22beb0dc54e..10d43e1433c811 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -26,7 +26,6 @@ SET(hdr set(ATen_TH_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp index a806df6ac62ebf..ddf993df6605d7 100644 --- a/aten/src/TH/THDiskFile.cpp +++ b/aten/src/TH/THDiskFile.cpp @@ -359,9 +359,9 @@ READ_WRITE_METHODS(float, Float, int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) -READ_WRITE_METHODS(THHalf, Half, - float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, - int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) +READ_WRITE_METHODS(at::Half, Half, + float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= static_cast(buf); nread++; }, + int ret = fprintf(dfself->handle, "%.9g", static_cast(data[i])); if(ret <= 0) 
break; else nwrite++) READ_WRITE_METHODS(double, Double, int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, diff --git a/aten/src/TH/THGenerateHalfType.h b/aten/src/TH/THGenerateHalfType.h index 09d4c878d0f11e..8e1b5eaed4d946 100644 --- a/aten/src/TH/THGenerateHalfType.h +++ b/aten/src/TH/THGenerateHalfType.h @@ -5,8 +5,8 @@ #include "THHalf.h" #define scalar_t THHalf #define accreal float -#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val) +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (scalar_t)(_val) #define Real Half #define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF) #define TH_REAL_IS_HALF diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp deleted file mode 100644 index c95272c5f13e34..00000000000000 --- a/aten/src/TH/THHalf.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include "THHalf.h" -#include - -/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */ - -THHalf TH_float2half(float f) -{ - THHalf h; - TH_float2halfbits(&f, &h.x); - return h; -} - -TH_API float TH_half2float(THHalf h) -{ - float f; - TH_halfbits2float(&h.x, &f); - return f; -} - - -void TH_halfbits2float(unsigned short* src, float* res) -{ - *res = at::detail::halfbits2float(*src); -} - - -void TH_float2halfbits(float* src, unsigned short* dest) -{ - *dest = at::detail::float2halfbits(*src); -} diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 68a884f22bd0ad..f7c884f2cc67bd 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -1,10 +1,8 @@ #ifndef TH_HALF_H #define TH_HALF_H -#include - #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus @@ -14,10 +12,4 @@ typedef struct at_Half at_Half; #define THHalf at_Half #endif -TH_API void TH_float2halfbits(float*, unsigned short*); -TH_API void TH_halfbits2float(unsigned short*, float*); - -TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - #endif diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 3f2187b68f74ea..b74d7926ebff21 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -343,11 +343,11 @@ READ_WRITE_METHODS(float, Float, nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) -READ_WRITE_METHODS(THHalf, Half, +READ_WRITE_METHODS(at::Half, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ - data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), + data[i] = static_cast(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", static_cast(data[i])), 1) READ_WRITE_METHODS(double, Double, diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 1de588bbd2d75b..ea8a0d5808cb16 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -29,40 +29,6 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage data[i] = static_cast(src_data[i]); \ } -#define 
IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = (scalar_t)TH_half2float(src_data[i]); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = TH_float2half((float)(src_data[i])); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = static_cast(src_data[i]); \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THStorage_COPY(Byte) IMPLEMENT_THStorage_COPY(Char) IMPLEMENT_THStorage_COPY(Short) @@ -70,18 +36,6 @@ IMPLEMENT_THStorage_COPY(Int) IMPLEMENT_THStorage_COPY(Long) IMPLEMENT_THStorage_COPY(Float) IMPLEMENT_THStorage_COPY(Double) -IMPLEMENT_THStorage_COPY_FROM_HALF(Half) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) -IMPLEMENT_THStorage_COPY_TO_HALF(Byte) -IMPLEMENT_THStorage_COPY_TO_HALF(Char) -IMPLEMENT_THStorage_COPY_TO_HALF(Short) -IMPLEMENT_THStorage_COPY_TO_HALF(Int) -IMPLEMENT_THStorage_COPY_TO_HALF(Long) -IMPLEMENT_THStorage_COPY_TO_HALF(Float) -IMPLEMENT_THStorage_COPY_TO_HALF(Double) -#endif - +IMPLEMENT_THStorage_COPY(Half) #endif diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp index a9e0564fb574c8..f04c75cd3f9105 100644 --- a/aten/src/TH/generic/THTensorCopy.cpp +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -203,28 +203,6 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src static_cast>(*src_data));) \ } -#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ -} - -#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, \ - *tensor_data = static_cast( \ - static_cast>( \ - TH_half2float(*src_data)));) \ -} - -#define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THTensor_COPY(Byte, uint8_t) IMPLEMENT_THTensor_COPY(Char, int8_t) IMPLEMENT_THTensor_COPY(Short, int16_t) @@ -232,18 +210,6 @@ IMPLEMENT_THTensor_COPY(Int, int32_t) IMPLEMENT_THTensor_COPY(Long, int64_t) IMPLEMENT_THTensor_COPY(Float, float) IMPLEMENT_THTensor_COPY(Double, double) -IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) 
-#else -/* only allow pass-through for Half */ -IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) -IMPLEMENT_THTensor_COPY_TO_HALF(Byte, uint8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Char, int8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Short, int16_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Int, int32_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Long, int64_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) -IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) - -#endif /* REAL_IS_HALF */ +IMPLEMENT_THTensor_COPY(Half, at::Half) #endif diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 44f5d188d5b42b..4b8ab9f4ab101a 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -92,7 +92,6 @@ INSTALL(FILES THCGenerateFloatType.h THCGenerateFloatTypes.h THCGenerateDoubleType.h - THCHalf.h THCIntegerDivider.cuh THCNumerics.cuh THCTensorSort.cuh diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 8fec96dde6f647..756fa0f905ac13 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -2,7 +2,7 @@ #define THC_ATOMICS_INC #include "THC.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "ATen/ATen.h" @@ -95,7 +95,7 @@ static inline __device__ void atomicAdd(int64_t *address, int64_t val) { AtomicAddIntegerImpl()(address, val); } -static inline __device__ void atomicAdd(half *address, half val) { +static inline __device__ void atomicAdd(at::Half *address, at::Half val) { unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); unsigned int old = *address_as_ui; @@ -103,23 +103,13 @@ static inline __device__ void atomicAdd(half *address, half val) { do { assumed = old; -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half hsum; + at::Half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); -#else - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = THCNumerics::add(hsum, val); - hsum = __half_raw(tmpres); -#endif + hsum = THCNumerics::add(hsum, val); old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; old = atomicCAS(address_as_ui, assumed, old); } while (assumed != old); } -static inline __device__ void atomicAdd(at::Half *address, at::Half val) { - atomicAdd(reinterpret_cast(address), val); -} #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) // from CUDA C Programmic Guide diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 20b13d82b9a152..6375ced8c691cc 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -1,6 +1,6 @@ #include "THCBlas.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include @@ -50,7 +50,7 @@ double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, doub return 0; } -half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy) +at::Half THCudaBlas_Hdot(THCState *state, int64_t n, at::Half *x, int64_t incx, at::Half *y, int64_t incy) { #if CUDA_VERSION >= 8000 if (n == 1) { @@ -59,7 +59,7 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, } if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { - half result; + at::Half result; cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasDotEx(handle, n, @@ -72,10 +72,10 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, THError("Cublas_Hdot only supports n, incx and incy " "up to signed integer limits: %d", INT_MAX); - return THC_float2half(0); + return 0.0; #else THError("Cublas_Hdot requires CUDA 8.0+"); - return THC_float2half(0); + return 0.0; #endif } @@ -267,7 +267,7 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 # define CUDA_R_16F CUBLAS_DATA_HALF #endif -void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) +void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); @@ -293,8 +293,8 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 #else // Simulated Hgemm - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; #if CUDA_VERSION < 9000 THCublasCheck(cublasSgemmEx(handle, opa, opb, @@ -355,8 +355,8 @@ void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int6 #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount) + at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, + at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) { if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) @@ -371,8 +371,8 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i 
cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); THCublasCheck(cublasGemmStridedBatchedEx(handle, opa, opb, (int)m, (int)n, (int)k, diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index 36a0b35294dff2..a73b47f162c3f3 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -2,12 +2,12 @@ #define THC_BLAS_INC #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" /* Level 1 */ THC_API float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy); THC_API double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy); -THC_API half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy); +THC_API THHalf THCudaBlas_Hdot(THCState *state, int64_t n, THHalf *x, int64_t incx, THHalf *y, int64_t incy); /* Level 2 */ THC_API void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy); @@ -19,7 +19,7 @@ THC_API void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); -THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc); +THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -38,8 +38,8 @@ THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char t #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount); + THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, + THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); #endif /* Inverse */ diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index 6bc7da7cee6e2d..05da61bd56e754 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -125,7 +125,7 @@ void THCudaShutdown(THCState* state) for (int dev = 0; dev < deviceCount; ++dev) { THCudaCheck(cudaSetDevice(dev)); THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]); - + // Frees BLAS handle if (res->blasHandle) { THCublasCheck(cublasDestroy(res->blasHandle)); @@ -256,7 +256,7 @@ cublasHandle_t 
THCState_getCurrentBlasHandle(THCState *state) THError("THCState and sparseHandles must be set as there is no default sparseHandle"); return NULL; } - + int device; THCudaCheck(cudaGetDevice(&device)); @@ -280,7 +280,7 @@ cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state) int device; THCudaCheck(cudaGetDevice(&device)); - + // Creates the sparse handle if not created yet THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); if (!res->sparseHandle) { @@ -474,30 +474,3 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB #include "THCStorage.cpp" #include "THCAllocator.cpp" - -/* from THCHalf.h */ - -half THC_float2half(float f) -{ -#if CUDA_VERSION < 9000 - half h; - TH_float2halfbits(&f, &h.x); - return h; -#else - __half_raw h_raw; - TH_float2halfbits(&f, &h_raw.x); - return half(h_raw); -#endif -} - -float THC_half2float(half h) -{ - float f; -#if CUDA_VERSION < 9000 - TH_halfbits2float(&h.x, &f); -#else - __half_raw h_raw(h); - TH_halfbits2float(&h_raw.x, &f); -#endif - return f; -} diff --git a/aten/src/THC/THCGenerateHalfType.h b/aten/src/THC/THCGenerateHalfType.h index 54358a9be1a59d..596ea47904820f 100644 --- a/aten/src/THC/THCGenerateHalfType.h +++ b/aten/src/THC/THCGenerateHalfType.h @@ -2,9 +2,9 @@ #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" #endif -#include "THCHalf.h" +#include "TH/THHalf.h" -#define scalar_t half +#define scalar_t THHalf #define accreal float #define Real Half diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h deleted file mode 100644 index aeae06fc4739ba..00000000000000 --- a/aten/src/THC/THCHalf.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef THC_HALF_CONVERSION_INC -#define THC_HALF_CONVERSION_INC - -#include "THCGeneral.h" - -#include -#include - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -#ifndef __cplusplus -typedef __half_raw half; -#endif -#endif - -THC_API half THC_float2half(float a); -THC_API float THC_half2float(half a); - -#endif diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 286d36f7b284b3..157a324f6e45b8 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -4,7 +4,7 @@ #include #include #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -165,210 +165,81 @@ struct THCNumerics { // DEPRECATED: use math functions from std and NumericLimits.cuh template <> -struct THCNumerics { - static inline __host__ __device__ half min() { return at::numeric_limits::lowest(); } - static inline __host__ __device__ half max() { return at::numeric_limits::max(); } - static inline __host__ __device__ half lower_bound() { return at::numeric_limits::lower_bound(); } - static inline __host__ __device__ half upper_bound() { return at::numeric_limits::upper_bound(); } - - static inline __host__ __device__ bool lt(half a, half b) { - return static_cast(a) < static_cast(b); - } - - static inline __host__ __device__ bool le(half a, half b) { - return static_cast(a) <= static_cast(b); - } - - static inline __host__ __device__ bool gt(half a, half b) { - return static_cast(a) > static_cast(b); - } - - static inline __host__ __device__ bool ge(half a, half b) { - return static_cast(a) >= static_cast(b); - } - - static inline __host__ __device__ bool eq(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == 
and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) == static_cast(static_cast(b)); - } - - static inline __host__ __device__ bool ne(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) != static_cast(static_cast(b)); - } - - static inline __host__ __device__ half exp(half a) { - return static_cast(std::exp(static_cast(a))); - } - - // note that exp10 is not in the std namespace. - static inline __host__ __device__ half exp10(half a) { - return static_cast(::exp10(static_cast(a))); - } - - static inline __host__ __device__ half log(half a) { - return static_cast(::log(static_cast(a))); - } - - static inline __host__ __device__ half log10(half a) { - return static_cast(::log10(static_cast(a))); - } - - static inline __host__ __device__ half log1p(half a) { - return static_cast(::log1p(static_cast(a))); - } - - static inline __host__ __device__ half log2(half a) { - return static_cast(::log2(static_cast(a))); - } - - static inline __host__ __device__ half lgamma(half a) { - return static_cast(::lgamma(static_cast(a))); - } - - static inline __host__ __device__ half expm1(half a) { - return static_cast(::expm1(static_cast(a))); - } - - static inline __host__ __device__ half cos(half a) { - return static_cast(::cos(static_cast(a))); - } - - static inline __host__ __device__ half sin(half a) { - return static_cast(::sin(static_cast(a))); - } - - static inline __host__ __device__ half sqrt(half a) { - return static_cast(::sqrt(static_cast(a))); - } - - // note that rsqrt is not in the std namespace. - static inline __host__ __device__ half rsqrt(half a) { - return static_cast(::rsqrt(static_cast(a))); - } - - static inline __host__ __device__ half ceil(half a) { - return static_cast(::ceil(static_cast(a))); - } - - static inline __host__ __device__ half floor(half a) { - return static_cast(::floor(static_cast(a))); - } - - static inline __host__ __device__ half trunc(half a) { - return static_cast(::trunc(static_cast(a))); - } - - static inline __host__ __device__ half neg(half a) { - return static_cast(-(static_cast(a))); - } - - static inline __host__ __device__ half acos(half a) { - return static_cast(::acos(static_cast(a))); - } - - static inline __host__ __device__ half cosh(half a) { - return static_cast(::cosh(static_cast(a))); - } - - static inline __host__ __device__ half asin(half a) { - return static_cast(::asin(static_cast(a))); - } - - static inline __host__ __device__ half sinh(half a) { - return static_cast(::sinh(static_cast(a))); - } - - static inline __host__ __device__ half tan(half a) { - return static_cast(::tan(static_cast(a))); - } - - static inline __host__ __device__ half atan(half a) { - return static_cast(::atan(static_cast(a))); - } - - static inline __host__ __device__ half tanh(half a) { - return static_cast(::tanh(static_cast(a))); - } - - - static inline __host__ __device__ half erf(half a) { - return static_cast(::erf(static_cast(a))); - } - - - static inline __host__ __device__ half erfc(half a) { - return static_cast(::erfc(static_cast(a))); - } - - // note that erfinv is not in the std namespace. 
- static inline __host__ __device__ half erfinv(half a) { - return static_cast(::erfinv(static_cast(a))); - } - - static inline __host__ __device__ half abs(half a) { - return static_cast(::abs(static_cast(a))); - } - - static inline __host__ __device__ half round(half a) { - return static_cast(::round(static_cast(a))); - } - - static inline __host__ __device__ half frac(half a) { +struct THCNumerics { + static inline __host__ __device__ at::Half min() { return at::numeric_limits::lowest(); } + static inline __host__ __device__ at::Half max() { return at::numeric_limits::max(); } + static inline __host__ __device__ at::Half lower_bound() { return at::numeric_limits::lower_bound(); } + static inline __host__ __device__ at::Half upper_bound() { return at::numeric_limits::upper_bound(); } + + static inline __host__ __device__ bool lt(at::Half a, at::Half b) { return a < b; } + static inline __host__ __device__ bool le(at::Half a, at::Half b) { return a <= b; } + static inline __host__ __device__ bool gt(at::Half a, at::Half b) { return a > b; } + static inline __host__ __device__ bool ge(at::Half a, at::Half b) { return a >= b; } + static inline __host__ __device__ bool eq(at::Half a, at::Half b) { return a == b; } + static inline __host__ __device__ bool ne(at::Half a, at::Half b) { return a != b; } + + static inline __host__ __device__ at::Half exp(at::Half a) { return std::exp(a); } + static inline __host__ __device__ at::Half exp10(at::Half a) { return ::exp10(a); } + static inline __host__ __device__ at::Half log(at::Half a) { return ::log(a); } + static inline __host__ __device__ at::Half log10(at::Half a) { return ::log10(a); } + static inline __host__ __device__ at::Half log1p(at::Half a) { return ::log1p(a); } + static inline __host__ __device__ at::Half log2(at::Half a) { return ::log2(a); } + static inline __host__ __device__ at::Half lgamma(at::Half a) { return ::lgamma(a); } + static inline __host__ __device__ at::Half expm1(at::Half a) { return ::expm1(a); } + static inline __host__ __device__ at::Half cos(at::Half a) { return ::cos(a); } + static inline __host__ __device__ at::Half sin(at::Half a) { return ::sin(a); } + static inline __host__ __device__ at::Half sqrt(at::Half a) { return ::sqrt(a); } + static inline __host__ __device__ at::Half rsqrt(at::Half a) { return ::rsqrt(a); } + static inline __host__ __device__ at::Half ceil(at::Half a) { return ::ceil(a); } + static inline __host__ __device__ at::Half floor(at::Half a) { return ::floor(a); } + static inline __host__ __device__ at::Half trunc(at::Half a) { return ::trunc(a); } + static inline __host__ __device__ at::Half neg(at::Half a) { return -a; } + static inline __host__ __device__ at::Half acos(at::Half a) { return ::acos(a); } + static inline __host__ __device__ at::Half cosh(at::Half a) { return ::cosh(a); } + static inline __host__ __device__ at::Half asin(at::Half a) { return ::asin(a); } + static inline __host__ __device__ at::Half sinh(at::Half a) { return ::sinh(a); } + static inline __host__ __device__ at::Half tan(at::Half a) { return ::tan(a); } + static inline __host__ __device__ at::Half atan(at::Half a) { return ::atan(a); } + static inline __host__ __device__ at::Half tanh(at::Half a) { return ::tanh(a); } + static inline __host__ __device__ at::Half erf(at::Half a) { return ::erf(a); } + static inline __host__ __device__ at::Half erfc(at::Half a) { return ::erfc(a); } + static inline __host__ __device__ at::Half erfinv(at::Half a) { return ::erfinv(a); } + static inline __host__ __device__ at::Half 
abs(at::Half a) { return ::abs(a); } + static inline __host__ __device__ at::Half round(at::Half a) { return ::round(a); } + + static inline __host__ __device__ at::Half frac(at::Half a) { #ifdef __CUDA_ARCH__ - return static_cast(a) - static_cast(::trunc(static_cast(a))); + return a - ::trunc(a); #else // __CUDA_ARCH__ - return static_cast(a) - static_cast(::floor(static_cast(a))); + return a - ::floor(a); #endif } - static inline __host__ __device__ half cinv(half a) { - return static_cast(1.0f / static_cast(a)); - } - - static inline __host__ __device__ half add(half a, half b) { - return static_cast(a) + static_cast(b); - } - - static inline __host__ __device__ half div(half a, half b) { - return static_cast(a) / static_cast(b); - } - - static inline __host__ __device__ half mul(half a, half b) { - return static_cast(a) * static_cast(b); - } - - static inline __host__ __device__ half sub(half a, half b) { - return static_cast(a) - static_cast(b); - } + static inline __host__ __device__ at::Half cinv(at::Half a) { return 1.0f / a; } + static inline __host__ __device__ at::Half add(at::Half a, at::Half b) { return a + b; } + static inline __host__ __device__ at::Half div(at::Half a, at::Half b) { return a / b; } + static inline __host__ __device__ at::Half mul(at::Half a, at::Half b) { return a * b; } + static inline __host__ __device__ at::Half sub(at::Half a, at::Half b) { return a - b; } + static inline __host__ __device__ at::Half pow(at::Half a, at::Half b) { return ::pow(a, b); } + static inline __host__ __device__ at::Half atan2(at::Half a, at::Half b) { return ::atan2(a, b); } - static inline __host__ __device__ half pow(half a, half b) { - return static_cast(::pow(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ half atan2(half a, half b) { - return static_cast(::atan2(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ bool isnan(half a) { + static inline __host__ __device__ bool isnan(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isnan((float)static_cast(a)); + return ::isnan((float) a); #else - return ::isnan(static_cast(a)); + return ::isnan(a); #endif } - static inline __host__ __device__ bool isinf(half a) { + static inline __host__ __device__ bool isinf(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. 
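Kernels written against THCNumerics<T> keep compiling unchanged, because the at::Half specialization above now just forwards to the overloaded operators and the global math functions, which promote through float. A small illustration of that generic-call pattern, assuming the THC and ATen headers are on the include path; clampAdd is an illustrative name, not an existing helper:

#include "THCNumerics.cuh"  // declares THCNumerics<float/double/at::Half> as patched above

// Illustrative helper (not an existing function): a saturating add written only
// against THCNumerics<T>, so the same template instantiates for float, double
// and at::Half without any half-specific code.
template <typename T>
__host__ __device__ T clampAdd(T a, T b, T hi) {
  T s = THCNumerics<T>::add(a, b);
  return THCNumerics<T>::gt(s, hi) ? hi : s;
}

// usage, e.g.: clampAdd<at::Half>(x, y, THCNumerics<at::Half>::max());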
The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isinf((float)static_cast(a)); + return ::isinf((float) a); #else - return ::isinf(static_cast(a)); + return ::isinf(a); #endif } @@ -506,35 +377,6 @@ struct ScalarConvert { static __host__ __device__ Out to(const In v) { return (Out) v; } }; -template -struct ScalarConvert { - static __host__ __device__ Out to(const half v) { -#ifdef __CUDA_ARCH__ - return (Out) __half2float(v); -#else - return (Out) THC_half2float(v); -#endif - } -}; - -template -struct ScalarConvert { - static __host__ __device__ half to(const In v) { -#ifdef __CUDA_ARCH__ - return __float2half((float) v); -#else - return THC_float2half((float) v); -#endif - } -}; - -template <> -struct ScalarConvert { - static __host__ __device__ half to(const half v) { - return v; - } -}; - // DEPRECATED: use static_cast in kernels instead of scalar_cast template __host__ __device__ T scalar_cast(U u) { diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index 96e3938e20b0f9..3d1630adedbe5b 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -1,7 +1,7 @@ #include "THCStorage.hpp" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include diff --git a/aten/src/THC/THCStorage.cu b/aten/src/THC/THCStorage.cu index 43a293422335fc..97b66ac2b5f891 100644 --- a/aten/src/THC/THCStorage.cu +++ b/aten/src/THC/THCStorage.cu @@ -7,7 +7,7 @@ #include #endif -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorage.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorageCopy.cu b/aten/src/THC/THCStorageCopy.cu index 8d7c869c12c004..c4f53f7160ca5b 100644 --- a/aten/src/THC/THCStorageCopy.cu +++ b/aten/src/THC/THCStorageCopy.cu @@ -1,7 +1,7 @@ #include "THCStorageCopy.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THC/THCStorageCopy.h b/aten/src/THC/THCStorageCopy.h index 837056fc3801d2..250b60fcb2fe74 100644 --- a/aten/src/THC/THCStorageCopy.h +++ b/aten/src/THC/THCStorageCopy.h @@ -3,7 +3,7 @@ #include "THCStorage.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index 259912a8d91806..a3a8ed2e182086 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -1,5 +1,5 @@ #include "THCApply.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "THCTensorCopy.hpp" #include diff --git a/aten/src/THC/THCTensorCopy.h b/aten/src/THC/THCTensorCopy.h index 74f2b592f54657..48dcc64b9fbedc 100644 --- a/aten/src/THC/THCTensorCopy.h +++ b/aten/src/THC/THCTensorCopy.h @@ -3,7 +3,7 @@ #include "THCTensor.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCStream.h" #include "generic/THCTensorCopy.h" diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 0ea5951d4ea734..6b4a77ea816225 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -4,7 +4,7 @@ #include "THCBlas.h" #include "THCTensorCopy.h" #include "THCTensorRandom.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCApply.cuh" #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index 
b1be123b886e03..cd5a77c2227181 100644 --- a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -1,6 +1,6 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -21,26 +21,6 @@ struct TensorAddConstantOp { const T val; }; -template <> -struct TensorAddConstantOp { - TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorSubConstantOp { TensorSubConstantOp(T v) : val(v) {} @@ -55,27 +35,6 @@ struct TensorSubConstantOp { const T val; }; - -template <> -struct TensorSubConstantOp { - TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorMulConstantOp { TensorMulConstantOp(T v) : val(v) {} @@ -90,25 +49,6 @@ struct TensorMulConstantOp { const T val; }; -template <> -struct TensorMulConstantOp { - TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template struct TensorDivConstantOp { TensorDivConstantOp(T v) : val(v) {} @@ -151,24 +91,6 @@ struct TensorDivConstantOp { const double val; }; -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -232,22 +154,18 @@ struct TensorRemainderOp { }; template <> -struct TensorRemainderOp { - TensorRemainderOp(half v): fval(THC_half2float(v)) {} +struct TensorRemainderOp { + TensorRemainderOp(at::Half v): val(v) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin - fval * floorf(fin / fval); - *out = __float2half(fout); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in - val * floorf(*in / val); } - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv = fv - fval * floorf(fv / fval); - *v = __float2half(fv); + __device__ __forceinline__ void operator()(at::Half* v) { + *v = *v - val * floorf(*v / val); } - const float fval; + const at::Half val; }; template @@ -278,21 +196,6 @@ struct TensorFmodOp { const double val; }; -template <> -struct TensorFmodOp { - TensorFmodOp(half v): 
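TensorRemainderOp keeps floor-based modulo semantics (a - b*floor(a/b), result takes the sign of the divisor) while TensorFmodOp maps to C's fmod (result takes the sign of the dividend); the at::Half specializations now spell those formulas directly on at::Half values. A host-only comparison of the two conventions, with no THC dependencies:

#include <cmath>
#include <cstdio>

// Floor-based remainder, as in TensorRemainderOp: the result follows the sign of b.
static float remainder_floor(float a, float b) { return a - b * std::floor(a / b); }

// Truncation-based remainder, as in TensorFmodOp: the result follows the sign of a.
static float remainder_trunc(float a, float b) { return std::fmod(a, b); }

int main() {
  std::printf("remainder(-5, 3) = %g, fmod(-5, 3) = %g\n",
              remainder_floor(-5.0f, 3.0f),   // prints 1
              remainder_trunc(-5.0f, 3.0f));  // prints -2
  return 0;
}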
fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*in), fval)); - } - - __device__ __forceinline__ void operator()(half* v) { - *v = __float2half(fmodf(__half2float(*v), fval)); - } - - const float fval; -}; - template struct TensorTriOp { TensorTriOp(T *start_, int64_t stride0_, int64_t stride1_, int64_t k_) diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index fb15a05b155c0f..fb1072b8bbddc1 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -4,7 +4,7 @@ #include #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -31,19 +31,6 @@ struct TensorSigmoidOp { } }; -template <> -struct TensorSigmoidOp { - __device__ __forceinline__ void operator()(half* out, half* in) const { - float fin = __half2float(*in); - *out = __float2half(1.0f / (1.0f + expf(- fin))); - } - - __device__ __forceinline__ void operator()(half* v) const { - float fv = __half2float(*v); - *v = __float2half(1.0f / (1.0f + expf(- fv))); - } -}; - template struct TensorSignOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -70,19 +57,6 @@ struct TensorSignOp { } }; -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float orig = __half2float(*in); - *out = __float2half((orig > 0) - (orig < 0)); - } - - __device__ __forceinline__ void operator()(half* v) { - float orig = __half2float(*v); - *v = __float2half((orig > 0) - (orig < 0)); - } -}; - template struct TensorCAddOp { TensorCAddOp(T v) : val(v) {} @@ -98,31 +72,6 @@ struct TensorCAddOp { T val; }; -template <> -struct TensorCAddOp { - TensorCAddOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fval = __half2float(val); - float fin = __half2float(*in); - - fout += fval * fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fval = __half2float(val); - - float fout = fin1 + fval * fin2; - *out = __float2half(fout); - } - - half val; -}; - template struct TensorMulOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -134,23 +83,6 @@ struct TensorMulOp { } }; -template <> -struct TensorMulOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - fout *= fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 * fin2; - *out = __float2half(fout); - } -}; - template struct TensorPowOp { TensorPowOp(T v) : val(v) {} @@ -249,7 +181,6 @@ struct TensorCPowOp { } }; - template <> struct TensorCPowOp { __device__ __forceinline__ void operator()(double* out, double* in) { @@ -261,25 +192,6 @@ struct TensorCPowOp { } }; -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout = powf(fout, fin); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, 
half* in1, half* in2) { - // No fp16 pow function yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = powf(fin1, fin2); - *out = __float2half(fout); - } -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -336,17 +248,13 @@ struct TensorCRemainderOp { }; template <> -struct TensorCRemainderOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - *out = fin != 0 ? __float2half(fout - fin * floorf(fout / fin)) : __float2half(NAN); +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in != 0.f ? *out - *in * floorf(*out / *in) : NAN; } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - *out = fin2 != 0 ? __float2half(fin1 - fin2 * floorf(fin1 / fin2)) : __float2half(NAN); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = *in2 != 0.f ? *in1 - *in2 * floorf(*in1 / *in2) : NAN; } }; @@ -384,13 +292,13 @@ struct TensorCFmodOp { }; template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = fmodf(*out, *in); } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = fmodf(*in1, *in2); } }; @@ -594,7 +502,6 @@ struct TensorRShiftOp { } }; - template <> struct TensorRShiftOp { __device__ __forceinline__ void @@ -671,7 +578,7 @@ template struct TensorDigammaOp { __device__ __forceinline__ void operator()(T* out, T* in) { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; static const double PI_f64 = 3.14159265358979323846; static const compute_type PSI_10 = 2.25175258906672110764; static const compute_type A[] = { @@ -731,7 +638,7 @@ struct TensorDigammaOp { template struct TensorTrigammaOp { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; __device__ __forceinline__ void operator()(T* out, T* in) { const compute_type PI = 3.14159265358979323846; diff --git a/aten/src/THC/THCTensorMode.cuh b/aten/src/THC/THCTensorMode.cuh index 0158f254a2d014..a3ed2ae8e4be93 100644 --- a/aten/src/THC/THCTensorMode.cuh +++ b/aten/src/THC/THCTensorMode.cuh @@ -7,33 +7,33 @@ struct ThrustHalfLess { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::lt(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::lt(lhs, rhs); } }; struct ThrustHalfNotEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::ne(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::ne(lhs, rhs); } }; struct ThrustHalfEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return 
THCNumerics::eq(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::eq(lhs, rhs); } }; struct ThrustHalfEqualToPredicate { - ThrustHalfEqualToPredicate(half val): val_(val) {} - __host__ __device__ inline bool operator()(half x) { - return THCNumerics::eq(val_, x); + ThrustHalfEqualToPredicate(at::Half val): val_(val) {} + __host__ __device__ inline bool operator()(at::Half x) { + return THCNumerics::eq(val_, x); } - half val_; + at::Half val_; }; template diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index 8eb580169cc953..386473a430329a 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -14,7 +14,7 @@ #include #include -#define MAX_NUM_BLOCKS 200 +#define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 @@ -107,11 +107,11 @@ __device__ inline T reverse_bounds(T value) { } -__device__ inline half half_uniform_scale_and_shift(float x, double a, double b) { - half width = ScalarConvert::to(b - a); - half start = ScalarConvert::to(a); - half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); - return THCNumerics::add(scaled, start); +__device__ inline at::Half half_uniform_scale_and_shift(float x, double a, double b) { + at::Half width = ScalarConvert::to(b - a); + at::Half start = ScalarConvert::to(a); + at::Half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); + return THCNumerics::add(scaled, start); } #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ @@ -181,10 +181,10 @@ GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uni GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) -GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) -GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) -GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. / lambda * log(x))))) -GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) +GENERATE_KERNEL2(generate_uniform, at::Half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) +GENERATE_KERNEL2(generate_normal, at::Half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) +GENERATE_KERNEL1(generate_exponential, at::Half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. 
/ lambda * log(x))))) +GENERATE_KERNEL2(generate_cauchy, at::Half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) #include "generic/THCTensorRandom.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh index 4f7a6b8c697913..6d1ef1b6bbfe37 100644 --- a/aten/src/THC/THCTensorTopK.cuh +++ b/aten/src/THC/THCTensorTopK.cuh @@ -113,10 +113,10 @@ struct TopKTypeConfig { }; template <> -struct TopKTypeConfig { +struct TopKTypeConfig { typedef uint32_t RadixType; - static inline __device__ RadixType convert(half v) { + static inline __device__ RadixType convert(at::Half v) { #if CUDA_VERSION >= 8000 RadixType x = __half_as_ushort(v); RadixType mask = -((x >> 15)) | 0x8000; @@ -127,16 +127,16 @@ struct TopKTypeConfig { #endif } - static inline __device__ half deconvert(RadixType v) { + static inline __device__ at::Half deconvert(RadixType v) { #if CUDA_VERSION >= 8000 RadixType mask = ((v >> 15) - 1) | 0x8000; return __ushort_as_half(v ^ mask); #else assert(false); - return ScalarConvert::to(0); + return ScalarConvert::to(0); #endif } -}; +}; // This function counts the distribution of all input values in a // slice we are selecting by radix digit at `radixDigitPos`, but only @@ -214,7 +214,7 @@ __device__ DataType findPattern(DataType* smem, BitDataType desired, BitDataType desiredMask) { #ifdef __HIP_PLATFORM_HCC__ - if (threadIdx.x < 64) { + if (threadIdx.x < 64) { #else if (threadIdx.x < 32) { #endif diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 377b363c006ba2..aecd8f01713d55 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -4,7 +4,7 @@ #include #include #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensor.hpp" #include "THCTensorInfo.cuh" #include "THCTensor.hpp" @@ -80,56 +80,4 @@ struct ScalarInv { static __host__ __device__ T to(const T v) { return ((T) 1) / v; } }; -template <> -struct ScalarNegate { - static __host__ __device__ half to(const half v) { -#ifdef __CUDA_ARCH__ - return __float2half(-__half2float(v)); -#else -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half out = v; -#else - __half_raw out = __half_raw(v); -#endif - out.x ^= 0x8000; // toggle sign bit - return out; -#endif - } -}; - -template <> -struct ScalarInv { - static __host__ __device__ half to(const half v) { -#if defined (__CUDA_ARCH_) || defined(__HIP_PLATFORM_HCC__) - return __float2half(1.0f / __half2float(v)); -#else - float fv = THC_half2float(v); - fv = 1.0f / fv; - return THC_float2half(fv); -#endif - } -}; - -inline bool operator==(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x == b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = __half_raw(b); - return araw.x == braw.x; -#endif -} - -inline bool operator!=(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x != b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = __half_raw(b); - return araw.x != braw.x; -#endif -} - #endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 0320fdfe8035a5..4919a7a2a364ab 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -72,7 +72,7 @@ IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long, int64_t) // THCudaTensor aka the 
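TopKTypeConfig<at::Half>::convert maps fp16 bit patterns to unsigned keys whose integer order matches numeric order: positive values get the sign bit set, negative values get every bit flipped, and deconvert undoes the transform. A host-side sketch of the same trick restricted to the 16 bits the radix selection actually consumes; the helper names are illustrative:

#include <cstdint>
#include <cstdio>

// Same bit trick as TopKTypeConfig<at::Half>::convert, on 16-bit keys:
// positive halfs get the sign bit set, negative halfs get every bit flipped,
// so unsigned integer order equals fp16 numeric order.
static uint16_t to_radix_key(uint16_t h) {
  uint16_t mask = (h & 0x8000) ? 0xFFFFu : 0x8000u;
  return h ^ mask;
}

// Inverse transform, matching deconvert.
static uint16_t from_radix_key(uint16_t k) {
  uint16_t mask = (k & 0x8000) ? 0x8000u : 0xFFFFu;
  return k ^ mask;
}

int main() {
  // fp16 bit patterns for -2.0, -1.0, +0.0, +1.0, +2.0
  const uint16_t bits[] = {0xC000, 0xBC00, 0x0000, 0x3C00, 0x4000};
  for (uint16_t h : bits) {
    uint16_t k = to_radix_key(h);
    std::printf("bits=0x%04X key=0x%04X roundtrip=0x%04X\n",
                (unsigned)h, (unsigned)k, (unsigned)from_radix_key(k));
  }
  return 0;  // keys come out strictly increasing, round trips match the input
}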
non-existent THCudaFloatTensor IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, , float) IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double, double) -IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, half) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, at::Half) #undef IMPLEMENT_THC_CUDA_TENSOR_COPY diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index a37645de394de8..45364c7574cad4 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -27,11 +27,10 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) THCTensor_(data)(state, self), 1, THCTensor_(data)(state, src), 1); #elif defined(THC_REAL_IS_HALF) - accreal result = ScalarConvert::to( - THCudaBlas_Hdot(state, + accreal result = THCudaBlas_Hdot(state, THCTensor_(nElement)(state, self), THCTensor_(data)(state, self), 1, - THCTensor_(data)(state, src), 1)); + THCTensor_(data)(state, src), 1); #endif THCTensor_(free)(state, src); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 40d6bdb6382983..06720b358df985 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -43,23 +43,13 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THC_API void THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(add)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(add)(state, self_, src_, value * alpha); -#endif } THC_API void THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(sub)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(sub)(state, self_, src_, value * alpha); -#endif } THC_API void diff --git a/aten/src/THCUNN/Abs.cu b/aten/src/THCUNN/Abs.cu index 72b7ff3c2f53d9..98542eda7e8a75 100644 --- a/aten/src/THCUNN/Abs.cu +++ b/aten/src/THCUNN/Abs.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu index cb0f47510bc559..30aa975594a160 100644 --- a/aten/src/THCUNN/AbsCriterion.cu +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu index 3624588015c8a1..e458bb81c9725e 100644 --- a/aten/src/THCUNN/BCECriterion.cu +++ b/aten/src/THCUNN/BCECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BatchNormalization.cu b/aten/src/THCUNN/BatchNormalization.cu index 0c393c3a9d6db0..97579d1c4aefd5 100644 --- a/aten/src/THCUNN/BatchNormalization.cu +++ b/aten/src/THCUNN/BatchNormalization.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/ClassNLLCriterion.cu b/aten/src/THCUNN/ClassNLLCriterion.cu index 
1043454ff1528b..dd430e9b88d120 100644 --- a/aten/src/THCUNN/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/ClassNLLCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" diff --git a/aten/src/THCUNN/Col2Im.cu b/aten/src/THCUNN/Col2Im.cu index d7fd995de4b88b..73eca7ff16ad30 100644 --- a/aten/src/THCUNN/Col2Im.cu +++ b/aten/src/THCUNN/Col2Im.cu @@ -4,7 +4,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/Col2Im.cu" diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu index e4e85b71045f8e..bd26c0c003bb7e 100644 --- a/aten/src/THCUNN/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index 9c4c2ea1fdc8b6..ec441645e8d7d5 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/GatedLinearUnit.cu b/aten/src/THCUNN/GatedLinearUnit.cu index aba9f1e794e308..376fdb3855eb23 100644 --- a/aten/src/THCUNN/GatedLinearUnit.cu +++ b/aten/src/THCUNN/GatedLinearUnit.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" diff --git a/aten/src/THCUNN/HardTanh.cu b/aten/src/THCUNN/HardTanh.cu index 539b22fec5a515..8d6a7953975f75 100644 --- a/aten/src/THCUNN/HardTanh.cu +++ b/aten/src/THCUNN/HardTanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Im2Col.cu b/aten/src/THCUNN/Im2Col.cu index 95bdcd4e8b9cd7..252c488df33f00 100644 --- a/aten/src/THCUNN/Im2Col.cu +++ b/aten/src/THCUNN/Im2Col.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/IndexLinear.cu b/aten/src/THCUNN/IndexLinear.cu index eebb7efc153d88..032e8e31d2cffb 100644 --- a/aten/src/THCUNN/IndexLinear.cu +++ b/aten/src/THCUNN/IndexLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/L1Cost.cu b/aten/src/THCUNN/L1Cost.cu index eda58c18e2c6e4..4f11a94f8106c3 100644 --- a/aten/src/THCUNN/L1Cost.cu +++ b/aten/src/THCUNN/L1Cost.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LeakyReLU.cu b/aten/src/THCUNN/LeakyReLU.cu index ec9efb836c3441..dec13dfd112dcc 100644 --- a/aten/src/THCUNN/LeakyReLU.cu +++ b/aten/src/THCUNN/LeakyReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LogSigmoid.cu b/aten/src/THCUNN/LogSigmoid.cu index 59ee39a6871bf7..e318fcea4f92f5 100644 --- a/aten/src/THCUNN/LogSigmoid.cu +++ 
b/aten/src/THCUNN/LogSigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu index 59aa7e8f4612ba..05eb432871ed34 100644 --- a/aten/src/THCUNN/LookupTable.cu +++ b/aten/src/THCUNN/LookupTable.cu @@ -2,7 +2,7 @@ #include "common.h" #include "THCThrustAllocator.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" #include "../THC/THCTensorMathReduce.cuh" diff --git a/aten/src/THCUNN/LookupTableBag.cu b/aten/src/THCUNN/LookupTableBag.cu index c2ba9f52081972..c579b93bcb2331 100644 --- a/aten/src/THCUNN/LookupTableBag.cu +++ b/aten/src/THCUNN/LookupTableBag.cu @@ -11,7 +11,7 @@ #include #endif #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu index e9571fe06c4e30..99b287baba38a9 100644 --- a/aten/src/THCUNN/MSECriterion.cu +++ b/aten/src/THCUNN/MSECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/MarginCriterion.cu b/aten/src/THCUNN/MarginCriterion.cu index 7ccdbb725fe6b1..459d62a4e57a81 100644 --- a/aten/src/THCUNN/MarginCriterion.cu +++ b/aten/src/THCUNN/MarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/MultiLabelMarginCriterion.cu index 13b432c15c38c6..220de837d2349e 100644 --- a/aten/src/THCUNN/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/MultiLabelMarginCriterion.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiMarginCriterion.cu b/aten/src/THCUNN/MultiMarginCriterion.cu index c2fa2134626101..de802c82979985 100644 --- a/aten/src/THCUNN/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/MultiMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/PReLU.cu b/aten/src/THCUNN/PReLU.cu index cdc6b2b71a1ee4..c89152487d2f3f 100644 --- a/aten/src/THCUNN/PReLU.cu +++ b/aten/src/THCUNN/PReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "THCTensor.hpp" @@ -69,7 +69,7 @@ struct PReLUAccGradParametersShared { __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * static_cast(*input <= 0); } }; @@ -84,7 +84,7 @@ struct PReLUAccGradParameters __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; @@ -99,7 +99,7 @@ struct PReLUAccGradParameters1to1 __device__ __forceinline__ void operator()(T *gradWeight, T *input, T 
*gradOutput) { - *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); + *gradWeight += (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu index 388627791a012f..d044fadadfb0d1 100644 --- a/aten/src/THCUNN/RReLU.cu +++ b/aten/src/THCUNN/RReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" @@ -15,8 +15,8 @@ template inline T __device__ curand_uniform_type(curandStateMtgp32 *state); template <> -inline half __device__ curand_uniform_type(curandStateMtgp32 *state) { - return ScalarConvert::to(curand_uniform(state)); +inline THHalf __device__ curand_uniform_type(curandStateMtgp32 *state) { + return ScalarConvert::to(curand_uniform(state)); } template <> diff --git a/aten/src/THCUNN/Sigmoid.cu b/aten/src/THCUNN/Sigmoid.cu index 0be57d6a621dfc..6ade198fe02af1 100644 --- a/aten/src/THCUNN/Sigmoid.cu +++ b/aten/src/THCUNN/Sigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu index c8018d997365dc..48a86e3efec996 100644 --- a/aten/src/THCUNN/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu index ee53e76dca2625..63b0ef3cc7ca7e 100644 --- a/aten/src/THCUNN/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftPlus.cu b/aten/src/THCUNN/SoftPlus.cu index 42b2c3c5ef0617..a8a36c22fa918a 100644 --- a/aten/src/THCUNN/SoftPlus.cu +++ b/aten/src/THCUNN/SoftPlus.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SoftShrink.cu b/aten/src/THCUNN/SoftShrink.cu index a4e45d87c6b3a3..5ec104155ea737 100644 --- a/aten/src/THCUNN/SoftShrink.cu +++ b/aten/src/THCUNN/SoftShrink.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu index 2bc17a75a76f7e..08f72046341ca7 100644 --- a/aten/src/THCUNN/SparseLinear.cu +++ b/aten/src/THCUNN/SparseLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu index 2c671dad5a8364..ff68ab8440f757 100644 --- a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu index 592e6fd8b08154..591dd012fdd521 100644 --- 
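In the PReLU functors, *input <= 0 yields a bool, and the added static_cast<T>(...) turns that 0/1 mask into an explicit operand of the compute type before the half-precision multiply, presumably to avoid ambiguous mixed-type overloads on at::Half. The branchless-mask idiom itself, in plain C++:

#include <cstdio>

// Branchless mask, as in the PReLU backward functors: the boolean comparison is
// converted to the compute type and multiplied in, instead of branching.
template <typename T>
T prelu_weight_grad(T input, T grad_output, T scale) {
  return input * grad_output * scale * static_cast<T>(input <= T(0));
}

int main() {
  std::printf("%f %f\n",
              prelu_weight_grad(-2.0f, 0.5f, 1.0f),   // -1.0: negative inputs contribute
              prelu_weight_grad( 2.0f, 0.5f, 1.0f));  //  0.0: positive inputs are masked out
  return 0;
}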
a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAveragePooling.cu b/aten/src/THCUNN/SpatialAveragePooling.cu index ce9941a62398c7..d07255a954ee77 100644 --- a/aten/src/THCUNN/SpatialAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/SpatialClassNLLCriterion.cu index 83addd09a2769b..92262354b3b696 100644 --- a/aten/src/THCUNN/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/SpatialClassNLLCriterion.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialConvolutionLocal.cu b/aten/src/THCUNN/SpatialConvolutionLocal.cu index 17801d52b12688..0af7685432991b 100644 --- a/aten/src/THCUNN/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/SpatialConvolutionLocal.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu index 4a59acb2975188..d9f6f128efe81b 100644 --- a/aten/src/THCUNN/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/SpatialConvolutionMM.cu @@ -3,7 +3,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialConvolutionMM.cu" diff --git a/aten/src/THCUNN/SpatialCrossMapLRN.cu b/aten/src/THCUNN/SpatialCrossMapLRN.cu index cd6f081b1302df..2262ddba4743ed 100644 --- a/aten/src/THCUNN/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/SpatialCrossMapLRN.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedConvolution.cu b/aten/src/THCUNN/SpatialDilatedConvolution.cu index b8e96024fdaa87..b0edadb2b357fc 100644 --- a/aten/src/THCUNN/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu index 6732e4f2b53409..3aef4ecf524cce 100644 --- a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu +++ b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCNumerics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu index f3ca162453107a..71ddce8c84a995 100644 --- a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu +++ b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" 
-#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialFullConvolution.cu b/aten/src/THCUNN/SpatialFullConvolution.cu index 4e37ecf280bbf5..b4eff4b9c22d1a 100644 --- a/aten/src/THCUNN/SpatialFullConvolution.cu +++ b/aten/src/THCUNN/SpatialFullConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullConvolution.cu" diff --git a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu index 61e1fe5910ad18..9ba2236cea66cd 100644 --- a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "im2col.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/SpatialReflectionPadding.cu b/aten/src/THCUNN/SpatialReflectionPadding.cu index 96472eed08b839..0b0643c8e41342 100644 --- a/aten/src/THCUNN/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/SpatialReflectionPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialReplicationPadding.cu b/aten/src/THCUNN/SpatialReplicationPadding.cu index f63c2090d5fb14..1ee5c62f6e7132 100644 --- a/aten/src/THCUNN/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/SpatialReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialSubSampling.cu b/aten/src/THCUNN/SpatialSubSampling.cu index bb0484662254fd..8e8e390c136388 100644 --- a/aten/src/THCUNN/SpatialSubSampling.cu +++ b/aten/src/THCUNN/SpatialSubSampling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu index 07daa0e9fec01d..b093ee287edce0 100644 --- a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu +++ b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu index 889d64e1817e1c..8b0784cfd75351 100644 --- a/aten/src/THCUNN/SpatialUpSamplingNearest.cu +++ b/aten/src/THCUNN/SpatialUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Sqrt.cu b/aten/src/THCUNN/Sqrt.cu index a52ce34117aaf0..7358f8c6bd0bc7 100644 --- a/aten/src/THCUNN/Sqrt.cu +++ b/aten/src/THCUNN/Sqrt.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Square.cu b/aten/src/THCUNN/Square.cu index 66bbec49d29cd9..f44fbfe7ea4fc4 100644 --- a/aten/src/THCUNN/Square.cu 
+++ b/aten/src/THCUNN/Square.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh index 5f8fda89909552..fe8a8bbc3cdc1a 100644 --- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh +++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh @@ -1,23 +1,23 @@ #ifndef THC_HALF_AUTO_NUMERICS_INC #define THC_HALF_AUTO_NUMERICS_INC -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" -// WARNING: THCNumerics is being deprecated. Read the comments and function usage +// WARNING: THCNumerics is being deprecated. Read the comments and function usage // in THCNumerics to learn about the deprecation -// +// // Half numerics functions defined as free functions, so cunn code can be -//written generically, i.e. without excessive calling of THCNumerics functions. +// written generically, i.e. without excessive calling of THCNumerics functions. // these functions should move to THCNumerics -inline __host__ __device__ half fmaxType(half x, half y) { - return THCNumerics::ge(x, y) ? x : y; +inline __host__ __device__ THHalf fmaxType(THHalf x, THHalf y) { + return THCNumerics::ge(x, y) ? x : y; } -inline __host__ __device__ float fmaxType(float x, half y) { - return fmaxf(x, ScalarConvert::to(y)); +inline __host__ __device__ float fmaxType(float x, THHalf y) { + return fmaxf(x, ScalarConvert::to(y)); } inline __host__ __device__ float fmaxType(float x, float y) { @@ -31,217 +31,40 @@ inline __host__ __device__ double fmaxType(double x, double y) { // arithmetic functions -inline __host__ __device__ half operator+(half a, half b) { - return THCNumerics::add(a, b); +inline __host__ __device__ THHalf abs(THHalf a) { + return THCNumerics::abs(a); } -inline __host__ __device__ float operator+(half a, float b) { - return ScalarConvert::to(a) + b; +inline __host__ __device__ THHalf exp(THHalf a) { + return THCNumerics::exp(a); } -inline __host__ __device__ float operator+(float a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log10(THHalf a) { + return THCNumerics::log10(a); } -inline __host__ __device__ double operator+(double a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log1p(THHalf a) { + return THCNumerics::log1p(a); } -inline __host__ __device__ half operator-(half a) { - return THCNumerics::neg(a); +inline __host__ __device__ THHalf log2(THHalf a) { + return THCNumerics::log2(a); } -inline __host__ __device__ half operator-(half a, half b) { - return THCNumerics::add(a, THCNumerics::neg(b)); +inline __host__ __device__ THHalf expm1(THHalf a) { + return THCNumerics::expm1(a); } -inline __host__ __device__ half operator-(half a, int b) { - return THCNumerics::add(a, THCNumerics::neg(ScalarConvert::to(b))); +inline __host__ __device__ THHalf pow(THHalf a, THHalf b) { + return THCNumerics::pow(a, b); } -inline __host__ __device__ float operator-(half a, float b) { - return ScalarConvert::to(a) - b; +inline __host__ __device__ THHalf sqrt(THHalf a) { + return THCNumerics::sqrt(a); } -inline __host__ __device__ double operator-(half a, double b) { - return ScalarConvert::to(a) - b; -} - -inline __host__ __device__ half operator-(int a, half b) { - return THCNumerics::add(ScalarConvert::to(a), THCNumerics::neg(b)); -} - -inline __host__ __device__ float operator-(float a, half b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ double operator-(double a, half 
b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ half operator*(half a, half b) { - return THCNumerics::mul(a, b); -} - -inline __host__ __device__ float operator*(half a, float b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ double operator*(half a, double b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ half operator*(half a, int b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ float operator*(float a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ double operator*(double a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(half a, half b) { - return THCNumerics::div(a, b); -} - -inline __host__ __device__ float operator/(float a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ double operator/(double a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(int a, half b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ float operator/(half a, float b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ double operator/(half a, double b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ half operator/(half a, int b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} -inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} - -inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) { - lhs = lhs - rhs; - return lhs; -} - -inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) { - lhs = lhs * rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half abs(half a) { - return THCNumerics::abs(a); -} - -inline __host__ __device__ half exp(half a) { - return THCNumerics::exp(a); -} - -inline __host__ __device__ half log10(half a) { - return THCNumerics::log10(a); -} - -inline __host__ __device__ half log1p(half a) { - return THCNumerics::log1p(a); -} - -inline __host__ __device__ half log2(half a) { - return THCNumerics::log2(a); -} - -inline __host__ __device__ half expm1(half a) { - return THCNumerics::expm1(a); -} - -inline __host__ __device__ half pow(half a, half b) { - return THCNumerics::pow(a, b); -} - -inline __host__ __device__ half sqrt(half a) { - return THCNumerics::sqrt(a); -} - -inline __host__ __device__ half tanh(half a) { - return THCNumerics::tanh(a); -} - -#if defined(_MSC_VER) && CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -inline __host__ __device__ half operator+(half a, int b) { - return THCNumerics::add(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ double operator+(half a, double b) { - return ScalarConvert::to(a) + b; -} - -inline __host__ __device__ half operator*(half a, bool b) { - return THCNumerics::mul(a, ScalarConvert::to(b)); -} -#endif - -// comparison functions - -inline __host__ __device__ bool operator<(half a, half b) { - return THCNumerics::lt(a, b); -} - -inline __host__ __device__ bool operator<=(half a, half b) { - return THCNumerics::le(a, b); -} - -inline __host__ __device__ bool operator<=(half a, int b) { - return 
THCNumerics::le(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator<(half a, int b) { - return THCNumerics::lt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>(half a, half b) { - return THCNumerics::gt(a, b); -} - -inline __host__ __device__ bool operator>(half a, int b) { - return THCNumerics::gt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>=(half a, half b) { - return THCNumerics::ge(a, b); -} - -inline __host__ __device__ bool operator>=(half a, int b) { - return THCNumerics::ge(a, ScalarConvert::to(b)); +inline __host__ __device__ THHalf tanh(THHalf a) { + return THCNumerics::tanh(a); } #endif diff --git a/aten/src/THCUNN/Tanh.cu b/aten/src/THCUNN/Tanh.cu index a19cc71c045dd2..676da711f33443 100644 --- a/aten/src/THCUNN/Tanh.cu +++ b/aten/src/THCUNN/Tanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/TemporalConvolution.cu b/aten/src/THCUNN/TemporalConvolution.cu index af12169d7a4ce8..847b82449ac60f 100644 --- a/aten/src/THCUNN/TemporalConvolution.cu +++ b/aten/src/THCUNN/TemporalConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalMaxPooling.cu b/aten/src/THCUNN/TemporalMaxPooling.cu index 2508f835177b25..de478339d8f984 100644 --- a/aten/src/THCUNN/TemporalMaxPooling.cu +++ b/aten/src/THCUNN/TemporalMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalReflectionPadding.cu b/aten/src/THCUNN/TemporalReflectionPadding.cu index 4dd4da84c0a2d6..9e905f914653ca 100644 --- a/aten/src/THCUNN/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/TemporalReflectionPadding.cu @@ -8,7 +8,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalReplicationPadding.cu b/aten/src/THCUNN/TemporalReplicationPadding.cu index 2c812bda8d64f5..3f74d1e62473b2 100644 --- a/aten/src/THCUNN/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/TemporalReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalRowConvolution.cu b/aten/src/THCUNN/TemporalRowConvolution.cu index 745fef807510d8..097c78fde6f81b 100644 --- a/aten/src/THCUNN/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/TemporalRowConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "row2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu index 89b0c37b1fa78d..2aaf4bcf4435e4 100644 --- a/aten/src/THCUNN/TemporalUpSamplingLinear.cu +++ b/aten/src/THCUNN/TemporalUpSamplingLinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git 
a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu index c87129da7e1563..225e319423e8ad 100644 --- a/aten/src/THCUNN/TemporalUpSamplingNearest.cu +++ b/aten/src/THCUNN/TemporalUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Threshold.cu b/aten/src/THCUNN/Threshold.cu index 37290894103919..1d44e442e21277 100644 --- a/aten/src/THCUNN/Threshold.cu +++ b/aten/src/THCUNN/Threshold.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu index 84e2c7f7063c3a..89ecfe09bfaf94 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu index 6d542ba39037ee..3e631b2755e9bd 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/VolumetricAveragePooling.cu b/aten/src/THCUNN/VolumetricAveragePooling.cu index 110eac44dcb997..66e89d2a950b72 100644 --- a/aten/src/THCUNN/VolumetricAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAveragePooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricConvolution.cu b/aten/src/THCUNN/VolumetricConvolution.cu index da66140fb61537..2e405f3c3b00c2 100644 --- a/aten/src/THCUNN/VolumetricConvolution.cu +++ b/aten/src/THCUNN/VolumetricConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "THCTensor.hpp" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" // Kernel for fast unfold+copy diff --git a/aten/src/THCUNN/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/VolumetricDilatedConvolution.cu index 8a32c70b6701ad..09fc83b01a54f2 100644 --- a/aten/src/THCUNN/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu index 1a0f2f617d343e..bff981f73d3a3a 100644 --- a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu 
b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu index e6260ceabbe282..4875ae9f7da07a 100644 --- a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFullConvolution.cu b/aten/src/THCUNN/VolumetricFullConvolution.cu index 556b5bc1d4a5ac..6c4ace126e04e7 100644 --- a/aten/src/THCUNN/VolumetricFullConvolution.cu +++ b/aten/src/THCUNN/VolumetricFullConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu index c5c7196bac899e..d315ace7582440 100644 --- a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/VolumetricMaxUnpooling.cu index eac3b2d17af5e4..0974ebc763cd8d 100644 --- a/aten/src/THCUNN/VolumetricMaxUnpooling.cu +++ b/aten/src/THCUNN/VolumetricMaxUnpooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricReplicationPadding.cu b/aten/src/THCUNN/VolumetricReplicationPadding.cu index 27ea3ecad3faa2..e9ff31de27240b 100644 --- a/aten/src/THCUNN/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/VolumetricReplicationPadding.cu @@ -5,7 +5,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu index babbd58b0d4a0f..2f06bdaa78ca0b 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu index 0f353b91acb7ea..ea4c50433370fe 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index 74eb2669300241..44dc23ade9cd8d 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -150,13 +150,6 @@ "gamma_grad_cuda_kernel", ] }, - { - "path": 
"aten/src/THC/THCGeneral.cpp", - "functions": [ - "THC_float2half", - "THC_half2float" - ] - }, { "path": "aten/src/THCUNN/generic/SparseLinear.cu", "functions": [ diff --git a/torch/csrc/cuda/THCP.h b/torch/csrc/cuda/THCP.h index bd3b2d1c5580cd..8ed6f6bf91ce2c 100644 --- a/torch/csrc/cuda/THCP.h +++ b/torch/csrc/cuda/THCP.h @@ -4,7 +4,7 @@ #include "torch/csrc/python_headers.h" #include #include -#include +#include #include #include "torch/csrc/THP.h" diff --git a/torch/csrc/generic/utils.h b/torch/csrc/generic/utils.h index ade810564fd2ae..bd7c2fb1050bff 100644 --- a/torch/csrc/generic/utils.h +++ b/torch/csrc/generic/utils.h @@ -15,7 +15,7 @@ typedef class THPPointer THWStoragePtr; typedef class THPPointer THWTensorPtr; typedef class THPPointer THPStoragePtr; -#if (!defined(THC_GENERIC_FILE) || defined(THC_REAL_IS_HALF)) && \ +#if (!defined(THC_GENERIC_FILE)) && \ (!defined(THD_GENERIC_FILE)) template<> struct THPUtils_typeTraits { diff --git a/torch/csrc/utils.h b/torch/csrc/utils.h index 60b9b8b05352d9..1d1d0188940216 100644 --- a/torch/csrc/utils.h +++ b/torch/csrc/utils.h @@ -74,13 +74,8 @@ #define THPFloatUtils_unpackAccreal(object) (double)THPUtils_unpackReal_FLOAT(object) #define THPFloatUtils_newAccreal(value) THPUtils_newReal_FLOAT(value) #define THPHalfUtils_checkReal(object) THPUtils_checkReal_FLOAT(object) -#ifndef THP_HOST_HALF -#define THPHalfUtils_unpackReal(object) (half)THC_float2half(THPUtils_unpackReal_FLOAT(object)) -#define THPHalfUtils_newReal(value) PyFloat_FromDouble(THC_half2float(value)) -#else -#define THPHalfUtils_unpackReal(object) TH_float2half(THPUtils_unpackReal_FLOAT(object)) -#define THPHalfUtils_newReal(value) PyFloat_FromDouble(TH_half2float(value)) -#endif +#define THPHalfUtils_unpackReal(object) (at::Half)THPUtils_unpackReal_FLOAT(object) +#define THPHalfUtils_newReal(value) PyFloat_FromDouble(value) #define THPHalfUtils_checkAccreal(object) THPUtils_checkReal_FLOAT(object) #define THPHalfUtils_unpackAccreal(object) (double)THPUtils_unpackReal_FLOAT(object) #define THPHalfUtils_newAccreal(value) THPUtils_newReal_FLOAT(value) From 5da0b31bee56038d9a4c197a7b05536d8bb3dd44 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 12 Sep 2018 17:36:35 -0700 Subject: [PATCH 106/237] More native docs on TensorOptions. (#11558) Summary: Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11558 Differential Revision: D9783655 Pulled By: ezyang fbshipit-source-id: 17c749c9ef99fd9dfd0ff365ebfe22102fb891d7 --- aten/src/ATen/native/README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 9b06a513a14cd5..df937d6464b487 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -74,10 +74,19 @@ signature. - `std::array` (where N is `1-4`). NB: you MUST NOT put a space after the comma, otherwise this argument will not parse correctly. (If you decide to fix this, make sure you fix the argument parser both in ATen and in PyTorch.) +- `TensorOptions`. Tensor options provide information about how a + tensor should be constructed; it is most useful when you are writing a + factory function, where you have no `Tensor` inputs and thus + cannot otherwise determine how to construct a `Tensor`. 
- `*` is a special sentinel argument, which doesn't translate into an actual argument, but indicates that in the Python bindings, any subsequent arguments must be specified as keyword arguments (and cannot be provided positionally). +Functions with no tensor inputs are called *factory functions*, and +are handled specially by code generation. If your function is behaving +differently than another example, check first and see if one is a +factory while another is not. + **Return types.** These types are permissible as ReturnType: - `Tensor` and `TensorList`, which translate into the C++ types `Tensor` and `std::vector`, @@ -218,8 +227,9 @@ direct consequences on valid implementations: * Never create a `Tensor` directly (e.g., `at::CPU` or `at::CUDA`), as a caller will be expecting to get `Variable`s out if it passes `Variable`. - Instead, create tensors from the `type()` of one of the input tensors, e.g., - `input.type().tensor()` or `input.type().toScalarType(kByte)` if you need + Instead, create tensors using the `options()` of one of the input + tensors. E.g., `at::empty(sizes, input.options())` or + `at::ones(input.options().dtype(kByte))`, if you need a different scalar type. * If you need to call other ATen functions, be sure to qualify the call From 0a6931cfee93a4c70d17980786337799ed5d56ee Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Wed, 12 Sep 2018 18:23:57 -0700 Subject: [PATCH 107/237] Only reference ONNX through onnx_pb.h (#11609) Summary: I think this is needed to land https://github.com/onnx/onnx/pull/1407 without CI errors. cc mingzhe09088 houseroad Pull Request resolved: https://github.com/pytorch/pytorch/pull/11609 Reviewed By: houseroad Differential Revision: D9803490 Pulled By: orionr fbshipit-source-id: 26193f38ab0a2eef9ad7d0da9a0310dc40ef0f2d --- torch/csrc/jit/export.cpp | 2 +- torch/csrc/jit/import.cpp | 2 +- torch/csrc/onnx/init.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index c4ef14f191f0ee..0d304a91583e8d 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -1,7 +1,7 @@ #include "torch/csrc/jit/export.h" #include "torch/csrc/jit/serialization.h" #include "torch/csrc/autograd/symbolic.h" -#include "onnx/onnx.pb.h" +#include "onnx/onnx_pb.h" #include "torch/csrc/onnx/onnx.h" #include "torch/csrc/utils/functional.h" diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index d2f828a2517ef8..0ca9f79e6f50dc 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -1,6 +1,6 @@ #include "torch/csrc/jit/import.h" #include "torch/csrc/jit/serialization.h" -#include "onnx/onnx.pb.h" +#include "onnx/onnx_pb.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/jit/assertions.h" diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index ea78e83ad01b36..fe40b60ce674f7 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -1,6 +1,6 @@ #include "torch/csrc/onnx/init.h" #include "torch/csrc/onnx/onnx.h" -#include "onnx/onnx.pb.h" +#include "onnx/onnx_pb.h" namespace torch { namespace onnx { void initONNXBindings(PyObject* module) { From 17637f2b03b2dc9a64fb31ff9aafb2503bb05305 Mon Sep 17 00:00:00 2001 From: Guan Pang Date: Wed, 12 Sep 2018 18:39:15 -0700 Subject: [PATCH 108/237] enable_mkl support for resnet18+lstm model Summary: * Many op in lstm part of the model don't have implementation in ideep/mkl, and it doesn't make sense to copy back and forth for the 
few available ops because majority of RNN will be on CPU * Thus the strategy is to enable mkl only for the resnet18 part of the model, then switch to default cpu engine for the lstm part * The net may contain some external_inputs falsely added during ONNX->Caffe2. Canary in service shows their existence could leads to service crash (presumably due to these blob somehow get shared between threads). They're now manually removed which seem to be enough to avoid the crash. Reviewed By: viswanathgs Differential Revision: D8888763 fbshipit-source-id: da7761bcb7d876ff7bbb6640ae4b24712c0b1de6 --- caffe2/python/mkl/rewrite_graph.py | 125 +++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index ae8b6cd889ef38..146d5eb53cebe9 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -86,6 +86,131 @@ def mkl_tmp(name): fix_BoxWithNMSLimit(net) +def rewrite_run_net_simple_xrayocr_lstm(net, ideep=True): + # For xrayocr model with lstm, only rewrite the non-lstm part of the net to + # enable mkl, then copy the temporary output blob at the break point + # and all external inputs for lstm part to cpu, and execuate rest of the net + # (two lstm) on cpu + # This only works for the xrayocr lstm model which uses the first 'Shape' op + # to decide the break point, and after two lstm it's external_output + # directly so there's no need to copy back to ideep/mkl + + def mkl_tmp(name): + return "{}__MKL__".format(name) + + def cpu_tmp(name): + return "{}__CPU__".format(name) + + input_blob = net.external_input[0] + if input_blob != net.op[0].input[0]: + raise Exception( + "Input blob: {} is not consumed by first op: {}".format( + input_blob, net.op[0])) + # Modify input/outputs to point to copied MKL blobs. 
+ from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL" + to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU" + copy_input_op = core.CreateOperator( + from_cpu, input_blob, mkl_tmp(input_blob)) + net.op[0].input[0] = mkl_tmp(input_blob) + + # the net may contain some external_inputs falsely added during ONNX->Caffe2 + # This should be taken care of in early steps during pytorch_to_caffe2, + # but if not it can cause issue in follow up steps, so check here to confirm + for input_blob in net.external_input: + for op in net.op: + # look for if the external_input blob is output of any op in the net + assert input_blob not in op.output + + external_output = None + external_inputs_to_cpu = set() + find_first_shape_op = False + cpu_op_start_idx = -1 + for op_idx, op in enumerate(net.op): + # the first Shape op mark the starting point of LSTM chunk of the net + if not find_first_shape_op: + if op.type == 'Shape': + external_output = op.input + find_first_shape_op = True + cpu_op_start_idx = op_idx + else: + # any external input in the LSTM part need to be copied to CPU + for in_blob in op.input: + if in_blob in net.external_input: + external_inputs_to_cpu.add(in_blob) + + # make sure we found the expected break point of the net + assert external_output is not None + + # create op to copy external input blobs used in LSTM part from IDEEP to CPU + copy_extra_input_ops = [] + for in_blob in external_inputs_to_cpu: + copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob, + cpu_tmp(in_blob))) + # rename input blobs in LSTM part to use the CPU copy + for op in net.op[cpu_op_start_idx:]: + renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob) + for blob in op.input] + op.input[:] = renamed_input + + copy_output_ops = [ + core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) + for output_blob in external_output] + + for output_blob in external_output: + last_producer_idx = last_producer(net.op, output_blob) + renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob) + for blob in net.op[last_producer_idx].output] + net.op[last_producer_idx].output[:] = renamed_outputs + + # rearrange all ops in correct order + ops = [copy_input_op] + net.op[:cpu_op_start_idx] \ + + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:] + del net.op[:] + net.op.extend(ops) + + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN + for op in net.op: + # the first Shape op mark the starting point of LSTM chunk of the net + if op.type == 'Shape': + # all LSTM ops should run on CPU + device = caffe2_pb2.CPU + op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + op.engine = "" + + # RecurrentNetwork has a nested step_net that needs special treatment + if op.type == 'RecurrentNetwork': + for arg in op.arg: + if arg.name == 'step_net': + for nested_op in arg.n.op: + # set device to CPU + nested_op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + nested_op.engine = "" + + # rename inputs in op of nested net + renamed_input = [] + for blob in nested_op.input: + renamed_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + nested_op.input[:] = renamed_input + + # rename external inputs of nested net + new_external_input = [] + for blob in arg.n.external_input: + new_external_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + arg.n.external_input[:] = new_external_input + + if ideep: + # Temporarily disbale conv+relu fusion until we verify further + # net.ParseFromString( + # 
C.transform_optimizeForIDEEP(net.SerializeToString())) + fix_BoxWithNMSLimit(net) + + def rewrite_model_helper_simple(model, ideep=True): model = copy.deepcopy(model) # All parameter initialization should run on MKL From 6f05b5ee5464d8c3386a8d1a059c327a6aaf5a59 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 21:47:19 -0700 Subject: [PATCH 109/237] Pin Sphinx to 1.7.9 (#11620) Summary: Sphinx 1.8.0 breaks us. Upgrading is tracked in #11618. Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11620 Differential Revision: D9806440 Pulled By: ezyang fbshipit-source-id: 7a8d849c78e697a8775d00cd3a463a7bdbcddabe --- .jenkins/pytorch/build.sh | 5 ++--- docs/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3159ee1ad629e6..2dc64157c5d00d 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -117,9 +117,8 @@ fi if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here - # TODO: Reenable doc build - #pip install -r requirements.txt || true - #LC_ALL=C make html + pip install -r requirements.txt || true + LC_ALL=C make html popd fi diff --git a/docs/requirements.txt b/docs/requirements.txt index 159253ca0a6e33..a8d4eb2a85d11e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ -sphinx +sphinx==1.7.9 -e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme sphinxcontrib.katex From e998038bc009a872eebf29cf135f455862fb6715 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 12 Sep 2018 22:13:06 -0700 Subject: [PATCH 110/237] Use TypeMeta instead of TypeIdentifier within at::StorageImpl (#11236) Summary: Further aligns at::StorageImpl with caffe2::StorageImpl Pull Request resolved: https://github.com/pytorch/pytorch/pull/11236 Differential Revision: D9776286 Pulled By: cpuhrsch fbshipit-source-id: f2c53995fcece013b77b3a1f709ab0f9df8ab23e --- aten/src/ATen/UndefinedType.cpp | 3 +++ aten/src/ATen/UndefinedType.h | 1 + aten/src/ATen/Utils.h | 4 ++-- aten/src/ATen/core/ScalarType.h | 12 ++++++++++++ aten/src/ATen/core/Storage.cpp | 8 ++++---- aten/src/ATen/core/Storage.h | 6 +++--- aten/src/ATen/core/StorageImpl.cpp | 7 +++---- aten/src/ATen/core/StorageImpl.h | 14 +++++++------- aten/src/ATen/core/TensorImpl.cpp | 4 ++-- aten/src/ATen/core/TensorImpl.h | 6 ++++++ aten/src/ATen/core/Type.h | 1 + aten/src/ATen/templates/SparseTypeDerived.cpp | 3 +++ aten/src/ATen/templates/Type.h | 1 + aten/src/ATen/templates/TypeDefault.cpp | 8 ++++---- aten/src/ATen/templates/TypeDerived.cpp | 6 ++++++ aten/src/ATen/templates/TypeDerived.h | 1 + aten/src/TH/THStorageFunctions.cpp | 4 ++-- aten/src/TH/THStorageFunctions.hpp | 2 +- aten/src/TH/THTensor.cpp | 6 +++--- aten/src/TH/generic/THStorage.cpp | 16 ++++++++-------- aten/src/THC/THCStorage.cpp | 4 ++-- aten/src/THC/THCStorage.hpp | 2 +- aten/src/THC/THCTensor.cpp | 9 +++++---- aten/src/THC/THCTensor.hpp | 2 +- aten/src/THC/THCTensorCopy.cu | 3 +-- aten/src/THC/generic/THCStorage.cpp | 9 +++++---- aten/src/THC/generic/THCTensorCopy.cu | 4 ++-- .../complex_registration_extension.cpp | 8 +++++++- tools/autograd/templates/VariableType.cpp | 3 +++ tools/autograd/templates/VariableType.h | 1 + torch/csrc/DynamicTypes.cpp | 2 +- torch/csrc/jit/import.cpp | 2 +- 32 files changed, 103 insertions(+), 59 deletions(-) diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 
bea9baf61892f5..9c3457bdd4803f 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -8,6 +8,9 @@ UndefinedType::UndefinedType() ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } +caffe2::TypeMeta UndefinedType::typeMeta() const { + AT_ERROR("typeMeta not defined for UndefinedType"); +} Backend UndefinedType::backend() const { return Backend::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 594fb99e61dc0e..4ccd6101851a72 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -14,6 +14,7 @@ namespace at { struct UndefinedType final : public TypeDefault { explicit UndefinedType(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual Allocator* allocator() const override; virtual Device getDeviceFromPtr(void* data) const override; diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index cea2f83d3e9180..c4473d1471ab7d 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -44,12 +44,12 @@ static inline const Storage& checked_storage( name, "'"); } - if (expr.dtype() != data_type) { + if (expr.dtype().id() != data_type) { AT_ERROR( "Expected object of data type ", data_type, " but got data type ", - expr.dtype(), + expr.dtype().id(), " for argument #", pos, " '", diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index b5e1a47646d7d6..6fe88bfadb05f5 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -80,6 +80,18 @@ static inline DataType scalarTypeToDataType(ScalarType scalar_type) { #undef DEFINE_CASE } +static inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { +#define DEFINE_CASE(ctype,name,_) \ + case ScalarType:: name : return caffe2::TypeMeta::Make(); + + switch(scalar_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) + case ScalarType::Undefined: return caffe2::TypeMeta(); + default: AT_ERROR("Unrecognized Scalartype ", scalar_type, " (please report this error)"); + } +#undef DEFINE_CASE +} + static inline ScalarType dataTypeToScalarType(DataType dtype) { #define DEFINE_IF(ctype,name,_) \ if (dtype == caffe2::TypeMeta::Id()) { \ diff --git a/aten/src/ATen/core/Storage.cpp b/aten/src/ATen/core/Storage.cpp index 21f3e35ada4fd2..854a68876e2e5d 100644 --- a/aten/src/ATen/core/Storage.cpp +++ b/aten/src/ATen/core/Storage.cpp @@ -3,24 +3,24 @@ namespace at { Storage::Storage( - at::ScalarType scalar_type, + caffe2::TypeMeta data_type, size_t size, Allocator* allocator, bool resizable) : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, size, allocator, resizable)) {} Storage::Storage( - at::ScalarType scalar_type, + caffe2::TypeMeta data_type, at::DataPtr data_ptr, size_t size, const std::function& deleter, bool resizable) : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, size, std::move(data_ptr), /* allocator */ nullptr, diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index 6f2a8fd68ee716..656ddb972c7959 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -11,12 +11,12 @@ struct AT_API Storage { Storage(const c10::intrusive_ptr& ptr) : storage_impl_(ptr) {} Storage(c10::intrusive_ptr&& ptr) : storage_impl_(std::move(ptr)) {} Storage( - at::ScalarType, + caffe2::TypeMeta, size_t size, Allocator* allocator, bool resizable = 
false); Storage( - at::ScalarType, + caffe2::TypeMeta, at::DataPtr, size_t size, const std::function& deleter, @@ -33,7 +33,7 @@ struct AT_API Storage { bool resizable() const { return storage_impl_->resizable(); } // get() use here is to get const-correctness void* data() const { return storage_impl_.get()->data(); } - const at::DataType dtype() const { + const caffe2::TypeMeta dtype() const { return storage_impl_->dtype(); } const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } diff --git a/aten/src/ATen/core/StorageImpl.cpp b/aten/src/ATen/core/StorageImpl.cpp index b0f82132e39cb8..9c8bdfc45d3b70 100644 --- a/aten/src/ATen/core/StorageImpl.cpp +++ b/aten/src/ATen/core/StorageImpl.cpp @@ -3,7 +3,7 @@ namespace at { StorageImpl::StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, @@ -15,15 +15,14 @@ StorageImpl::StorageImpl( allocator_(allocator) {} StorageImpl::StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::Allocator* allocator, bool resizable) : StorageImpl( data_type, numel, - allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * numel), + allocator->allocate(data_type.itemsize() * numel), allocator, resizable) {} diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index e80c11c6b0e211..0b2b635745abb2 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -15,13 +15,13 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { StorageImpl() = delete; ~StorageImpl() {}; StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable); StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::Allocator* allocator, bool resizable); @@ -34,10 +34,10 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { auto data_type_T = at::scalarTypeToDataType(at::CTypeToScalarType::to()); - if (dtype() != data_type_T) { + if (dtype().id() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", - dtype(), + dtype().id(), " as data type ", data_type_T); } @@ -56,7 +56,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { void operator=(const StorageImpl&) = delete; size_t itemsize() const { - return at::elementSize(dataTypeToScalarType(data_type_)); + return data_type_.itemsize(); } Type& type(); @@ -93,7 +93,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { at::Allocator* allocator() { return allocator_; }; - const DataType dtype() const { + const caffe2::TypeMeta dtype() const { return data_type_; } const at::Allocator* allocator() const { @@ -114,7 +114,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { } private: - at::DataType data_type_; + caffe2::TypeMeta data_type_; at::DataPtr data_ptr_; int64_t numel_; bool resizable_; diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index d5e5781b038ca5..1bf6802c7d4d8e 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -22,12 +22,12 @@ TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator * // UndefinedTensors and SparseTensors don't have storages. 
if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalar_type, 0, allocator, true); + storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); } } TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype()), is_variable) {} + : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) : storage_(std::move(storage)), diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 3f41c1163c563b..831c53e23b6d49 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -111,10 +111,16 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { return storage_.unsafe_data() + storage_offset_; } + // TODO: Remove this once we get rid of scalar_type and use dmeta or dtype + // instead. inline at::ScalarType scalar_type() const { return scalar_type_; } + inline caffe2::TypeMeta dtype() const { + return storage_.dtype(); + } + virtual int64_t storage_offset() const { return storage_offset_; } diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 6284e7634bb755..e8b2e2d40980c2 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -82,6 +82,7 @@ struct AT_API Type { virtual ~Type() {} virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; virtual Backend backend() const = 0; Layout layout() const noexcept { return layout_from_backend(backend()); } virtual bool is_cuda() const = 0; diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 4bc8265a258f70..dc419b9ad52c01 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -32,6 +32,9 @@ namespace at { ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } +caffe2::TypeMeta ${Type}::typeMeta() const { + AT_ERROR("typeMeta not defined for SparseType"); +} Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 18fb393aeca759..ef49b2880dcec7 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -53,6 +53,7 @@ struct AT_API Type { virtual ~Type() {} virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; virtual Backend backend() const = 0; Layout layout() const noexcept { return layout_from_backend(backend()); } virtual bool is_cuda() const = 0; diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 530aa09fddb734..5e68470ecc0abd 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -94,20 +94,20 @@ Tensor TypeDefault::tensorWithAllocator(IntList sizes, IntList strides, Allocato } Storage TypeDefault::storage(bool resizable) const { - return Storage(scalarType(), 0, allocator(), resizable); + return Storage(typeMeta(), 0, allocator(), resizable); } Storage TypeDefault::storage(size_t size, bool resizable) const { - return Storage(scalarType(), size, allocator(), resizable); + return Storage(typeMeta(), size, allocator(), resizable); } Storage 
TypeDefault::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return Storage( - scalarType(), + typeMeta(), InefficientStdFunctionContext::makeDataPtr(data, deleter, getDeviceFromPtr(data)), size, deleter); } Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) const { - return Storage(scalarType(), size, allocator); + return Storage(typeMeta(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index bc2e12acb14f29..d012274c5fceed 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -32,9 +32,15 @@ namespace at { ${Type}::${Type}() : ${DenseBackend}TypeDefault(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} + ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } + +caffe2::TypeMeta ${Type}::typeMeta() const { + return caffe2::TypeMeta::Make<${ScalarType}>(); +} + Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index 3a48d8b26e32b4..116df9b4d465fe 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -19,6 +19,7 @@ namespace at { struct ${Type} final : public ${DenseBackend}TypeDefault { explicit ${Type}(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual const char * toString() const override; virtual size_t elementSizeInBytes() const override; diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index a5319e67dabe61..c4d1f778250ea2 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -15,9 +15,9 @@ #include "generic/THStorageCopy.cpp" #include "THGenerateHalfType.h" -THStorage* THStorage_new(at::ScalarType scalar_type) { +THStorage* THStorage_new(caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, getTHDefaultAllocator(), true).release(); diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 93a89f1753f000..95e5bacc2cef55 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -30,7 +30,7 @@ // If it is not, you must report that the storage is dead. 
// -TH_CPP_API THStorage* THStorage_new(at::ScalarType scalar_type); +TH_CPP_API THStorage* THStorage_new(caffe2::TypeMeta data_type); TH_API ptrdiff_t THStorage_size(const THStorage *self); TH_API void THStorage_retain(THStorage *storage); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 5f861457c58ef4..2d1200eebd37aa 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -39,14 +39,14 @@ void THTensor_setStorageNd(THTensor *self, THStorage *storage, ptrdiff_t storage if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); + auto data_type = THTensor_getStoragePtr(self)->dtype(); if(storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THStorage_new(scalar_type)); + THTensor_stealAndSetStoragePtr(self, THStorage_new(data_type)); } } @@ -123,7 +123,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(totalSize+self->storage_offset() > 0) { if(!THTensor_getStoragePtr(self)) { - THTensor_stealAndSetStoragePtr(self, THStorage_new(self->scalar_type())); + THTensor_stealAndSetStoragePtr(self, THStorage_new(self->dtype())); } if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 7ed962567a31ff..6dfd90cfbe1bd2 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType::to()); + return THStorage_new(caffe2::TypeMeta::Make()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -48,18 +48,18 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType::to(); + auto type_meta = caffe2::TypeMeta::Make(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + type_meta, size, THMapAllocator::makeDataPtr( - filename, flags, size * at::elementSize(scalar_type), &actual_size), + filename, flags, size * type_meta.itemsize(), &actual_size), /* allocator */ nullptr, false).release(); if (size <= 0) { - storage->set_numel(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / type_meta.itemsize()); } return storage; @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index 
3d1630adedbe5b..b6c52791eb56d6 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -55,9 +55,9 @@ int THCStorage_getDevice(THCState* state, const THCStorage* storage) { THC_API THCStorage* THCStorage_new( THCState* state, - at::ScalarType scalar_type) { + caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, state->cudaDeviceAllocator, true).release(); diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index ee683feced4bfb..3eedf4a907144f 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -16,7 +16,7 @@ struct CTypeToScalarType<__half> : public CTypeToScalarType {}; } -THC_API THCStorage* THCStorage_new(THCState* state, at::ScalarType); +THC_API THCStorage* THCStorage_new(THCState* state, caffe2::TypeMeta); THC_API void THCStorage_retain(THCState *state, THCStorage *storage); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index b35551301383f8..b5f8d391dc8a96 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -40,8 +40,9 @@ int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, return THTensor_strideLegacyNoScalars(self, dim); } -THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type) { - switch(scalar_type) { +THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta) { + auto scalar_type = at::dataTypeToScalarType(type_meta.id()); + switch (scalar_type) { case at::ScalarType::Byte: return THCudaByteTensor_new(state); case at::ScalarType::Char: @@ -189,13 +190,13 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); + auto data_type = THTensor_getStoragePtr(self)->dtype(); if (storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, scalar_type)); + THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, data_type)); } } diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index 3505354c834bb4..8fecaf0b2296fc 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -21,7 +21,7 @@ THC_API int64_t THCTensor_sizeLegacyNoScalars(THCState *state, const THCTensor * THC_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, int dim); -THC_API THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type); +THC_API THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta); THC_API void THCTensor_resize(THCState *state, THCTensor *tensor, at::IntList size, at::IntList stride); THC_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride); diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index a3a8ed2e182086..844539c4ac0b27 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -136,8 +136,7 @@ void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src) { } else { // Types are different // Copy into the new format, contiguous, on the source device - srcContig = THCTensor_new(state, - at::CTypeToScalarType::to()); + srcContig = THCTensor_new(state, 
caffe2::TypeMeta::Make()); THCTensor_resizeAs(state, srcContig, dst); bool succ = diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index 8918a449e19585..36a001059a5787 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -3,6 +3,7 @@ #else #include +#include scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self) { @@ -43,7 +44,7 @@ scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t ind THCStorage* THCStorage_(new)(THCState *state) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), 0, state->cudaDeviceAllocator, true).release(); @@ -53,7 +54,7 @@ THCStorage* THCStorage_(new)(THCState *state) THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, state->cudaDeviceAllocator, true).release(); @@ -64,7 +65,7 @@ THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -117,7 +118,7 @@ THCStorage* THCStorage_(newWithDataAndAllocator)( ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 4919a7a2a364ab..25ded22747441f 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -10,8 +10,8 @@ THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { template <> THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { - THCTensor* tensor = THCTensor_new( - state, at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype())); + THCTensor* tensor = + THCTensor_new(state, THTensor_getStoragePtr(self)->dtype()); THCTensor_resizeAs(state, tensor, self); THC_copyTensor(state, tensor, self); return tensor; diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index acfaccced2be09..db75e3f67f7772 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -33,6 +33,7 @@ struct CPUComplexFloatType : public at::CPUTypeDefault { /*is_undefined=*/false) {} ScalarType scalarType() const override; + caffe2::TypeMeta typeMeta() const override; Backend backend() const override; const char* toString() const override; size_t elementSizeInBytes() const override; @@ -49,7 +50,7 @@ struct CPUComplexFloatType : public at::CPUTypeDefault { numel *= s; } Storage s{c10::make_intrusive( - scalarTypeToDataType(ScalarType::ComplexFloat), + scalarTypeToTypeMeta(ScalarType::ComplexFloat), numel, getCPUAllocator(), /* resizable */ true)}; @@ -73,6 +74,10 @@ ScalarType CPUComplexFloatType::scalarType() const { return ScalarType::ComplexFloat; } +caffe2::TypeMeta CPUComplexFloatType::typeMeta() const { + return scalarTypeToTypeMeta(ScalarType::ComplexFloat); +} + Backend CPUComplexFloatType::backend() const { return Backend::CPU; } @@ -80,6 +85,7 @@ Backend CPUComplexFloatType::backend() const { const char* CPUComplexFloatType::toString() 
const { return "CPUComplexFloatType"; } + TypeID CPUComplexFloatType::ID() const { return TypeID::CPUComplexFloat; } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 589bbf8323f155..c6a84afe710a89 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -52,6 +52,9 @@ VariableType::VariableType(Context* context, Type* baseType) ScalarType VariableType::scalarType() const { return baseType->scalarType(); } +caffe2::TypeMeta VariableType::typeMeta() const { + return baseType->typeMeta(); +} Backend VariableType::backend() const { return baseType->backend(); } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index d1e847a141aace..19de1b910a0b69 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -36,6 +36,7 @@ void register_variable_type_for(at::Type* baseType); struct TORCH_API VariableType final : public at::TypeDefault { VariableType(Context* context, at::Type* baseType); at::ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; at::Backend backend() const override; at::Allocator* allocator() const override; at::Device getDeviceFromPtr(void * data) const override; diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index 5d1e991574b341..bed40cb67a2d0e 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -71,7 +71,7 @@ PyTypeObject* getPyTypeObject(const at::Storage& storage) { auto attype = at::globalContext().getNonVariableTypeOpt( deviceTypeToBackend(storage.device_type()), - at::dataTypeToScalarType(storage.dtype())); + at::dataTypeToScalarType(storage.dtype().id())); auto it = attype_to_py_storage_type.find(attype); if (it != attype_to_py_storage_type.end()) { return it->second; diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 0ca9f79e6f50dc..8e6a07da0a00ca 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -315,7 +315,7 @@ at::Tensor ModuleDecoder::buildTensorCommon( int64_t size; std::tie(storage_ptr, size) = file_reader_.getRecordWithKey(record_number); auto storage = std::make_shared( - at::CPU(type).scalarType(), + at::CPU(type).typeMeta(), std::move(storage_ptr), size, nullptr); From 44b2b6b150e95f60b9858f28205a83d03440cba4 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 12 Sep 2018 22:42:18 -0700 Subject: [PATCH 111/237] clean up jit generated tests (#11403) Summary: Clean up some generated tests after we have newly nice features like var args. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11403 Differential Revision: D9800545 Pulled By: wanchaol fbshipit-source-id: e9973b113f78dc38cf99a81b6ede3fa3485f1cfa --- test/test_jit.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 4aa43b1c8596a5..6d97b88514ecfa 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -7428,28 +7428,6 @@ def forward(self, x, y): 'test_var_dim_neg0', 'test_norm_inf', 'test_renorm_norm_inf', - 'test_split', - 'test_expand', - 'test_expand_1_element', - 'test_expand_new_dim', - 'test_expand_new_dim_front_old_front_1', - 'test_expand_scalar_to_dims', - 'test_expand_size', - 'test_permute', - 'test_permute_neg_dim', - 'test_repeat', - 'test_repeat_scalar', - 'test_repeat_single_number', - 'test_repeat_unsqueeze', - 'test_reshape_1d', - 'test_reshape_scalar_to_1d', - 'test_view', - 'test_view_1d', - 'test_view_scalar_to_1d', - 'test_split_dim', - 'test_split_dim_neg0', - 'test_gesv', - 'test_inverse', 'test_matrix_power_n=-1', # involves inverse 'test_matrix_power_n=-3', # involves inverse # skipped nn functional tests From cac11a4ac388af775c4e6aa4b31921601906a98d Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Thu, 13 Sep 2018 01:12:34 -0700 Subject: [PATCH 112/237] Merge caffe2::/at::StorageImpl (#11543) Summary: Merges caffe2::StorageImpl methods with at::StorageImpl methods and defines caffe2::StorageImpl as at::StorageImpl. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11543 Differential Revision: D9795228 Pulled By: cpuhrsch fbshipit-source-id: fbd6fa3cbf6c9099a4803337286c30e00652f95c --- aten/src/ATen/core/StorageImpl.cpp | 28 ----- aten/src/ATen/core/StorageImpl.h | 132 ++++++++++++++++++++--- aten/src/THC/THCTensor.cpp | 1 - caffe2/core/storage.h | 167 +---------------------------- 4 files changed, 121 insertions(+), 207 deletions(-) diff --git a/aten/src/ATen/core/StorageImpl.cpp b/aten/src/ATen/core/StorageImpl.cpp index 9c8bdfc45d3b70..5190a7766dcb49 100644 --- a/aten/src/ATen/core/StorageImpl.cpp +++ b/aten/src/ATen/core/StorageImpl.cpp @@ -1,29 +1 @@ #include - -namespace at { - -StorageImpl::StorageImpl( - caffe2::TypeMeta data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : data_type_(data_type), - data_ptr_(std::move(data_ptr)), - numel_(numel), - resizable_(resizable), - allocator_(allocator) {} - -StorageImpl::StorageImpl( - caffe2::TypeMeta data_type, - int64_t numel, - at::Allocator* allocator, - bool resizable) - : StorageImpl( - data_type, - numel, - allocator->allocate(data_type.itemsize() * numel), - allocator, - resizable) {} - -} // namespace at diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index 0b2b635745abb2..cc63bd00906669 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -12,24 +12,65 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: - StorageImpl() = delete; - ~StorageImpl() {}; StorageImpl( caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, - bool resizable); + bool resizable) + : data_type_(data_type), + data_ptr_(std::move(data_ptr)), + numel_(numel), + resizable_(resizable), + allocator_(allocator) { + if (numel > 0) { + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "Constructing a storage with meta of unknown type and non-zero numel"); + } + } + } + StorageImpl( caffe2::TypeMeta data_type, 
int64_t numel, at::Allocator* allocator, - bool resizable); + bool resizable) + : StorageImpl( + data_type, + numel, + allocator->allocate(data_type.itemsize() * numel), + allocator, + resizable) {} + + explicit StorageImpl(at::DeviceType device_type) + : StorageImpl(device_type, caffe2::TypeMeta()) {} + + StorageImpl(at::DeviceType device_type, caffe2::TypeMeta data_type) + : StorageImpl( + data_type, + 0, + at::DataPtr(nullptr, at::Device(device_type)), + nullptr, + true) {} + + StorageImpl& operator=(StorageImpl&& other) = default; + StorageImpl& operator=(const StorageImpl&) = delete; + StorageImpl() = delete; + StorageImpl(StorageImpl&& other) = default; StorageImpl(StorageImpl&) = delete; StorageImpl(const StorageImpl&) = delete; - // NB: Don't move ref count! - StorageImpl(StorageImpl&& other) = default; - StorageImpl& operator=(StorageImpl&& other) = default; + ~StorageImpl() = default; + + void reset() { + data_ptr_.clear(); + numel_ = 0; + } + + template + inline bool IsType() const { + return data_type_.Match(); + } template inline T* data() const { @@ -53,52 +94,77 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { data_ptr_.clear(); } - void operator=(const StorageImpl&) = delete; - size_t itemsize() const { return data_type_.itemsize(); } Type& type(); + size_t capacity() const { + return numel_ * itemsize(); + } + int64_t numel() const { return numel_; }; + + // TODO: remove later void set_numel(int64_t numel) { numel_ = numel; }; + bool resizable() const { return resizable_; }; + at::DataPtr& data_ptr() { return data_ptr_; }; + const at::DataPtr& data_ptr() const { return data_ptr_; }; + // Returns the previous data_ptr at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { std::swap(data_ptr_, data_ptr); return std::move(data_ptr); }; + + // XXX: TERRIBLE! DONT USE UNLESS YOU HAVE TO! AND EVEN THEN DONT, JUST DONT! + // Setting the data_type will require you to audit many other parts of the + // struct again to make sure it's still valid. 
+ void set_dtype(const caffe2::TypeMeta& data_type) { + int64_t capacity = numel_ * data_type_.itemsize(); + data_type_ = data_type; + numel_ = capacity / data_type_.itemsize(); + } + + // TODO: Return const ptr eventually if possible void* data() { return data_ptr_.get(); - }; - const void* data() const { + } + + void* data() const { return data_ptr_.get(); - }; + } + at::DeviceType device_type() const { return data_ptr_.device().type(); } + at::Allocator* allocator() { return allocator_; - }; - const caffe2::TypeMeta dtype() const { + } + + const caffe2::TypeMeta& dtype() const { return data_type_; } + const at::Allocator* allocator() const { return allocator_; }; + // You generally shouldn't use this method, but it is occasionally // useful if you want to override how a tensor will be reallocated, // after it was already allocated (and its initial allocator was @@ -106,13 +172,51 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { void set_allocator(at::Allocator* allocator) { allocator_ = allocator; } + Device device() const { return data_ptr_.device(); } + void set_resizable(bool resizable) { resizable_ = resizable; } + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + data_type_ = data_type; + // TODO: Use CAFFE_ENFORCE_WITH_CALLER equivalent + // For now causes lots of redefine issues if caffe2/core/logging.h is used + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "To share with a raw external pointer you need to have meta " + "already set."); + } + data_ptr_ = std::move(data_ptr); + // NOTE: data_type might change and so it's also possible that capacity + // might not be divisible by itemsize. There is no way for us to keep track + // of the exact capacity if we're not explicity storing is. More conrectely + // capacity() might not return the value that was set here, if itemsize does + // not evenly divide it. 
+ numel_ = capacity / data_type_.itemsize(); + } + private: caffe2::TypeMeta data_type_; at::DataPtr data_ptr_; diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index b5f8d391dc8a96..bfef8fffb0f89d 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -191,7 +191,6 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag THError("Tensor: invalid null storage"); } auto data_type = THTensor_getStoragePtr(self)->dtype(); - if (storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 12bff9af69ad03..ce60a7e580f049 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -20,173 +20,11 @@ #include #include #include +#include namespace caffe2 { -class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { - public: - StorageImpl() = delete; - StorageImpl(const StorageImpl&) = delete; - StorageImpl& operator=(const StorageImpl&) = delete; - - // Rule of Five - StorageImpl(StorageImpl&&) = default; - ~StorageImpl() = default; - StorageImpl& operator=(StorageImpl&&) = default; - - StorageImpl( - TypeMeta data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : data_type_(data_type), - data_ptr_(std::move(data_ptr)), - numel_(numel), - resizable_(resizable), - allocator_(allocator) { - if (numel > 0) { - CAFFE_ENFORCE( - data_type.id() != TypeIdentifier::uninitialized(), - "Constructing a storage with meta of unknown type and non-zero numel"); - } - } - - explicit StorageImpl(at::DeviceType device_type) - : StorageImpl(device_type, TypeMeta()) {} - StorageImpl(at::DeviceType device_type, TypeMeta data_type) - : StorageImpl( - data_type, - 0, - at::DataPtr(nullptr, at::Device(device_type)), - nullptr, - true) {} - - void reset() { - data_ptr_.clear(); - numel_ = 0; - } - - template - inline bool IsType() const { - return data_type_.Match(); - } - - void* data() const { - return data_ptr_.get(); - } - - void* data() { - return data_ptr_.get(); - } - - at::DataPtr& data_ptr() { - return data_ptr_; - } - - const at::DataPtr& data_ptr() const { - return data_ptr_; - } - - // Returns the previous data_ptr - at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { - std::swap(data_ptr_, data_ptr); - return std::move(data_ptr); - }; - - void set_dtype(const TypeMeta& data_type) { - int64_t capacity = numel_ * data_type_.itemsize(); - data_type_ = data_type; - numel_ = capacity / data_type_.itemsize(); - } - - const TypeMeta& dtype() const { - return data_type_; - } - - const at::Allocator* allocator() const { - return allocator_; - }; - // You generally shouldn't use this method, but it is occasionally - // useful if you want to override how a tensor will be reallocated, - // after it was already allocated (and its initial allocator was - // set) - void set_allocator(at::Allocator* allocator) { - allocator_ = allocator; - } - - size_t capacity() const { - return numel_ * itemsize(); - } - - int64_t numel() const { - return numel_; - } - - // TODO: remove later - void set_numel(int64_t numel) { - numel_ = numel; - } - - at::DeviceType device_type() const { - return data_ptr_.device().type(); - } - - inline size_t itemsize() const { - return data_type_.itemsize(); - } - - bool resizable() const { - return resizable_; - }; - - void set_resizable(bool resizable) { - resizable_ = resizable; - } - - /** - * Can only be called when use_count is 1 - */ - void 
UniqueStorageShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity, - MemoryDeleter d = nullptr) { - UniqueStorageShareExternalPointer( - at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); - } - - /** - * Can only be called when use_count is 1 - */ - void UniqueStorageShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - data_type_ = data_type; - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to have meta " - "already set."); - data_ptr_ = std::move(data_ptr); - // NOTE: data_type might change and so it's also possible that capacity - // might not be divisible by itemsize. There is no way for us to keep track - // of the exact capacity if we're not explicity storing is. More conrectely - // capacity() might not return the value that was set here, if itemsize does - // not evenly divide it. - numel_ = capacity / data_type_.itemsize(); - } - - private: - TypeMeta data_type_; - at::DataPtr data_ptr_; - int64_t numel_; - bool resizable_; - at::Allocator* allocator_; - // allocator_ takes precedence over StaticContext from device_type_ - // Allocator* allocator_; - // at::DeviceType device_type_ = CPU; -}; +using StorageImpl = at::StorageImpl; class CAFFE2_API Storage { public: @@ -251,6 +89,7 @@ class CAFFE2_API Storage { const TypeMeta& dtype() const { return storage_impl_->dtype(); } + size_t capacity() const { return storage_impl_->capacity(); } From 77f6998e541238391aca98041a96bb49a533687d Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 13 Sep 2018 08:43:38 -0700 Subject: [PATCH 113/237] Guard against inputting or returning sparse tensors (#11550) Summary: Add guards against using sparse tensor by checking the conversion from IValue -> PyObject & PyObject -> IValue. This diff also changes the behavior in constant propagation to not run python ops even if all ops are constant because of possible mutation to global state. This came up in trying to run get_sparse(), and I'm including it here to make it easier to land. 
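To make the constant-propagation part concrete: a Python call inside a scripted function is emitted as a `prim::PythonOp`, and folding it at compile time would freeze any hidden side effect. A minimal sketch (the `bump` helper and its counter are invented for illustration, and this assumes the script frontend of this era, which leaves unannotated Python calls as PythonOps rather than compiling them):

```python
import torch

_calls = 0

def bump(x):
    # hypothetical Python helper with hidden global state
    global _calls
    _calls += 1
    return x + _calls

@torch.jit.script
def fn(x):
    # `bump` shows up in the graph as a prim::PythonOp; even if `x` were a
    # constant, folding the call would bake in a single value of `_calls`,
    # so constant propagation now skips PythonOps entirely.
    return bump(x)
```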
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11550 Differential Revision: D9804712 Pulled By: eellison fbshipit-source-id: 9fe7daf721c6d6e48df4925c0f9c775873bcdc77 --- test/test_jit.py | 20 ++++++++++++++++++ .../csrc/jit/passes/constant_propagation.cpp | 3 +-- torch/csrc/jit/pybind_utils.h | 21 +++++++++++++++---- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 6d97b88514ecfa..97ef28630b7836 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1571,6 +1571,26 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) + def test_sparse_tensors_error(self): + def get_sparse(): + return torch.sparse.FloatTensor(2, 3) + + @torch.jit.script + def sparse(input): + output = get_sparse() + return output, input + + with self.assertRaisesRegex(RuntimeError, "sparse tensors not supported"): + sparse(get_sparse()) + + # has a different entry point than calling sparse directly + with self.assertRaisesRegex(RuntimeError, "sparse tensors not supported"): + torch._C._jit_pass_shape_analysis( + sparse.graph, (get_sparse(),), False) + + with self.assertRaisesRegex(RuntimeError, "sparse tensors not supported"): + sparse(torch.tensor([1])) + def test_constant_prop_simple(self): @torch.jit.script def constant_prop(input_tensor): diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index e83971bfaf9dfb..c9cc1b703de100 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -15,9 +15,8 @@ namespace { std::unordered_set skip_list = { prim::If, prim::Loop, //TODO: handle Loop - //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be - //FIXME treated as having side effects but ONNX depends on them being removed prim::Print, + prim::PythonOp, //may have side effects //all the rand functions from native_functions.yaml aten::rand, aten::rand_out, diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 54414534b06215..004bc075161697 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -56,7 +56,11 @@ inline void findErrorInKwargs( inline IValue toIValue(py::handle input) { if (THPVariable_Check(input.ptr())) { - return py::cast(input); + auto ten = py::cast(input); + if (ten.is_sparse()) { + AT_ERROR("sparse tensors not supported"); + } + return ten; } else if (py::isinstance(input)) { py::tuple input_tuple = py::cast(input); Stack s; @@ -89,8 +93,13 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { switch (type->kind()) { case TypeKind::DynamicType: case TypeKind::TensorType: - case TypeKind::CompleteTensorType: - return py::cast(obj); + case TypeKind::CompleteTensorType: { + auto var = py::cast(obj); + if (var.is_sparse()) { + AT_ERROR("sparse tensors not supported"); + } + return var; + } case TypeKind::FloatType: return py::cast(obj); case TypeKind::IntType: @@ -174,7 +183,11 @@ inline py::object toPyObject(IValue&& ivalue) { if (ivalue.isNone()) { return py::none(); } else if (ivalue.isTensor()) { - return py::cast(autograd::Variable(ivalue.toTensor())); + auto tensor = std::move(ivalue).toTensor(); + if (tensor.is_sparse()) { + AT_ERROR("sparse tensors not supported"); + } + return py::cast(autograd::Variable(std::move(tensor))); } else if (ivalue.isDouble()) { return py::cast(ivalue.toDouble()); } else if (ivalue.isInt()) { From 36fc1a0a58ab20f34e3047af41b4d15c92c8e2f7 Mon Sep 
17 00:00:00 2001 From: Christian Puhrsch Date: Thu, 13 Sep 2018 09:25:27 -0700 Subject: [PATCH 114/237] Merge caffe2::/at::Storage Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11637 Reviewed By: gchanan Differential Revision: D9806425 Pulled By: ezyang fbshipit-source-id: e20ec93bff6dc7fb22ca9b7e7348d060b3876b67 --- aten/src/ATen/core/Storage.cpp | 24 ----- aten/src/ATen/core/Storage.h | 158 ++++++++++++++++++++++++++++++--- caffe2/core/storage.h | 128 +------------------------- caffe2/core/tensor.cc | 2 +- 4 files changed, 147 insertions(+), 165 deletions(-) diff --git a/aten/src/ATen/core/Storage.cpp b/aten/src/ATen/core/Storage.cpp index 854a68876e2e5d..aca4bb75d2c95b 100644 --- a/aten/src/ATen/core/Storage.cpp +++ b/aten/src/ATen/core/Storage.cpp @@ -2,28 +2,4 @@ namespace at { -Storage::Storage( - caffe2::TypeMeta data_type, - size_t size, - Allocator* allocator, - bool resizable) - : storage_impl_(c10::make_intrusive( - data_type, - size, - allocator, - resizable)) {} - -Storage::Storage( - caffe2::TypeMeta data_type, - at::DataPtr data_ptr, - size_t size, - const std::function& deleter, - bool resizable) - : storage_impl_(c10::make_intrusive( - data_type, - size, - std::move(data_ptr), - /* allocator */ nullptr, - resizable)) {} - } // namespace at diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index 656ddb972c7959..23a3327da1cafc 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -11,16 +11,56 @@ struct AT_API Storage { Storage(const c10::intrusive_ptr& ptr) : storage_impl_(ptr) {} Storage(c10::intrusive_ptr&& ptr) : storage_impl_(std::move(ptr)) {} Storage( - caffe2::TypeMeta, + caffe2::TypeMeta data_type, size_t size, Allocator* allocator, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + allocator, + resizable)) {} + Storage( - caffe2::TypeMeta, - at::DataPtr, + caffe2::TypeMeta data_type, + at::DataPtr data_ptr, size_t size, const std::function& deleter, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + resizable)) {} + + Storage(at::DeviceType device_type) + : storage_impl_(c10::make_intrusive(device_type)) {} + Storage(at::DeviceType device_type, caffe2::TypeMeta data_type) + : storage_impl_( + c10::make_intrusive(device_type, data_type)) {} + + Storage( + caffe2::TypeMeta data_type, + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) + : storage_impl_(c10::make_intrusive( + data_type, + numel, + std::move(data_ptr), + allocator, + resizable)) {} + + void reset() { + storage_impl_->reset(); + } + + template + inline bool IsType() const { + return storage_impl_->IsType(); + } template T* data() const { return storage_impl_->data(); } @@ -28,32 +68,122 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->itemsize(); } - ptrdiff_t size() const { return storage_impl_->numel(); } - bool resizable() const { return storage_impl_->resizable(); } + size_t elementSize() const { + return storage_impl_->itemsize(); + } + + inline size_t itemsize() const { + return storage_impl_->itemsize(); + } + + ptrdiff_t size() const { + return storage_impl_->numel(); + } + + int64_t numel() const { + return storage_impl_->numel(); + } + + // TODO: remove later + void set_numel(int64_t numel) { + 
storage_impl_->set_numel(numel); + } + + bool resizable() const { + return storage_impl_->resizable(); + } + + size_t capacity() const { + return storage_impl_->capacity(); + } // get() use here is to get const-correctness - void* data() const { return storage_impl_.get()->data(); } - const caffe2::TypeMeta dtype() const { + + void* data() { + return storage_impl_->data(); + } + + void* data() const { + return storage_impl_.get()->data(); + } + + const caffe2::TypeMeta& dtype() const { return storage_impl_->dtype(); } - const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } - DeviceType device_type() const { return storage_impl_->device_type(); } - at::Allocator* allocator() const { return storage_impl_.get()->allocator(); } - at::Device device() const { return storage_impl_->device(); } + + at::DataPtr& data_ptr() { + return storage_impl_->data_ptr(); + } + + const at::DataPtr& data_ptr() const { + return storage_impl_->data_ptr(); + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + }; + + void set_dtype(const caffe2::TypeMeta& data_type) { + storage_impl_->set_dtype(data_type); + } + + DeviceType device_type() const { + return storage_impl_->device_type(); + } + + at::Allocator* allocator() const { + return storage_impl_.get()->allocator(); + } + + at::Device device() const { + return storage_impl_->device(); + } StorageImpl* unsafeReleaseStorageImpl() { return storage_impl_.release(); } + StorageImpl* unsafeGetStorageImpl() const noexcept { return storage_impl_.get(); } + operator bool() const { return storage_impl_; } + size_t use_count() const { return storage_impl_.use_count(); } + inline bool unique() const { + return storage_impl_.unique(); + } + + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + src, data_type, capacity, d); + } + + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } + protected: c10::intrusive_ptr storage_impl_; }; diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index ce60a7e580f049..35647d7b62d8af 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -20,137 +20,13 @@ #include #include #include +#include #include namespace caffe2 { using StorageImpl = at::StorageImpl; - -class CAFFE2_API Storage { - public: - Storage() {} - Storage(at::DeviceType device_type) - : storage_impl_(c10::make_intrusive(device_type)) {} - Storage(at::DeviceType device_type, TypeMeta data_type) - : storage_impl_( - c10::make_intrusive(device_type, data_type)) {} - - Storage( - TypeMeta data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : storage_impl_(c10::make_intrusive( - data_type, - numel, - std::move(data_ptr), - allocator, - resizable)) {} - - void reset() { - storage_impl_->reset(); - } - - // For debugging purpose only, please don't call it - StorageImpl* unsafeGetStorageImp() const { - return storage_impl_.get(); - } - - 
template - inline bool IsType() const { - return storage_impl_->IsType(); - } - - void* data() const { - return storage_impl_->data(); - } - - void* data() { - return storage_impl_->data(); - } - - at::DataPtr& data_ptr() { - return storage_impl_->data_ptr(); - } - - const at::DataPtr& data_ptr() const { - return storage_impl_->data_ptr(); - } - // Returns the previous data_ptr - at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { - return storage_impl_->set_data_ptr(std::move(data_ptr)); - }; - - void set_dtype(const TypeMeta& data_type) { - storage_impl_->set_dtype(data_type); - } - - const TypeMeta& dtype() const { - return storage_impl_->dtype(); - } - - size_t capacity() const { - return storage_impl_->capacity(); - } - - int64_t numel() const { - return storage_impl_->numel(); - } - - // TODO: remove later - void set_numel(int64_t numel) { - storage_impl_->set_numel(numel); - } - - at::DeviceType device_type() const { - return storage_impl_->device_type(); - } - - const at::Allocator* allocator() const { - return storage_impl_->allocator(); - } - - inline size_t itemsize() const { - return storage_impl_->itemsize(); - } - - inline long use_count() const { - return storage_impl_.use_count(); - } - - inline bool unique() const { - return storage_impl_.unique(); - } - - void UniqueStorageShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity, - MemoryDeleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - storage_impl_.unique(), - "UniqueStorageShareExternalPointer can only be called when \ - use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( - src, data_type, capacity, d); - } - - void UniqueStorageShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - CAFFE_ENFORCE_WITH_CALLER( - storage_impl_.unique(), - "UniqueStorageShareExternalPointer can only be called when \ - use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( - std::move(data_ptr), data_type, capacity); - } - - protected: - c10::intrusive_ptr storage_impl_; -}; +using Storage = at::Storage; } // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index a24f8cb67b7347..15aefc37c74188 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -96,7 +96,7 @@ vector GetTensorInfo( const Tensor* tc = static_cast(c); CHECK(tc); CHECK(tc->unsafeGetTensorImpl()); - CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImp()); + CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()); *capacity = tc->capacity_nbytes(); tc->ExtractDeviceOption(device); return tc->dims(); From 57f149a861bd3e4b0a8925d41cfe9091657794fc Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Thu, 13 Sep 2018 09:28:52 -0700 Subject: [PATCH 115/237] Only join pin_memory_thread after it started (#11599) Summary: Same reason as in #11432 . 
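The guard is the same attribute-existence pattern already used for `workers`: register `pin_memory_thread` only once the thread has actually started, and join it only if the attribute exists. A rough stand-in for the shape of the fix (not the real `_DataLoaderIter`):

```python
import threading

class LoaderLike(object):
    def __init__(self, fail_early=False):
        if fail_early:
            # __init__ bails out before the pinning thread is ever created
            raise RuntimeError("worker startup failed")
        t = threading.Thread(target=lambda: None)
        t.daemon = True
        t.start()
        # register the attribute only after the thread is running
        self.pin_memory_thread = t

    def __del__(self):
        # guard on the attribute, not on a config flag, so tearing down a
        # partially constructed object cannot raise AttributeError
        if hasattr(self, 'pin_memory_thread'):
            self.pin_memory_thread.join()
```

The traceback below is what the unguarded `join()` produces.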
Example error: ``` Exception ignored in: Traceback (most recent call last): File "/private/home/ssnl/miniconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 405, in __del__ self._shutdown_workers() File "/private/home/ssnl/miniconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 401, in _shutdown_workers self.pin_memory_thread.join() AttributeError: '_DataLoaderIter' object has no attribute 'pin_memory_thread' ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11599 Differential Revision: D9801143 Pulled By: SsnL fbshipit-source-id: 520590a21f56fa381fcac621457a7544d3fba47e --- torch/utils/data/dataloader.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 951321fcdf1052..9d69ab4daf0fc8 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -289,12 +289,15 @@ def __init__(self, loader): if self.pin_memory: self.data_queue = queue.Queue() - self.pin_memory_thread = threading.Thread( + pin_memory_thread = threading.Thread( target=_pin_memory_loop, args=(self.worker_result_queue, self.data_queue, self.done_event, self.pin_memory, torch.cuda.current_device())) - self.pin_memory_thread.daemon = True - self.pin_memory_thread.start() + pin_memory_thread.daemon = True + pin_memory_thread.start() + # Similar to workers (see comment above), we only register + # pin_memory_thread once it is started. + self.pin_memory_thread = pin_memory_thread else: self.data_queue = self.worker_result_queue @@ -397,7 +400,7 @@ def _shutdown_workers(self): q.put(None) for w in self.workers: w.join() - if self.pin_memory: + if hasattr(self, 'pin_memory_thread'): self.pin_memory_thread.join() def __del__(self): From d4d72b87e3eef9936b6355effb05df546cbdb803 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Thu, 13 Sep 2018 10:16:33 -0700 Subject: [PATCH 116/237] Sphinx is case sensitive Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11646 Differential Revision: D9811355 Pulled By: SsnL fbshipit-source-id: d484561baa2ac5b3113870b4ee06fa3560b686e4 --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 31c6c3f743da06..8a0f53975e6098 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,7 +33,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. torch.autograd torch.distributed torch.distributions - torch.jit + torch.jit torch.multiprocessing bottleneck checkpoint From 1f49b879d1b7f2d5e4f5f955fa037f8e54ec457a Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 10:25:57 -0700 Subject: [PATCH 117/237] Add missing include for __half (#11638) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11638 Differential Revision: D9811063 Pulled By: ezyang fbshipit-source-id: dd103bb152485bcdbb0108b4d3de2443c30d5572 --- aten/src/THC/THCStorage.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index 3eedf4a907144f..e8dfc2213076f1 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -9,10 +9,16 @@ #include "ATen/ScalarType.h" +#include +#include +#include + namespace at { +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_HCC__) template <> struct CTypeToScalarType<__half> : public CTypeToScalarType {}; +#endif } From d278344e36dde4170da4e4d0c45953dcb53dcf65 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 13 Sep 2018 10:35:53 -0700 Subject: [PATCH 118/237] Automatic update of fbcode/onnx to 39dd0d4fec5913aa517b71bcfcbf638a427894eb (#11622) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11622 Previous import was bff0b8835870c7df7762ef43498d000d2d8ffb52 Included changes: - **[39dd0d4](https://github.com/onnx/onnx/commit/39dd0d4)**: [build] Add ONNX_API for protos in all cases (#1407) - **[944db4f](https://github.com/onnx/onnx/commit/944db4f)**: cmake (#1401) - **[8ccc8dd](https://github.com/onnx/onnx/commit/8ccc8dd)**: Remove ONNXIFI_CHECK_RESULT from onnxRelease* functions (#1397) - **[df14e74](https://github.com/onnx/onnx/commit/df14e74)**: Change onnxifi test driver classname (#1396) - **[0c885cc](https://github.com/onnx/onnx/commit/0c885cc)**: ONNXIFI cpp test driver (#1290) - **[a557848](https://github.com/onnx/onnx/commit/a557848)**: Coverage Report Tools for Backend Scoreboard (#1301) - **[31fd87f](https://github.com/onnx/onnx/commit/31fd87f)**: fix AvgPool doc. 
add default value for count_include_pad (#1391) - **[8ff08c2](https://github.com/onnx/onnx/commit/8ff08c2)**: Do not export onnx symbols in the python extension (#1388) Reviewed By: orionr Differential Revision: D9806635 fbshipit-source-id: f61c052b6bd14e0c80ace19c1a5f0ba659030c6f --- third_party/onnx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/onnx b/third_party/onnx index bff0b8835870c7..39dd0d4fec5913 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit bff0b8835870c7df7762ef43498d000d2d8ffb52 +Subproject commit 39dd0d4fec5913aa517b71bcfcbf638a427894eb From a861573e36860471a3997cc1cec4a0dd1fdf9746 Mon Sep 17 00:00:00 2001 From: Roy Li Date: Thu, 13 Sep 2018 10:59:11 -0700 Subject: [PATCH 119/237] fix tensor export bug in IR export (#11613) Differential Revision: D9811094 Pulled By: li-roy fbshipit-source-id: 012792dbedc70bd3fa242fdf2e39da0b21ce158d --- test/test_jit.py | 3 ++- torch/csrc/jit/export.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 97ef28630b7836..20ec276981428a 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -5115,10 +5115,11 @@ def __init__(self): self.param1 = torch.nn.Parameter(torch.rand(5, 5)) self.param2 = torch.nn.Parameter(self.param1[3]) self.param3 = torch.nn.Parameter(torch.rand(5, 5)) + self.param4 = torch.nn.Parameter(torch.rand(11, 5)[1:6]) @torch.jit.script_method def foo(self): - return self.param1 + self.param2 + self.param3 + return self.param1 + self.param2 + self.param3 + self.param4 m_orig = M() m_import = self.getExportImportCopy(m_orig) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 0d304a91583e8d..0740d2e88a2c8a 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -685,7 +685,7 @@ void ModuleEncoder::EncodeTensor( } auto record_number = file_writer_.writeRecord( - static_cast(t.storage().data()), t.type().elementSizeInBytes() * t.numel()); + static_cast(t.storage().data()), t.type().elementSizeInBytes() * t.storage().size()); tensor_proto->add_int64_data(record_number); storage_dedup_map_[storage_ptr] = record_number; } From 5bc90b8554a9f978e7d87ed120fa13040a983cac Mon Sep 17 00:00:00 2001 From: Roger-luo Date: Thu, 13 Sep 2018 11:04:41 -0700 Subject: [PATCH 120/237] support conversion and dispatch of complex numbers (#11603) Summary: - Just a simple fix to support `fill_` - And a fix for indexing in `pytorch-complex` Differential Revision: D9804061 Pulled By: ezyang fbshipit-source-id: 631129b3fa220a9670770b3766f14a8e03633bdf --- aten/src/ATen/Dispatch.h | 49 ++++++++++++++++++++++++++ aten/src/ATen/core/Scalar.h | 2 +- aten/src/ATen/native/Scalar.cpp | 2 +- torch/csrc/utils/python_arg_parser.cpp | 4 +++ torch/csrc/utils/python_arg_parser.h | 4 +++ 5 files changed, 59 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index c598901b2b943f..64f181d4dccb3c 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -79,3 +79,52 @@ AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ } \ }() + +#define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index 35c4b538336aeb..de01a56ce33748 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -99,6 +99,6 @@ template<> \ inline T Scalar::to() { \ return to##name(); \ } -AT_FORALL_SCALAR_TYPES(DEFINE_TO) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO) #undef DEFINE_TO } diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 6ffb891bf7d777..975ae8c1ff9c47 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -18,7 +18,7 @@ Scalar _local_scalar(const Tensor& self) { Scalar _local_scalar_dense_cpu(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_HALF( + AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX( self.type(), "_local_scalar_dense_cpu", [&] { scalar_t value = *self.data(); r = Scalar(value); diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index d36f70e460d793..5dbc83eddc59b9 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -103,6 +103,10 @@ bool FunctionParameter::check(PyObject* obj) { return THPVariable_Check(obj) || (allow_numbers_as_tensors && THPUtils_checkDouble(obj)); } case ParameterType::SCALAR: + if (PyComplex_Check(obj)) { + return 
true; + } + // fallthrough case ParameterType::DOUBLE: { if (THPUtils_checkDouble(obj)) { return true; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 86453a79e30dc2..294819a5a11550 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -232,6 +232,10 @@ inline at::Scalar PythonArgs::scalarWithDefault(int i, at::Scalar default_scalar if (THPUtils_checkLong(args[i])) { return at::Scalar(static_cast(THPUtils_unpackLong(args[i]))); } + + if (PyComplex_Check(args[i])) { + return at::Scalar(THPUtils_unpackComplexDouble(args[i])); + } return at::Scalar(THPUtils_unpackDouble(args[i])); } From ab3a2d25fbf695a80bae4e9e269ee0e532cdb533 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Thu, 13 Sep 2018 11:05:09 -0700 Subject: [PATCH 121/237] Improve error messages when trying to use nested lists. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11606 Differential Revision: D9806949 Pulled By: zdevito fbshipit-source-id: c38abc4ce745a63d26a64f6aa1b41350e4b1acd5 --- test/test_jit.py | 17 +++++++++++++++-- torch/csrc/jit/script/compiler.cpp | 23 ++++++++++++++++++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 20ec276981428a..5a7ea169d8d598 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2529,6 +2529,19 @@ def stuff3(x): return torch.ones(x), x self.checkScript(stuff3, ([3, 2],)) + def test_nested_list_error(self): + with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): + @torch.jit.script + def foo(x): + # type: (Tuple[List[List[int]]]) -> int + return 4 + + def test_nested_list_construct_error(self): + with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): + @torch.jit.script + def foo(x): + return [[4]] + def test_script_cu(self): cu = torch.jit.CompilationUnit(''' def foo(a): @@ -5507,10 +5520,10 @@ def f3(a): def f4(a): torch.cat(a) - with self.assertRaisesRegex(RuntimeError, 'argument \'tensors\' but found Tensor[][]'): + with self.assertRaisesRegex(RuntimeError, 'argument \'tensors\' but found int\[\]'): @torch.jit.script def f5(a): - torch.cat([[a]]) + torch.cat([3]) with self.assertRaisesRegex(RuntimeError, 'Lists must contain only a single type'): @torch.jit.script diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 5efa0668d91296..d49a653194b1eb 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -721,6 +721,24 @@ std::shared_ptr BuiltinFunction::call( emitBuiltinCall(loc, *m.graph(), symbol, inputs, attributes, true)); } +inline bool isSupportedListElementType(TypePtr type) { + return type->isSubtypeOf(DynamicType::get()) || + type->isSubtypeOf(NumberType::get()); +} + +// guard for List types we do not currently have operations for +inline void ensureLegalType(const SourceRange& range, TypePtr ptr) { + if(TupleTypePtr tt = ptr->cast()) { + for(auto elem : tt->elements()) { + ensureLegalType(range, elem); + } + } else if(ListTypePtr lt = ptr->cast()) { + if(!isSupportedListElementType(lt->getElementType())) { + throw ErrorReport(range) << "Lists can only contain numbers or Tensors, but found " << lt->getElementType()->str(); + } + } +} + struct to_ir { to_ir( Def def, @@ -771,6 +789,7 @@ struct to_ir { // Record the type for the schema and set the Type on the Value* arguments.push_back(schema.arguments.at(arg_annotation_idx++)); new_input->setType(arguments.back().type); + 
ensureLegalType((*it).ident().range(), arguments.back().type); } // body auto stmts = def.statements(); @@ -1550,8 +1569,10 @@ struct to_ir { << *elem_type << " but found " << *v->type() << " instead"; } } - return graph->insertNode(graph->createList(elem_type, values)) + Value* result = graph->insertNode(graph->createList(elem_type, values)) ->output(); + ensureLegalType(tree->range(), result->type()); + return result; } break; case TK_TUPLE_LITERAL: { auto ll = TupleLiteral(tree); From 6f53b4efea8746d4ff296cc8c5ddd35fd4975998 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Thu, 13 Sep 2018 11:10:00 -0700 Subject: [PATCH 122/237] Remove implicit bool casts (#11503) Summary: In order to comply with Python's rules on implicit casting of non-booleans to booleans, this PR removes implicit casting in favor of explicit casts via `bool()` cc zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/11503 Differential Revision: D9780869 Pulled By: driazati fbshipit-source-id: c753acaca27f4e79dddf424c6b04674f44a6aad9 --- test/cpp/api/jit.cpp | 2 +- test/test_jit.py | 71 ++++++++++++++++-------------- torch/csrc/jit/script/compiler.cpp | 14 +++--- torch/csrc/jit/test_jit.cpp | 8 ++-- torch/jit/batchop.py | 20 ++++----- 5 files changed, 63 insertions(+), 52 deletions(-) diff --git a/test/cpp/api/jit.cpp b/test/cpp/api/jit.cpp index 8879d6f0007fdc..c46868cdbf6989 100644 --- a/test/cpp/api/jit.cpp +++ b/test/cpp/api/jit.cpp @@ -13,7 +13,7 @@ TEST_CASE("torch script") { def test_relu(a, b): return torch.relu(a + b) def test_while(a, i): - while i < 10: + while bool(i < 10): a += a i += 1 return a diff --git a/test/test_jit.py b/test/test_jit.py index 5a7ea169d8d598..7bbc2455bf4a77 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -909,7 +909,7 @@ def test_recursive_cse(self): def fn(x, y): z = x - if x + y > x: + if bool(x + y > x): z = x + y return z @@ -1609,12 +1609,11 @@ def test_constant_prop_nested(self): @torch.jit.script def constant_prop(a): b = 2 + 1 - if a < 2: + if bool(a < 2): c = b + 2 else: c = b - 2 return c - out_ref = constant_prop(torch.tensor(2)) self.run_pass('constant_propagation', constant_prop.graph) out_test = constant_prop(torch.tensor(2)) @@ -1648,8 +1647,8 @@ def constant_prop(a, b): c0 = 1 c1 = 1 c2 = 1 - if a: # -> c0, c1 - if b: # -> c0 + if bool(a): # -> c0, c1 + if bool(b): # -> c0 if True: # -> c0 c0 = c0 + 1 if False: @@ -2098,7 +2097,7 @@ def batch_sum(a): def test_if_else(self): def single_if(a, b): - if a > b: + if bool(a > b): a = a + b else: a = a - b @@ -2118,7 +2117,7 @@ def single_if(a, b): def test_if_else_with_scalar(self): def single_if(a, b): - if a > 0.1: + if bool(a > 0.1): a = a + b else: a = a - b @@ -2138,7 +2137,7 @@ def single_if(a, b): def test_if_noelse(self): def single_if(a, b): - if a > b: + if bool(a > b): a = a + b return a @@ -2156,7 +2155,7 @@ def single_if(a, b): def test_if_noelse_with_scalar(self): def single_if(a, b): - if a > 0.1: + if bool(a > 0.1): a = a + b return a @@ -2174,7 +2173,7 @@ def single_if(a, b): def test_while(self): def single_while(a, b): - while a > b: + while bool(a > b): a = a - b return a @@ -2262,7 +2261,7 @@ def test_greedy_search(self): def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num): iter_count = torch.zeros_like(iter_num) - while(iter_count < iter_num): + while bool(iter_count < iter_num): iter_count += 1 # LSTM Cell i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i @@ -2328,7 +2327,7 @@ def beam(x, h, c, embed, 
w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, vocab_size = embed.size(1) iter_count = torch.zeros_like(iter_num) max_len = idx.size(2) - while(iter_count < iter_num): + while bool(iter_count < iter_num): iter_count += 1 # LSTM Cell i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i @@ -3454,7 +3453,7 @@ def func(a): def test_while(self): def func(a, b, max): - while a < max: + while bool(a < max): a = a + 1 b = b + 1 c = a + b @@ -3471,7 +3470,7 @@ def func(lim): somenum = 5 dontmutateme = 3 third = 0 - while i < lim: + while bool(i < lim): third = first + second first = second second = third @@ -3493,7 +3492,7 @@ def test_if(self): def func(a, b): # type: (int, int) -> int d = 3 - if a > 10: + if bool(a > 10): a = 3 + d else: b = 3 + d @@ -3509,7 +3508,7 @@ def func(a, b): # type: (int, int) -> int d = 3 for _ in range(20): - if a > 10: + if bool(a > 10): a = 3 + d else: b = 3 + d @@ -3521,7 +3520,7 @@ def func(a, b): def test_if_noelse(self): def func(a, b): - if a > 10: + if bool(a > 10): a = 3 + b c = a + b return c @@ -3529,11 +3528,19 @@ def func(a, b): inputs = self._make_scalar_vars([-1, 1], torch.int64) self.checkScript(func, inputs, optimize=True) + def test_explicit_bool_cast(self): + with self.assertRaisesRegex(RuntimeError, "expected an integer"): + @torch.jit.script + def test_bool_cast(a): + if a: + return a + 2 + return a + 1 + def test_while_nonexistent_value(self): with self.assertRaisesRegex(RuntimeError, "undefined value x"): torch.jit.CompilationUnit(''' def test_while(a, b): - while a < 10: + while bool(a < 10): a = a + x b = b + 1 return a + b @@ -3551,7 +3558,7 @@ def test_while(a, b): def test_while_write_outer_then_read(self): def func(a, b): - while a < 10: + while bool(a < 10): a = a + 1 b = a + 1 return a + b @@ -3700,7 +3707,7 @@ def test_script_bool_constant(): def test_ternary(self): def func(a, b): c = 3 - c = a + b if a > 3 else b + c = a + b if bool(a > 3) else b return c inputs_true = self._make_scalar_vars([5, 2], torch.int64) @@ -3723,18 +3730,18 @@ def test_logical_short_circuit(self): @torch.jit.script def testNoThrows(t): c1 = 1 - if (False and t[1]) or (True or t[1]): + if (False and bool(t[1])) or (True or bool(t[1])): c1 = 0 return c1 @torch.jit.script def throwsOr(t): - c0 = False or t[1] + c0 = False or bool(t[1]) print(c0) @torch.jit.script def throwsAnd(t): - c0 = True and t[1] + c0 = True and bool(t[1]) print(c0) t = torch.randn(0) @@ -3968,7 +3975,7 @@ def test_call_python(a): step = 1 while i < 10: b = pyfunc(b) - if b > 3.0: + if bool(b > 3.0): b = pyfunc(b) i = 11 return b @@ -4966,7 +4973,7 @@ def test_trace_of_script(self): @torch.jit.script def foo(a, c): b = 0.0 - if a == 0.0: + if bool(a == 0.0): b = 1.0 return b + c @@ -4985,7 +4992,7 @@ def use(b): def test_if_define(self): @torch.jit.script def foo(a): - if a == 0: + if bool(a == 0): b = 1 else: b = 0 @@ -4994,14 +5001,14 @@ def foo(a): @torch.jit.script def foo2(a): b = 0 - if a == 0: + if bool(a == 0): b = 1 return b + 1 @torch.jit.script def foo3(a): b = 1 - if a == 0: + if bool(a == 0): c = 4 else: b = 0 @@ -5287,7 +5294,7 @@ def __init__(self): @torch.jit.script_method def forward(self, x): - if torch.sum(x) > 0: + if bool(torch.sum(x) > 0): x = torch.neg(x) return x @@ -5402,8 +5409,8 @@ def forward(self, x): # we cannot use `True` as the condition. Constant prop # would remove the `if` statements. 
c = sum(x) > 4 - if c: - if c: + if bool(c): + if bool(c): y = self.m(x) else: y = self.m(x) @@ -5836,7 +5843,7 @@ def test_return_stmt_not_at_end(self): with self.assertRaisesRegex(RuntimeError, 'return statements can appear only at the end of the function body'): @torch.jit.script def return_stmt_wrong(x): - if x > 3: + if bool(x > 3): return 3 else: return x diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index d49a653194b1eb..fc7bf068f20998 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -988,11 +988,15 @@ struct to_ir { Value* emitCond(Expr cond) { Value* v = emitExpr(cond); - if(v->type()->isSubtypeOf(DynamicType::get())) { - v = typeCast(cond.range(), v, IntType::get()); - } - if(!v->type()->isSubtypeOf(IntType::get())) { - throw ErrorReport(cond) << "expected a tensor or integer expression for condition but found " << v->type()->str(); + if (!v->type()->isSubtypeOf(IntType::get())) { + ErrorReport error(cond); + error << "expected an integer expression for condition but found " + << v->type()->str(); + if (v->type()->isSubtypeOf(DynamicType::get())) { + error << ", to use a tensor in a boolean" + << " expression, explicitly cast it with `bool()`"; + } + throw error; } return v; } diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index bec6b0459a0814..4ffd16ff24c95a 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -844,25 +844,25 @@ const static auto cf_examples = R"JIT( # FIXME: use 0 instead of a. # c = 0 c = a - if a < b: + if bool(a < b): c = b else: c = a return c def if_one(a, b): c = b - if a < b: + if bool(a < b): c = a return c def while_test(a, i): - while i < 3: + while bool(i < 3): a *= a i += 1 return a )JIT"; void testControlFlow() { script::Module cu; - script::defineMethodsInModule(cu, cf_examples, torch::jit::script::Resolver(), nullptr); + script::defineMethodsInModule(cu, cf_examples, torch::jit::script::nativeResolver, nullptr); auto run = [&](const std::string & name, std::vector stack) { auto graph = cu.get_method(name).graph(); Code code(graph); diff --git a/torch/jit/batchop.py b/torch/jit/batchop.py index cc022a37da2e3f..229cafbb94119d 100644 --- a/torch/jit/batchop.py +++ b/torch/jit/batchop.py @@ -140,7 +140,7 @@ def batch_select(data, mask, dims, dim_, index_): # if dim == 0: # raise ValueError("Cannot select 0 dim in BatchTensor") data = data.select(dim, index) - if dims[dim - 1]: + if bool(dims[dim - 1]): mask = mask.select(dim, index) else: mask = mask.select(dim, 0) @@ -171,7 +171,7 @@ def batch_index_select(data, mask, dims, dim_, index_data, index_mask, index_dim res_mask = torch.zeros([0]) for i in range(batch_size): d = data[i].index_select(dim - 1, index_data[i]).unsqueeze(0) - if dims[dim - 1]: + if bool(dims[dim - 1]): m = mask[i].index_select(dim - 1, index_data[i]).unsqueeze(0) else: m = mask[i].unsqueeze(0) @@ -310,7 +310,7 @@ def batch_argmax(data, mask, dims, dim_, keepdim_): batch_size = data.size(0) res_data = torch.zeros([0]) for i in range(batch_size): - if dims[dim - 1]: + if bool(dims[dim - 1]): if dim - 1 != 0: m = mask[i].transpose(0, dim - 1) else: @@ -346,7 +346,7 @@ def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): res_data = torch.zeros([0]) res_index = torch.zeros([0]) for i in range(batch_size): - if dims[dim - 1]: + if bool(dims[dim - 1]): if dim - 1 != 0: m = mask[i].transpose(0, dim - 1) else: @@ -364,7 +364,7 @@ def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): else: 
res_data = torch.cat([res_data, d], 0) res_index = torch.cat([res_index, idx], 0) - if dims[dim - 1]: + if bool(dims[dim - 1]): mask = mask.narrow(dim, 0, k) return res_data, mask, dims, res_index, mask, dims @@ -378,7 +378,7 @@ def batch_softmax(data, mask, dims, dim_): max_len = data.size(dim) res_data = torch.zeros([0]) for i in range(batch_size): - if dims[dim - 1]: + if bool(dims[dim - 1]): if dim - 1 != 0: m = mask[i].transpose(0, dim - 1) else: @@ -417,7 +417,7 @@ def batch_view(data, mask, dims, sizes): res_dims = data_sizes_.narrow(0, 0, 1) for i_ in range(sizes.size(0) - 1): i = i_ + 1 - if(sizes[i] == -1): + if bool(sizes[i] == -1): cur_size_ = mask.size(i) cur_dim = 1 else: @@ -434,7 +434,7 @@ def batch_view(data, mask, dims, sizes): def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): dim = int(dim_) data = torch.cat([data1, data2], dim) - if(dims1[dim - 1]): + if bool(dims1[dim - 1]): mask = torch.cat([mask1, mask2], dim) else: mask = mask1 @@ -445,7 +445,7 @@ def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): def batch_cat3(data1, mask1, dims1, data2, mask2, dims2, data3, mask3, dims3, dim_): dim = int(dim_) data = torch.cat([data1, data2, data3], dim) - if(dims1[dim - 1]): + if bool(dims1[dim - 1]): mask = torch.cat([mask1, mask2, mask3], dim) else: mask = mask1 @@ -460,7 +460,7 @@ def batch_narrow(data, mask, dims, dimension_, start_, length_): # if dimension == 0: # raise ValueError("cannot do narrow along batch_dim") data = data.narrow(dimension, start, length) - if dims[dimension - 1]: + if bool(dims[dimension - 1]): mask = mask.narrow(dimension, start, length) else: mask = mask.narrow(dimension, 0, 1) From 9abc66674559ac214dad31ca42bc0d981d5260db Mon Sep 17 00:00:00 2001 From: Roy Li Date: Thu, 13 Sep 2018 11:51:17 -0700 Subject: [PATCH 123/237] stop allowing extra positional args in arg parser (#10499) Summary: Arg parser allowed additional positional args to be parsed into keyword-only params. Fixes a couple cases: - The positional argument happens to be of the right type, and it just works silently. Now, we fail as expected. - The positional argument fails later down the line. Now, we fail at the appropriate time and get a better error message. 
Pre-fix: ``` >>> torch.cuda.LongTensor((6, 0), 1, 1, 0) tensor([6, 0], device='cuda:1') ``` Post-fix: ``` >>> torch.cuda.LongTensor((6, 0), 1, 1, 0) Traceback (most recent call last): File "", line 1, in TypeError: new() received an invalid combination of arguments - got (tuple, int, int, int), but expected one of: * (torch.device device) * (torch.Storage storage) * (Tensor other) * (tuple of ints size, torch.device device) * (object data, torch.device device) ``` Pre-fix: ``` >>> a = torch.tensor(5) >>> a.new_zeros((5,5), 0) Traceback (most recent call last): File "", line 1, in TypeError: new_zeros(): argument 'dtype' (position 2) must be torch.dtype, not int ``` Post-fix: ``` >>> a = torch.tensor(5) >>> a.new_zeros((5,5), 0) Traceback (most recent call last): File "", line 1, in TypeError: new_zeros() takes 1 positional argument but 2 were given ``` fixes #8351 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10499 Differential Revision: D9811093 Pulled By: li-roy fbshipit-source-id: ce946270fd11b264ff1b09765db3300879491f76 --- test/test_torch.py | 8 ++++++++ torch/csrc/utils/python_arg_parser.cpp | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/test/test_torch.py b/test/test_torch.py index dce509cb7c5b0e..837d7715f3d9e1 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -7427,6 +7427,14 @@ def test_parsing_intlist(self): self.assertRaises(TypeError, lambda: torch.ones(np.array(3, 3))) self.assertRaises(TypeError, lambda: torch.ones((np.array(3, 3)))) + # fail parse with additional positional args after intlist arg + self.assertRaisesRegex(TypeError, + "received an invalid combination of arguments", + lambda: torch.LongTensor((6, 0), 1, 1, 0)) + self.assertRaisesRegex(TypeError, + "missing 1 required positional arguments", + lambda: torch.tensor().new_zeros((5, 5), 0)) + def _test_serialization_data(self): a = [torch.randn(5, 5).float() for i in range(2)] b = [a[i % 2] for i in range(4)] # 0-3 diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 5dbc83eddc59b9..b1118481c4520a 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -450,6 +450,13 @@ bool FunctionSignature::parse(PyObject* args, PyObject* kwargs, PyObject* dst[], PyObject* obj = nullptr; bool is_kwd = false; if (arg_pos < nargs) { + // extra positional args given after single positional IntList arg + if (param.keyword_only) { + if (raise_exception) { + extra_args(*this, nargs); + } + return false; + } obj = PyTuple_GET_ITEM(args, arg_pos); } else if (kwargs) { obj = PyDict_GetItem(kwargs, param.python_name); From 45e9ee096e64c572cc6c1411aff6c6adc698c0bf Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 13 Sep 2018 12:07:22 -0700 Subject: [PATCH 124/237] Fix test_mnist_training_leaks_no_memory_cuda warning (#11639) Summary: Before this PR it would warn that "dropout is non deterministic and can cause problems when checking trace", so I disabled the trace checking. 
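The escape hatch is the `check_trace=False` argument to `torch.jit.trace`, which skips re-running the trace against the original module. A minimal sketch for any nondeterministic module (the module and shapes here are arbitrary, not MnistNet):

```python
import torch

# Dropout in training mode produces different outputs on every call, so the
# default trace check would report spurious divergence between runs.
drop = torch.nn.Dropout(p=0.5).train()
x = torch.randn(4, 4)
traced = torch.jit.trace(drop, [x], check_trace=False)  # skip the check
```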
cc zdevito apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11639 Differential Revision: D9812493 Pulled By: zou3519 fbshipit-source-id: fab86928a5fba8b218b47543533aaf7c82a10b4a --- test/test_jit.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_jit.py b/test/test_jit.py index 7bbc2455bf4a77..61a6b94db9935c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -7084,7 +7084,9 @@ def test_mnist_cuda(self): @unittest.skipIf(not RUN_CUDA, "no CUDA") def test_mnist_training_leaks_no_memory_cuda(self): net = MnistNet().cuda() - traced_net = torch.jit.trace(net, [torch.randn(5, 1, 28, 28, device='cuda')]) + # MnistNet uses dropout, don't check its trace + traced_net = torch.jit.trace(net, [torch.randn(5, 1, 28, 28, device='cuda')], + check_trace=False) def train(iters): for _ in range(iters): From 912d3626c8f834dc7d66ed6fd2d3b27ec77e39ed Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 12:19:16 -0700 Subject: [PATCH 125/237] Split tensor.h into tensor_impl.h and tensor.h (#11642) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11642 This is just a preparatory change to help with future refactoring: - I want to reduce the number of includes that tensor_impl.h depends on, but - I need to keep tensor.h providing all Caffe2 headers, because users may be relying on tensor.h transitively providing those headers. Introducing a level of indirection lets me do both at the same time. Reviewed By: jerryzh168 Differential Revision: D9810823 fbshipit-source-id: 8dfaac4b8768051a22898be8fcaf787ecc57eb13 --- caffe2/core/tensor.cc | 12 - caffe2/core/tensor.h | 894 +----------------------------------- caffe2/core/tensor_impl.cc | 14 + caffe2/core/tensor_impl.h | 902 +++++++++++++++++++++++++++++++++++++ 4 files changed, 917 insertions(+), 905 deletions(-) create mode 100644 caffe2/core/tensor_impl.cc create mode 100644 caffe2/core/tensor_impl.h diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 15aefc37c74188..e88ed360e8b92b 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -1,18 +1,6 @@ #include "caffe2/core/tensor.h" #include "caffe2/core/blob_stats.h" -#include "caffe2/core/flags.h" - -CAFFE2_DEFINE_bool( - caffe2_keep_on_shrink, - true, - "If set, keeps memory when a tensor is shrinking its size."); - -CAFFE2_DEFINE_int64( - caffe2_max_keep_on_shrink_memory, - LLONG_MAX, - "The maximum memory in bytes to keep on shrink, if the difference between " - "tensor sizes is bigger than this then tensor will be reset."); namespace caffe2 { diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 6fd35133811cd9..5564034e545837 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -2,904 +2,12 @@ #define CAFFE2_CORE_TENSOR_H_ #include "caffe2/core/storage.h" +#include "caffe2/core/tensor_impl.h" #include -// A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to -// keep the memory allocated for its maximum capacity reshaped to so far. -CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); - -// Since we can have high variance in blob memory allocated across different -// inputs in the same run, we will shrink the blob only if the memory gain -// is larger than this flag in bytes. -CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); - namespace caffe2 { -/** - * A utility function to convert vector to vector. 
- */ -inline vector ToVectorTIndex(const std::vector& src) { - return vector(src.begin(), src.end()); -} - -/** - * Return product of all dimensions starting from k - */ -inline TIndex size_from_dim_(int k, const vector& dims) { - TIndex r = 1; - for (size_t i = k; i < dims.size(); ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims up to k (not including dims[k]) -inline TIndex size_to_dim_(int k, const vector& dims) { - CAFFE_ENFORCE((unsigned)k <= dims.size()); - TIndex r = 1; - for (int i = 0; i < k; ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims between k and l (not including dims[k] and dims[l]) -inline TIndex size_between_dim_(int k, int l, const vector& dims) { - CAFFE_ENFORCE((unsigned)l < dims.size()); - TIndex r = 1; - if (k < l) { - for (int i = k + 1; i < l; ++i) { - r *= dims[i]; - } - } else { - for (int i = l + 1; i < k; ++i) { - r *= dims[i]; - } - } - return r; -} - -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - -/** - * @brief TensorImpl is the implementation of a tensor and the basic class - * in Caffe2 that stores a contiguous memory with its shape information. - * - * The TensorImpl class is essentially a wrapper around a device-specific memory - * (the device is specified by the Context template argument), and deals with - * the allocation and de-allocation of such memory. We make a simplified - * assumption that the memory is always contiguous. - */ -class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { - public: - TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const vector& dims, - const vector& values, - BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. 
- * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } - - /** - * @brief Delete the copy constructor and use Clone explicitly - */ - TensorImpl(const TensorImpl& src) = delete; - - TensorImpl(TensorImpl&& src) noexcept { - swap(src); - } - - TensorImpl& operator=(TensorImpl&&) = default; - // Note(jiayq): possibly a rule-of-three violation, but we explicitly - // discourage the use of = for Tensors. - TensorImpl& operator=(const TensorImpl& src) = delete; - - virtual ~TensorImpl() noexcept {} - - /* - * Since we removed template from tensor, we now store a static - * context pointer in tensor, which indicates the type of the tensor. - */ - BaseStaticContext* GetStaticContext() const { - return get_static_context(GetDeviceType()); - } - - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - - /** - * @brief Copies the data from a source tensor, with a contex provided to - * carry out the underlying memcpy operation. - */ - void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { - if ((void*)&src == (void*)this) { - return; - } - if (storage_.dtype() != src.meta()) { - storage_ = Storage(GetDeviceType(), src.meta()); - } - if (src.size() == -1) { - dims_.clear(); - numel_ = -1; - storage_.reset(); - return; - } - Resize(src.dims()); - if (size() > 0) { - if (storage_.dtype().copy()) { - CAFFE_ENFORCE( - GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - CAFFE_ENFORCE( - src.GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); - } else { - // We'll need to use a non-CPU context to perform the copy if - // one of the context is not CPU since only non-CPU context - // knows how to copy between CPU and that context - if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { - if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } else { - CAFFE_ENFORCE( - context->GetDevicetype() == src.GetDeviceType(), - "Type for provided context does not match the type of source"); - context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } - } else { - // In case source context is CPU, and target context is non-CPU - // We'll have to create a Context from target and perform the - // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); - } - } - } - } - - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of `num`. 
- */ - void ExtendTo(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - dims_[0], growthPct, context); - } - - /** - * @brief Extends the outer-most dimension of this tensor by num elements, - * preserving the existing data. - * - * The underlying data may be reallocated in order to accommodate the new - * elements, in which case this tensors' capacity is grown at a factor of - * growthPct. This ensures that Extend runs on an amortized O(1) time - * complexity. - */ - void Extend(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER( - num, 0, "`num` must be non-negative for Extend"); - auto newDims = dims_; - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - auto newNumel = std::accumulate( - newDims.begin(), - newDims.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - dims_ = newDims; - numel_ = newNumel; - return; - } - auto newCapacity = dims_; - newCapacity[0] = std::max( - newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - auto* newData = raw_mutable_data(storage_.dtype()); - CAFFE_ENFORCE( - context != nullptr, "Context must be provided to Extend the tensor"); - context->CopyItemsSameDevice( - storage_.dtype(), oldSize, oldData.get(), newData); - reserved_ = true; - dims_ = newDims; - numel_ = newNumel; - } - - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shurnk tensor is maintained. - */ - void ShrinkTo(TIndex outer_dim) { - CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); - CAFFE_ENFORCE_WITH_CALLER( - outer_dim <= dims_[0], - "New outer dimension must be smaller than current."); - CAFFE_ENFORCE( - storage_.unique(), - "Can't call ShrinkTo on shared storage, please call Resize instead."); - dims_[0] = outer_dim; - numel_ = std::accumulate( - dims_.begin(), - dims_.end(), - static_cast(1), - std::multiplies()); - } - - /** - * @brief Reserve space for the underlying tensor. - * - * This must be called after Resize(), since we only specify the first - * dimension This does not copy over the old data to the newly allocated space - */ - template - void ReserveSpace(const T& outer_dim) { - CAFFE_ENFORCE( - numel_ != -1, "size should be initialized before calling ReserveSpace"); - CAFFE_ENFORCE( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - auto newCapacity = dims_; - newCapacity[0] = outer_dim; - auto newNumel = std::accumulate( - newCapacity.begin(), - newCapacity.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - return; - } - // Old data is discarded - storage_.data_ptr().clear(); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(storage_.dtype()); - dims_ = oldDims; - numel_ = oldSize; - reserved_ = true; - } - - /** - * @brief Resizes a tensor. - * - * Resize takes in a vector of ints specifying the dimensions of the tensor. 
- * You can pass in an empty vector to specify that it is a scalar (i.e. - * containing one single item). - * - * The underlying storage may be deleted after calling Resize: if the new - * shape leads to a different number of items in the tensor, the old memory - * is deleted and new memory will be allocated next time you call - * mutable_data(). However, if the shape is different but the total number of - * items is the same, the underlying storage is kept. - */ - template - void Resize(Ts... dim_source) { - bool is_init = numel_ == -1; - bool size_changed = SetDims(dim_source...); - if (size_changed) { - // If needed, we will free the data. the next mutable_data() call - // will create the data storage. - bool reset_tensor = false; - if (reserved_) { - // If tensor is reserved then don't claim its memeory unless capacity() - // is smaller than new size - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); - } else { - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || - !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - numel_ * storage_.itemsize() > - FLAGS_caffe2_max_keep_on_shrink_memory; - } - - if (reset_tensor && !is_init) { - FreeMemory(); - } - } - } - - /** - * Resize the tensor like the source tensor. Note that this is just a - * sugar wrapper that essentially calls Resize(src_tensor.dims()). - */ - inline void ResizeLike(const TensorImpl& src_tensor) { - // Note: need casting for different context types. - if (static_cast(this) != static_cast(&src_tensor)) { - Resize(src_tensor.dims()); - } - } - - /** - * Resizes the tensor without touching underlying storage. - * This requires the total size of the tensor to remains constant. - */ - inline void Reshape(const vector& dims) { - TIndex new_size = 1; - for (auto d : dims) { - CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); - new_size *= d; - } - CAFFE_ENFORCE_WITH_CALLER( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. - " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - dims_ = dims; - } - - inline void Reshape(const vector& dims) { - Reshape(ToVectorTIndex(dims)); - } - - /** - * Release whatever memory the tensor was holding but keep size and type - * information. Subsequent call to mutable_data will trigger new memory - * allocation. - */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = Storage(storage_.device_type(), storage_.dtype()); - } - - /** - * A utility function to print the debug string for the tensor. Note that this - * is very slow since it involves quite some string operations, so do not use - * it in your performance-critical code. - */ - string DebugString() const { - std::stringstream ss; - ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << storage_.dtype().name() << " and dimension ("; - for (int d : dims_) { - ss << d << ","; - } - ss << ")."; - return ss.str(); - } - - void swap(TensorImpl& other) noexcept { - std::swap(dims_, other.dims_); - std::swap(numel_, other.numel_); - std::swap(storage_, other.storage_); - } - - /** - * @brief Shares the data with another tensor. - * - * To share data between two tensors, the sizes of the two tensors must be - * equal already. 
The reason we do not implicitly do a Resize to make the two - * tensors have the same shape is that we want to allow tensors of different - * shapes but the same number of items to still be able to share data. This - * allows one to e.g. have a n-dimensional Tensor and a flattened version - * sharing the same underlying storage. - * - * The source tensor should already have its data allocated. - */ - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an ENFORCE here which might affect perf a little bit. - CAFFE_ENFORCE_EQ_WITH_CALLER( - src.numel_, - numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - CAFFE_ENFORCE_WITH_CALLER( - src.storage_.data() || src.numel_ == 0, - "Source tensor has no content and has size > 0"); - // Finally, do sharing. - /* Since we create new Storage whenever we need to change data_type/capacity - * this still keeps the original semantics - */ - storage_ = src.storage(); - } - - /** - * @brief Shares the data with an externally managed pointer. - * - * This is similar to ShareData() but the source is a pointer with an advanced - * deleter option. In default, no deletion takes place, and one needs to make - * sure that the external memory is deallocated only after the tensor finishes - * using it. If a Deleter object is passed in, when this tensor is reallocated - * or freed, the deleter function is going to be called. - */ - template - void - ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { - ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); - } - - template - void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { - ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); - } - - void ShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity = 0, - MemoryDeleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - ShareExternalPointer( - at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); - } - - void ShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!capacity) { - capacity = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer( - std::move(data_ptr), data_type, capacity); - } else { - int64_t numel = capacity / data_type.itemsize(); - // Create a new Storage - storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); - } - } - - /** - * Returns a const raw void* pointer of the underlying storage. mutable_data() - * or raw_mutable_data() must have been called prior to this function call. 
- */ - inline const void* raw_data() const { - CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return storage_.data(); - } - - /** - * Returns a typed pointer of the underlying storage. mutable_data() or - * raw_mutable_data() must have been called prior to this function call, and - * the data type must be of the correct type. If you want to get a void* - * pointer instead, use raw_data(). - */ - template - inline const T* data() const { - CAFFE_ENFORCE_WITH_CALLER( - storage_.data() || numel_ == 0, - "The tensor is of non-zero shape, but its data is not allocated yet. " - "Caffe2 uses a lazy allocation, so you will need to call " - "mutable_data() or raw_mutable_data() to actually allocate memory."); - CAFFE_ENFORCE_WITH_CALLER( - IsType(), - "Tensor type mismatch, caller expects elements to be ", - TypeMeta::TypeName(), - ", while tensor contains ", - storage_.dtype().name(), - ". "); - return static_cast(storage_.data()); - } - - /** - * Returns a mutable raw pointer of the underlying storage. Since we will need - * to know the type of the data for allocation, a TypeMeta object is passed in - * to specify the necessary information. This is conceptually equivalent of - * calling mutable_data() where the TypeMeta parameter meta is derived from - * the type T. This function differs from mutable_data() in the sense that - * the type T can be specified during runtime via the TypeMeta object. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data(const TypeMeta& meta) { - // For 0-size tensors it's fine to return any pointer (including nullptr) - if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { - return storage_.data(); - } else { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. You probably need to call Resize() " - "before calling mutable_data()"); - bool had_special_dtor = storage_.dtype().dtor() != nullptr; - if (storage_.unique()) { - storage_.set_dtype(meta); - } else { - if (storage_.dtype() != meta) { - storage_ = Storage(storage_.device_type(), meta); - } - } - - // We can reuse the existing buffer if the current data does not have - // a special destructor and the new data doesn't have a special - // constructor. - if (numel_ == 0 || - (meta.ctor() == nullptr && !had_special_dtor && - storage_.numel() >= numel_)) { - return storage_.data(); - } - const at::Allocator* allocator = storage_.allocator(); - // TODO: Get rid of StaticContext - CAFFE_ENFORCE( - allocator == nullptr, - "Allocator is not used within Caffe2 functions, please use StaticContext instead."); - if (meta.ctor()) { - // For types that need placement new, we will call it, as well as - // making sure that when the data is freed, it calls the right - // destruction procedure. - auto size = numel_; - auto dtor = storage_.dtype().dtor(); - void* ptr; - at::DeleterFnPtr deleter; - auto ptr_and_deleter = GetStaticContext()->New( - numel_ * storage_.itemsize()); // Removing this can get rid of - // InefficientStdFunctionContext - ptr = ptr_and_deleter.first; - deleter = ptr_and_deleter.second; - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr, - [size, dtor, deleter](void* local_ptr) -> void { - dtor(local_ptr, size); - deleter(local_ptr); - }, - at::Device(storage_.device_type()))); - storage_.dtype().ctor()(storage_.data(), numel_); - } else { - // For fundamental type, new and delete is easier. 
- auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr_and_deleter.first, - ptr_and_deleter.second, - at::Device(storage_.device_type()))); - } - storage_.set_numel(numel_); - return storage_.data(); - } - } - - /** - * Returns a mutable raw pointer of the underlying storage. This can only be - * used when you know for sure that the underlying storage of the tensor is - * already created via an earlier raw_mutable_data(meta) call or a - * mutable_data() call. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data() { - CAFFE_ENFORCE_WITH_CALLER( - storage_.dtype().id() != TypeIdentifier::uninitialized(), - "Calling raw_mutable_data() without meta, but the current meta is " - "of unknown type."); - return raw_mutable_data(storage_.dtype()); - } - - /** - * Returns a typed pointer of the underlying storage. - * - * For fundamental types, we reuse possible existing storage if there - * is sufficient capacity. - */ - template - inline T* mutable_data() { - if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()); - } - // Check it here statically - otherwise TypeMeta would throw the runtime - // error in attempt to invoke TypeMeta::ctor() - static_assert( - std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); - return static_cast(raw_mutable_data(TypeMeta::Make())); - } - - /** - * Returns the number of dimensions of the data. - */ - inline int ndim() const { - return dims_.size(); - } - /** - * Returns the size (i.e. the number of items) of the tensor. - */ - inline TIndex size() const { - return numel_; - } - /** - * Return the number of bytes each item takes in the tensor. - */ - inline size_t itemsize() const { - return storage_.itemsize(); - } - /** - * Returns the total number of bytes of the storage. - * - * This is equivalent to calling size() * itemsize(). - */ - inline size_t nbytes() const { - return numel_ * itemsize(); - ; - } - - inline size_t capacity_nbytes() const { - return storage_.capacity(); - } - /** - * Returns the dimensions of the tensor as a vector. - */ - inline const vector& dims() const { - return dims_; - } - - inline TIndex size_from_dim(int k) const { - return size_from_dim_(k, dims_); - } - - inline TIndex size_to_dim(int k) const { - return size_to_dim_(k, dims_); - } - - inline TIndex size_between_dim(int k, int l) const { - return size_between_dim_(k, l, dims_); - } - - /** - * Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param axis_index the axis index. - * If 0 <= index < ndim(), return index. - * If -ndim <= index <= -1, return (ndim() - (-index)), - * e.g., the last axis index (ndim() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int canonical_axis_index(int axis_index) const { - return canonical_axis_index_(axis_index, ndim()); - } - - /** - * Checks if the tensor content is of the given data type. - */ - template - inline bool IsType() const { - return storage_.IsType(); - } - /** - * Returns the TypeMeta object associated with the current data type. - */ - inline const TypeMeta& meta() const { - return storage_.dtype(); - } - - /** - * Returns the i-th dimension of the tensor in int. 
- * - * This function returns an int value instead of TIndex, which depending on - * the typedef could be int64. If you want int64 dim values, make sure you - * call dim() instead. - */ - inline int dim32(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); - return static_cast(dims_[i]); - } - - /** - * Returns the i-th dimension of the tensor. Note that the passed in index - * must be between 0 (inclusive) and the number of dimensions, otherwise - * this function will produce a fatal message. - */ - inline TIndex dim(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - return dims_[i]; - } - - void ExtractDeviceOption(DeviceOption* device) const { - GetStaticContext()->ExtractDeviceOption(device, raw_data()); - } - - const Storage& storage() { - return storage_; - } - - const Storage& storage() const { - return storage_; - } - - protected: - using DimVector = std::vector; - DimVector dims_; // sizes_ - TIndex numel_ = -1; // numel_ - // we decide to keep reserved_ and it will - // live in Tensor after the split - // The logic is that if Extend() or ReserveSpace() were ever called, - // then subsequent Resize()s will not free up Storage. - bool reserved_ = false; - Storage storage_; - // int64_t storage_offset_; - - private: - template < - typename T, - typename = typename std::enable_if::value>::type> - bool SetDims(const vector& src) { - auto old_numel = numel_; - dims_.resize(src.size()); - TIndex new_numel = 1; - for (size_t i = 0; i < src.size(); ++i) { - new_numel *= src[i]; - dims_[i] = src[i]; - } - numel_ = new_numel; - return numel_ != old_numel; - } - - bool SetDims() { - auto old_numel = numel_; - dims_.resize(0); - numel_ = 1; - return numel_ != old_numel; - } - - // TODO(jiayq): maybe rewrite the following functions with initializer list. - // NVCC does not play well with initializer lists last time, but worth - // another shot. 
- bool SetDims(const TIndex d0) { - auto old_numel = numel_; - dims_.resize(1); - dims_[0] = d0; - numel_ = d0; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1) { - auto old_numel = numel_; - dims_.resize(2); - dims_[0] = d0; - dims_[1] = d1; - numel_ = d0 * d1; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { - auto old_numel = numel_; - dims_.resize(3); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - numel_ = d0 * d1 * d2; - return numel_ != old_numel; - } - - bool - SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { - auto old_numel = numel_; - dims_.resize(4); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - dims_[3] = d3; - numel_ = d0 * d1 * d2 * d3; - return numel_ != old_numel; - } -}; - class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { UndefinedTensorImpl() : TensorImpl(CPU){}; diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc new file mode 100644 index 00000000000000..cff98c6101ea5d --- /dev/null +++ b/caffe2/core/tensor_impl.cc @@ -0,0 +1,14 @@ +#include "caffe2/core/tensor_impl.h" + +#include "caffe2/core/flags.h" + +CAFFE2_DEFINE_bool( + caffe2_keep_on_shrink, + true, + "If set, keeps memory when a tensor is shrinking its size."); + +CAFFE2_DEFINE_int64( + caffe2_max_keep_on_shrink_memory, + LLONG_MAX, + "The maximum memory in bytes to keep on shrink, if the difference between " + "tensor sizes is bigger than this then tensor will be reset."); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h new file mode 100644 index 00000000000000..ffa3ef61c4cc27 --- /dev/null +++ b/caffe2/core/tensor_impl.h @@ -0,0 +1,902 @@ +#pragma once + +#include "caffe2/core/storage.h" + +#include + +// A global boolean variable to control whether we free memory when a Tensor +// is shrinked to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. +CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + +namespace caffe2 { + +/** + * A utility function to convert vector to vector. 
+ */ +inline vector ToVectorTIndex(const std::vector& src) { + return vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline TIndex size_from_dim_(int k, const vector& dims) { + TIndex r = 1; + for (size_t i = k; i < dims.size(); ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline TIndex size_to_dim_(int k, const vector& dims) { + CAFFE_ENFORCE((unsigned)k <= dims.size()); + TIndex r = 1; + for (int i = 0; i < k; ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline TIndex size_between_dim_(int k, int l, const vector& dims) { + CAFFE_ENFORCE((unsigned)l < dims.size()); + TIndex r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +inline int canonical_axis_index_(int axis_index, int ndims) { + CAFFE_ENFORCE_GE(axis_index, -ndims); + CAFFE_ENFORCE_LT(axis_index, ndims); + if (axis_index < 0) { + return axis_index + ndims; + } + return axis_index; +} + +/** + * @brief TensorImpl is the implementation of a tensor and the basic class + * in Caffe2 that stores a contiguous memory with its shape information. + * + * The TensorImpl class is essentially a wrapper around a device-specific memory + * (the device is specified by the Context template argument), and deals with + * the allocation and de-allocation of such memory. We make a simplified + * assumption that the memory is always contiguous. + */ +class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { + public: + TensorImpl() = delete; + explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} + + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? + explicit TensorImpl(const vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + explicit TensorImpl(const vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + /* Now we require that context_for_copy has the same device type as src since + * template is removed + */ + TensorImpl( + const TensorImpl& src, + BaseContext* context_for_copy, + at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src, context_for_copy); + } + + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ + TensorImpl(const TensorImpl& src, at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src); + } + + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ + template + TensorImpl( + const vector& dims, + const vector& values, + BaseContext* context) + : storage_(context->GetDevicetype(), TypeMeta::Make()) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, values.data(), mutable_data()); + } + + /** + * @brief Creates a scalar tensor, and fills its content with the given value. 
+ * The type of tensor will be decided by the context parameter + */ + template < + typename T, + typename = typename std::enable_if::value>::type> + TensorImpl(const T& value, BaseContext* context) + : storage_(context->GetDevicetype(), TypeMeta::Make()) { + Resize(vector{}); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, &value, mutable_data()); + } + + /** + * @brief Delete the copy constructor and use Clone explicitly + */ + TensorImpl(const TensorImpl& src) = delete; + + TensorImpl(TensorImpl&& src) noexcept { + swap(src); + } + + TensorImpl& operator=(TensorImpl&&) = default; + // Note(jiayq): possibly a rule-of-three violation, but we explicitly + // discourage the use of = for Tensors. + TensorImpl& operator=(const TensorImpl& src) = delete; + + virtual ~TensorImpl() noexcept {} + + /* + * Since we removed template from tensor, we now store a static + * context pointer in tensor, which indicates the type of the tensor. + */ + BaseStaticContext* GetStaticContext() const { + return get_static_context(GetDeviceType()); + } + + /* @brief + * Create a context that has the same device_type + * as the tensor. + * Note that this doesn't support passing in argument + * TODO(jerryzh): move this to a global registry + * that can create context for us + */ + std::unique_ptr CreateContext() const { + return GetStaticContext()->CreateContext(); + } + + at::DeviceType GetDeviceType() const { + return storage_.device_type(); + } + + /** + * @brief Copies the data from a source tensor, with a contex provided to + * carry out the underlying memcpy operation. + */ + void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { + if ((void*)&src == (void*)this) { + return; + } + if (storage_.dtype() != src.meta()) { + storage_ = Storage(GetDeviceType(), src.meta()); + } + if (src.size() == -1) { + dims_.clear(); + numel_ = -1; + storage_.reset(); + return; + } + Resize(src.dims()); + if (size() > 0) { + if (storage_.dtype().copy()) { + CAFFE_ENFORCE( + GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + CAFFE_ENFORCE( + src.GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); + } else { + // We'll need to use a non-CPU context to perform the copy if + // one of the context is not CPU since only non-CPU context + // knows how to copy between CPU and that context + if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { + if (!context) { + src.CreateContext()->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } else { + CAFFE_ENFORCE( + context->GetDevicetype() == src.GetDeviceType(), + "Type for provided context does not match the type of source"); + context->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } + } else { + // In case source context is CPU, and target context is non-CPU + // We'll have to create a Context from target and perform the + // copy using that context + CreateContext()->CopyBytesFromCPU( + nbytes(), src.raw_data(), raw_mutable_data()); + } + } + } + } + + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. 
+ */ + void ExtendTo(TIndex num, float growthPct, BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - dims_[0], growthPct, context); + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. + */ + void Extend(TIndex num, float growthPct, BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER( + num, 0, "`num` must be non-negative for Extend"); + auto newDims = dims_; + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + auto newNumel = std::accumulate( + newDims.begin(), + newDims.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + dims_ = newDims; + numel_ = newNumel; + return; + } + auto newCapacity = dims_; + newCapacity[0] = std::max( + newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + auto* newData = raw_mutable_data(storage_.dtype()); + CAFFE_ENFORCE( + context != nullptr, "Context must be provided to Extend the tensor"); + context->CopyItemsSameDevice( + storage_.dtype(), oldSize, oldData.get(), newData); + reserved_ = true; + dims_ = newDims; + numel_ = newNumel; + } + + /** + * @brief Shrinks the outer-most dimension to given size, keeping the data. + * + * This method guarantees that no re-allocations are carried out, which means + * that the extra capacity after the end of the shurnk tensor is maintained. + */ + void ShrinkTo(TIndex outer_dim) { + CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); + CAFFE_ENFORCE_WITH_CALLER( + outer_dim <= dims_[0], + "New outer dimension must be smaller than current."); + CAFFE_ENFORCE( + storage_.unique(), + "Can't call ShrinkTo on shared storage, please call Resize instead."); + dims_[0] = outer_dim; + numel_ = std::accumulate( + dims_.begin(), + dims_.end(), + static_cast(1), + std::multiplies()); + } + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + template + void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE( + numel_ != -1, "size should be initialized before calling ReserveSpace"); + CAFFE_ENFORCE( + storage_.unique(), "Can't call ReserveSpace on shared storage."); + auto newCapacity = dims_; + newCapacity[0] = outer_dim; + auto newNumel = std::accumulate( + newCapacity.begin(), + newCapacity.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(storage_.dtype()); + dims_ = oldDims; + numel_ = oldSize; + reserved_ = true; + } + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. 
+ * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). + * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + */ + template + void Resize(Ts... dim_source) { + bool is_init = numel_ == -1; + bool size_changed = SetDims(dim_source...); + if (size_changed) { + // If needed, we will free the data. the next mutable_data() call + // will create the data storage. + bool reset_tensor = false; + if (reserved_) { + // If tensor is reserved then don't claim its memeory unless capacity() + // is smaller than new size + reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); + } else { + reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || + !FLAGS_caffe2_keep_on_shrink || + storage_.capacity() - numel_ * storage_.itemsize() > + FLAGS_caffe2_max_keep_on_shrink_memory; + } + + if (reset_tensor && !is_init) { + FreeMemory(); + } + } + } + + /** + * Resize the tensor like the source tensor. Note that this is just a + * sugar wrapper that essentially calls Resize(src_tensor.dims()). + */ + inline void ResizeLike(const TensorImpl& src_tensor) { + // Note: need casting for different context types. + if (static_cast(this) != static_cast(&src_tensor)) { + Resize(src_tensor.dims()); + } + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + inline void Reshape(const vector& dims) { + TIndex new_size = 1; + for (auto d : dims) { + CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); + new_size *= d; + } + CAFFE_ENFORCE_WITH_CALLER( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + dims_ = dims; + } + + inline void Reshape(const vector& dims) { + Reshape(ToVectorTIndex(dims)); + } + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + inline void FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = Storage(storage_.device_type(), storage_.dtype()); + } + + /** + * A utility function to print the debug string for the tensor. Note that this + * is very slow since it involves quite some string operations, so do not use + * it in your performance-critical code. + */ + string DebugString() const { + std::stringstream ss; + ss << "A Tensor of item size " << storage_.itemsize() << " and type " + << storage_.dtype().name() << " and dimension ("; + for (int d : dims_) { + ss << d << ","; + } + ss << ")."; + return ss.str(); + } + + void swap(TensorImpl& other) noexcept { + std::swap(dims_, other.dims_); + std::swap(numel_, other.numel_); + std::swap(storage_, other.storage_); + } + + /** + * @brief Shares the data with another tensor. + * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. 
The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + void ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an ENFORCE here which might affect perf a little bit. + CAFFE_ENFORCE_EQ_WITH_CALLER( + src.numel_, + numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + CAFFE_ENFORCE_WITH_CALLER( + src.storage_.data() || src.numel_ == 0, + "Source tensor has no content and has size > 0"); + // Finally, do sharing. + /* Since we create new Storage whenever we need to change data_type/capacity + * this still keeps the original semantics + */ + storage_ = src.storage(); + } + + /** + * @brief Shares the data with an externally managed pointer. + * + * This is similar to ShareData() but the source is a pointer with an advanced + * deleter option. In default, no deletion takes place, and one needs to make + * sure that the external memory is deallocated only after the tensor finishes + * using it. If a Deleter object is passed in, when this tensor is reallocated + * or freed, the deleter function is going to be called. + */ + template + void + ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { + ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { + ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); + } + + void ShareExternalPointer( + void* src, + const TypeMeta& data_type, + size_t capacity = 0, + MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + ShareExternalPointer( + at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!capacity) { + capacity = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "To share data with a raw pointer, you need to set shape first."); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } else { + int64_t numel = capacity / data_type.itemsize(); + // Create a new Storage + storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); + } + } + + /** + * Returns a const raw void* pointer of the underlying storage. mutable_data() + * or raw_mutable_data() must have been called prior to this function call. 
+ */ + inline const void* raw_data() const { + CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); + return storage_.data(); + } + + /** + * Returns a typed pointer of the underlying storage. mutable_data() or + * raw_mutable_data() must have been called prior to this function call, and + * the data type must be of the correct type. If you want to get a void* + * pointer instead, use raw_data(). + */ + template + inline const T* data() const { + CAFFE_ENFORCE_WITH_CALLER( + storage_.data() || numel_ == 0, + "The tensor is of non-zero shape, but its data is not allocated yet. " + "Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + CAFFE_ENFORCE_WITH_CALLER( + IsType(), + "Tensor type mismatch, caller expects elements to be ", + TypeMeta::TypeName(), + ", while tensor contains ", + storage_.dtype().name(), + ". "); + return static_cast(storage_.data()); + } + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data(const TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { + return storage_.data(); + } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); + bool had_special_dtor = storage_.dtype().dtor() != nullptr; + if (storage_.unique()) { + storage_.set_dtype(meta); + } else { + if (storage_.dtype() != meta) { + storage_ = Storage(storage_.device_type(), meta); + } + } + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.ctor() == nullptr && !had_special_dtor && + storage_.numel() >= numel_)) { + return storage_.data(); + } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); + if (meta.ctor()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. + auto size = numel_; + auto dtor = storage_.dtype().dtor(); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); + storage_.dtype().ctor()(storage_.data(), numel_); + } else { + // For fundamental type, new and delete is easier. 
+ auto ptr_and_deleter = + GetStaticContext()->New(numel_ * storage_.itemsize()); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); + } + storage_.set_numel(numel_); + return storage_.data(); + } + } + + /** + * Returns a mutable raw pointer of the underlying storage. This can only be + * used when you know for sure that the underlying storage of the tensor is + * already created via an earlier raw_mutable_data(meta) call or a + * mutable_data() call. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data() { + CAFFE_ENFORCE_WITH_CALLER( + storage_.dtype().id() != TypeIdentifier::uninitialized(), + "Calling raw_mutable_data() without meta, but the current meta is " + "of unknown type."); + return raw_mutable_data(storage_.dtype()); + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. + */ + template + inline T* mutable_data() { + if ((numel_ == 0 || storage_.data()) && IsType()) { + return static_cast(storage_.data()); + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructible types"); + return static_cast(raw_mutable_data(TypeMeta::Make())); + } + + /** + * Returns the number of dimensions of the data. + */ + inline int ndim() const { + return dims_.size(); + } + /** + * Returns the size (i.e. the number of items) of the tensor. + */ + inline TIndex size() const { + return numel_; + } + /** + * Return the number of bytes each item takes in the tensor. + */ + inline size_t itemsize() const { + return storage_.itemsize(); + } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). + */ + inline size_t nbytes() const { + return numel_ * itemsize(); + ; + } + + inline size_t capacity_nbytes() const { + return storage_.capacity(); + } + /** + * Returns the dimensions of the tensor as a vector. + */ + inline const vector& dims() const { + return dims_; + } + + inline TIndex size_from_dim(int k) const { + return size_from_dim_(k, dims_); + } + + inline TIndex size_to_dim(int k) const { + return size_to_dim_(k, dims_); + } + + inline TIndex size_between_dim(int k, int l) const { + return size_between_dim_(k, l, dims_); + } + + /** + * Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param axis_index the axis index. + * If 0 <= index < ndim(), return index. + * If -ndim <= index <= -1, return (ndim() - (-index)), + * e.g., the last axis index (ndim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ + inline int canonical_axis_index(int axis_index) const { + return canonical_axis_index_(axis_index, ndim()); + } + + /** + * Checks if the tensor content is of the given data type. + */ + template + inline bool IsType() const { + return storage_.IsType(); + } + /** + * Returns the TypeMeta object associated with the current data type. + */ + inline const TypeMeta& meta() const { + return storage_.dtype(); + } + + /** + * Returns the i-th dimension of the tensor in int. 
+ * + * This function returns an int value instead of TIndex, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. + */ + inline int dim32(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); + return static_cast(dims_[i]); + } + + /** + * Returns the i-th dimension of the tensor. Note that the passed in index + * must be between 0 (inclusive) and the number of dimensions, otherwise + * this function will produce a fatal message. + */ + inline TIndex dim(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + return dims_[i]; + } + + void ExtractDeviceOption(DeviceOption* device) const { + GetStaticContext()->ExtractDeviceOption(device, raw_data()); + } + + const Storage& storage() { + return storage_; + } + + const Storage& storage() const { + return storage_; + } + + protected: + using DimVector = std::vector; + DimVector dims_; // sizes_ + TIndex numel_ = -1; // numel_ + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ = false; + Storage storage_; + // int64_t storage_offset_; + + private: + template < + typename T, + typename = typename std::enable_if::value>::type> + bool SetDims(const vector& src) { + auto old_numel = numel_; + dims_.resize(src.size()); + TIndex new_numel = 1; + for (size_t i = 0; i < src.size(); ++i) { + new_numel *= src[i]; + dims_[i] = src[i]; + } + numel_ = new_numel; + return numel_ != old_numel; + } + + bool SetDims() { + auto old_numel = numel_; + dims_.resize(0); + numel_ = 1; + return numel_ != old_numel; + } + + // TODO(jiayq): maybe rewrite the following functions with initializer list. + // NVCC does not play well with initializer lists last time, but worth + // another shot. + bool SetDims(const TIndex d0) { + auto old_numel = numel_; + dims_.resize(1); + dims_[0] = d0; + numel_ = d0; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1) { + auto old_numel = numel_; + dims_.resize(2); + dims_[0] = d0; + dims_[1] = d1; + numel_ = d0 * d1; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { + auto old_numel = numel_; + dims_.resize(3); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + numel_ = d0 * d1 * d2; + return numel_ != old_numel; + } + + bool + SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { + auto old_numel = numel_; + dims_.resize(4); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + dims_[3] = d3; + numel_ = d0 * d1 * d2 * d3; + return numel_ != old_numel; + } +}; + +} From 75f49befeb922a577a1e0e38f26e963b57c1f8ea Mon Sep 17 00:00:00 2001 From: Roy Li Date: Thu, 13 Sep 2018 12:25:20 -0700 Subject: [PATCH 126/237] move instance_norm to aten (#10792) Summary: This also removes the usage of torch.onnx.symbolic_override in instance_norm. Fixes #8439. 
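As a rough sketch (not part of this patch; the input shape and affine parameters are arbitrary), the Python entry point looks the same to callers and simply forwards to the new ATen op:

```
# Hypothetical usage: F.instance_norm now dispatches to torch.instance_norm,
# the ATen kernel added in this patch, instead of reshaping and calling
# batch_norm from Python.
import torch
import torch.nn.functional as F

x = torch.randn(5, 3, 64, 64)   # (N, C, H, W)
weight = torch.ones(3)          # optional per-channel scale
bias = torch.zeros(3)           # optional per-channel shift

out = F.instance_norm(x, weight=weight, bias=bias,
                      use_input_stats=True, momentum=0.1, eps=1e-5)
assert out.shape == x.shape
```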
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10792 Differential Revision: D9800643 Pulled By: li-roy fbshipit-source-id: fa13a57de5a31fbfa2d4d02639d214c867b9e1f1 --- aten/src/ATen/native/Normalization.cpp | 39 +++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 3 ++ test/test_jit.py | 15 ++++++-- torch/nn/functional.py | 44 ++-------------------- torch/onnx/symbolic.py | 32 ++++++++-------- 5 files changed, 73 insertions(+), 60 deletions(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 750e42568715d7..ed0a94ae496718 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -14,6 +14,13 @@ namespace { AT_CHECK(actual == expected, arg_name, " should contain ", expected, " elements not ", actual); } + + static inline Tensor repeat_if_defined(const Tensor& t, int64_t repeat) { + if (t.defined()) { + return t.repeat(repeat); + } + return t; + } } Tensor batch_norm( @@ -80,6 +87,38 @@ Tensor batch_norm( running_mean, running_var, training, momentum, eps); } +Tensor instance_norm( + const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, + const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, + bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { + AT_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), + "Expected running_mean and running_var to be defined when use_input_stats is false"); + std::vector shape = input.sizes().vec(); + int64_t b = input.size(0); + int64_t c = input.size(1); + shape[1] = b * c; + shape[0] = 1; + + Tensor weight_ = repeat_if_defined(weight, b); + Tensor bias_ = repeat_if_defined(bias, b); + Tensor running_mean_ = repeat_if_defined(running_mean, b); + Tensor running_var_ = repeat_if_defined(running_var, b); + + auto input_reshaped = input.contiguous().view(shape); + auto out = at::batch_norm(input_reshaped, weight_, bias_, running_mean_, running_var_, + use_input_stats, momentum, eps, cudnn_enabled); + + // we alias running_mean and running_var because they are const but we want to modify their data + if (running_mean.defined()) { + at::alias(running_mean).copy_(running_mean_.view({ b, c }).mean(0, false)); + } + if (running_var.defined()) { + at::alias(running_var).copy_(running_var_.view({ b, c }).mean(0, false)); + } + + return out.view(input.sizes()); +} + Tensor layer_norm(const Tensor& input, IntList normalized_shape, const Tensor& weight /* optional */, const Tensor& bias /* optional */, double eps, bool cudnn_enabled) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index af58e45ebe0ebd..c15aefa2f671d6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -873,6 +873,9 @@ - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor variants: function, method +- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? 
running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) -> Tensor + variants: function + - func: inverse(Tensor self) -> Tensor variants: function, method diff --git a/test/test_jit.py b/test/test_jit.py index 61a6b94db9935c..f6a4347eae86cb 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6966,9 +6966,8 @@ def test_dcgan_models_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_dcgan_models(self, device='cuda', check_export_import=False) - # XXX: When this is fixed, write a CUDA test for this. - @unittest.skip('https://github.com/pytorch/pytorch/issues/8439 InstanceNormalization bug') - def test_neural_style(self): + @staticmethod + def _test_neural_style(self, device, check_export_import=True): class TransformerNet(torch.nn.Module): def __init__(self): super(TransformerNet, self).__init__() @@ -7065,7 +7064,15 @@ def forward(self, x): out = self.conv2d(out) return out - self.checkTrace(TransformerNet(), (torch.rand(5, 3, 224, 224),)) + self.checkTrace(TransformerNet(), (torch.rand(5, 3, 64, 64),), export_import=check_export_import) + + def test_neural_style(self): + self._test_neural_style(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_neural_style_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_neural_style(self, device='cuda', check_export_import=False) @staticmethod def _test_mnist(self, device, check_export_import=True): diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 7823d445a0c13a..690112424db560 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1357,46 +1357,10 @@ def instance_norm(input, running_mean=None, running_var=None, weight=None, See :class:`~torch.nn.InstanceNorm1d`, :class:`~torch.nn.InstanceNorm2d`, :class:`~torch.nn.InstanceNorm3d` for details. 
""" - if not use_input_stats and (running_mean is None or running_var is None): - raise ValueError('Expected running_mean and running_var to be not None when use_input_stats=False') - - b, c = input.size(0), input.size(1) - if weight is not None: - weight = weight.repeat(b) - if bias is not None: - bias = bias.repeat(b) - - import torch.onnx.symbolic - - @torch.onnx.symbolic_override(torch.onnx.symbolic.instance_norm) - def _instance_norm(input, running_mean=None, running_var=None, weight=None, - bias=None, use_input_stats=None, momentum=None, eps=None): - # Repeat stored stats and affine transform params if necessary - if running_mean is not None: - running_mean_orig = running_mean - running_mean = running_mean_orig.repeat(b) - if running_var is not None: - running_var_orig = running_var - running_var = running_var_orig.repeat(b) - - # Apply instance norm - input_reshaped = input.contiguous().view(1, b * c, *input.size()[2:]) - - out = batch_norm( - input_reshaped, running_mean, running_var, weight=weight, bias=bias, - training=use_input_stats, momentum=momentum, eps=eps) - - # Reshape and copy back - if running_mean is not None: - running_mean_orig.copy_(running_mean.view(b, c).mean(0, keepdim=False)) - if running_var is not None: - running_var_orig.copy_(running_var.view(b, c).mean(0, keepdim=False)) - - return out.view(b, c, *input.size()[2:]) - return _instance_norm(input, running_mean=running_mean, - running_var=running_var, weight=weight, bias=bias, - use_input_stats=use_input_stats, momentum=momentum, - eps=eps) + return torch.instance_norm( + input, weight, bias, running_mean, running_var, + use_input_stats, momentum, eps, torch.backends.cudnn.enabled + ) def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-5): diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index d9c5964109a0c4..30e8672be601fd 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -716,6 +716,22 @@ def batch_norm(g, input, weight, bias, running_mean, running_var, training, mome return res +@parse_args('v', 'v', 'v', 'v', 'v', 'i', 'f', 'f', 'i') +def instance_norm(g, input, weight, bias, running_mean, running_var, use_input_stats, momentum, eps, cudnn_enabled): + input_sizes = input.type().sizes() + if weight is None or weight.node().kind() == "prim::Undefined": + assert len(input_sizes) > 1 + weight_value = torch.tensor([1.] * input_sizes[1]).type( + 'torch.' + input.type().scalarType() + 'Tensor') + weight = g.op("Constant", value_t=weight_value) + if bias is None or bias.node().kind() == "prim::Undefined": + assert len(input_sizes) > 1 + bias_value = torch.tensor([0.] * input_sizes[1]).type( + 'torch.' 
+ input.type().scalarType() + 'Tensor') + bias = g.op("Constant", value_t=bias_value) + return g.op("InstanceNormalization", input, weight, bias, epsilon_f=eps) + + @parse_args('v', 'i', 'i', 'i') def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) @@ -1027,22 +1043,6 @@ def repeat(g, self, repeats): return g.op("Tile", self, repeats) -def instance_norm(g, input, **kwargs): - input_type = input.type().scalarType() - weight = kwargs.get("weight", None) - bias = kwargs.get("bias", None) - eps = kwargs.get("eps", 1e-5) - if weight is None: - weight = g.constant(1.0, [input.type().sizes()[1]], input_type) - else: - weight = g.op('Constant', value_t=weight) - if bias is None: - bias = g.constant(0.0, [input.type().sizes()[1]], input_type) - else: - bias = g.op('Constant', value_t=bias) - return g.op("InstanceNormalization", input, weight, bias, epsilon_f=eps) - - def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases, num_layers, dropout, train, bidirectional, batch_first=None, batch_sizes=None): weights_per_layer = 4 if has_biases else 2 From acb6f18bab4bf7c801e445cf9b438cec827829ae Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Thu, 13 Sep 2018 12:26:58 -0700 Subject: [PATCH 127/237] fix generate_code.py caching (#11644) Summary: Currently, because of some setup.py logic, `ninja` caching of the `generate_code.py` build step was broken. This resulted in `generate_code.py` running every single time builds were happening, regardless of whether inputs changed. This updated logic fixes the input caching Pull Request resolved: https://github.com/pytorch/pytorch/pull/11644 Reviewed By: orionr Differential Revision: D9814348 Pulled By: soumith fbshipit-source-id: 2012960908d0f600488d410094095cfd72adc34f --- setup.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 03e7b745a83539..533b6dee4fcc53 100644 --- a/setup.py +++ b/setup.py @@ -123,6 +123,7 @@ import distutils.command.build import distutils.command.clean import distutils.sysconfig +import filecmp import platform import subprocess import shutil @@ -457,9 +458,14 @@ def check_file(f): sym_files = ['tools/shared/cwrap_common.py', 'tools/shared/_utils_internal.py'] orig_files = ['aten/src/ATen/common_with_cwrap.py', 'torch/_utils_internal.py'] for sym_file, orig_file in zip(sym_files, orig_files): + same = False if os.path.exists(sym_file): - os.remove(sym_file) - shutil.copyfile(orig_file, sym_file) + if filecmp.cmp(sym_file, orig_file): + same = True + else: + os.remove(sym_file) + if not same: + shutil.copyfile(orig_file, sym_file) # Copy headers necessary to compile C++ extensions. # From 0f1ca569ceae07b800f037805aa60691b8a2e801 Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 13 Sep 2018 12:32:41 -0700 Subject: [PATCH 128/237] End-to-end dynamic slicing with ONNX DynamicSlice experimental operator (#11255) Summary: Requires https://github.com/onnx/onnx/pull/1377 This PR makes it so that slices with dynamic boundary values can be exported from pytorch and run in caffe2 via ONNX. 
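As a concrete illustration, a toy module along the lines of the new tests in this patch, whose slice bounds depend on the input's runtime shape, can now be exported; the shape-dependent bounds are lowered to the experimental DynamicSlice op instead of a static Slice:

    import io
    import torch

    class DynamicSliceModule(torch.nn.Module):
        def forward(self, x):
            # Bounds depend on x's runtime shape, so they cannot be baked
            # into the static starts/ends attributes of an ONNX Slice node.
            return x[: x.size(0) - 1, 1 : x.size(2)]

    x = torch.rand(5, 5, 5)
    f = io.BytesIO()
    torch.onnx.export(DynamicSliceModule(), (x,), f)

On the caffe2 import side, DynamicSlice is mapped back onto the existing Slice operator after fully expanding the starts/ends index tensors (see Caffe2Backend::CreateDynamicSlice in the backend.cc diff below).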
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11255 Differential Revision: D9790216 Pulled By: jamesr66a fbshipit-source-id: 6adfcddc5788df4d34d7ca98341077140402a3e2 --- caffe2/onnx/backend.cc | 169 +++++++++++++++- caffe2/onnx/backend.h | 13 +- caffe2/operators/slice_op.cc | 4 +- caffe2/operators/slice_op.cu | 180 ++++++++++++------ caffe2/operators/slice_op.h | 49 +++-- caffe2/python/onnx/backend.py | 2 +- ...estScript.test_export_dynamic_slice.expect | 40 ++++ .../expect/TestOperators.test_acos.expect | 2 +- .../TestOperators.test_add_broadcast.expect | 2 +- ...stOperators.test_add_left_broadcast.expect | 2 +- ...tOperators.test_add_size1_broadcast.expect | 2 +- ...tors.test_add_size1_right_broadcast.expect | 2 +- ....test_add_size1_singleton_broadcast.expect | 2 +- .../TestOperators.test_addconstant.expect | 2 +- .../expect/TestOperators.test_addmm.expect | 2 +- .../expect/TestOperators.test_asin.expect | 2 +- .../expect/TestOperators.test_at_op.expect | 2 +- .../expect/TestOperators.test_atan.expect | 2 +- .../expect/TestOperators.test_basic.expect | 2 +- .../TestOperators.test_batchnorm.expect | 2 +- .../TestOperators.test_batchnorm_1d.expect | 2 +- ...stOperators.test_batchnorm_noaffine.expect | 2 +- ...stOperators.test_batchnorm_training.expect | 2 +- .../expect/TestOperators.test_chunk.expect | 2 +- .../expect/TestOperators.test_clip.expect | 2 +- .../expect/TestOperators.test_clip_max.expect | 2 +- .../expect/TestOperators.test_clip_min.expect | 2 +- .../expect/TestOperators.test_concat2.expect | 2 +- .../expect/TestOperators.test_conv.expect | 2 +- .../TestOperators.test_convtranspose.expect | 2 +- .../onnx/expect/TestOperators.test_cos.expect | 2 +- .../onnx/expect/TestOperators.test_elu.expect | 2 +- .../expect/TestOperators.test_equal.expect | 2 +- .../onnx/expect/TestOperators.test_exp.expect | 2 +- .../expect/TestOperators.test_flatten.expect | 2 +- test/onnx/expect/TestOperators.test_ge.expect | 2 +- test/onnx/expect/TestOperators.test_gt.expect | 2 +- .../expect/TestOperators.test_hardtanh.expect | 2 +- .../expect/TestOperators.test_index.expect | 2 +- test/onnx/expect/TestOperators.test_le.expect | 2 +- .../TestOperators.test_logsoftmax.expect | 2 +- test/onnx/expect/TestOperators.test_lt.expect | 2 +- .../onnx/expect/TestOperators.test_max.expect | 2 +- .../expect/TestOperators.test_maxpool.expect | 2 +- .../expect/TestOperators.test_mean.expect | 2 +- .../onnx/expect/TestOperators.test_min.expect | 2 +- test/onnx/expect/TestOperators.test_mm.expect | 2 +- .../expect/TestOperators.test_norm.expect | 2 +- .../onnx/expect/TestOperators.test_pad.expect | 2 +- .../expect/TestOperators.test_params.expect | 2 +- .../expect/TestOperators.test_permute2.expect | 2 +- .../onnx/expect/TestOperators.test_pow.expect | 2 +- .../expect/TestOperators.test_prod.expect | 2 +- .../TestOperators.test_reduced_mean.expect | 2 +- ...Operators.test_reduced_mean_keepdim.expect | 2 +- .../TestOperators.test_reduced_prod.expect | 2 +- ...Operators.test_reduced_prod_keepdim.expect | 2 +- .../TestOperators.test_reduced_sum.expect | 2 +- ...tOperators.test_reduced_sum_keepdim.expect | 2 +- .../expect/TestOperators.test_repeat.expect | 2 +- ...tOperators.test_repeat_dim_overflow.expect | 2 +- .../expect/TestOperators.test_rsub.expect | 2 +- .../expect/TestOperators.test_selu.expect | 2 +- .../onnx/expect/TestOperators.test_sin.expect | 2 +- .../expect/TestOperators.test_slice.expect | 2 +- .../expect/TestOperators.test_sqrt.expect | 2 +- .../onnx/expect/TestOperators.test_sum.expect 
| 2 +- ...estOperators.test_symbolic_override.expect | 2 +- ...ators.test_symbolic_override_nested.expect | 2 +- .../onnx/expect/TestOperators.test_tan.expect | 2 +- .../TestOperators.test_transpose.expect | 2 +- .../expect/TestOperators.test_type_as.expect | 2 +- .../TestOperators.test_unsqueeze.expect | 2 +- .../expect/TestOperators.test_upsample.expect | 2 +- .../expect/TestOperators.test_view.expect | 2 +- test/onnx/test_pytorch_onnx_caffe2.py | 50 +++-- test/test_jit.py | 19 ++ torch/onnx/symbolic.py | 16 +- 78 files changed, 503 insertions(+), 175 deletions(-) create mode 100644 test/expect/TestScript.test_export_dynamic_slice.expect diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 5ccea70926fb6f..2350910febff27 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -301,7 +301,8 @@ Caffe2Backend::get_renamed_operators() const { {"Less", "LT"}, {"Greater", "GT"}, {"Unsqueeze", "ExpandDims"}, - {"Tile", "NumpyTile"}}; + {"Tile", "NumpyTile"}, + {"DynamicSlice", "Slice"}}; return kRenamedOperators; } @@ -356,7 +357,8 @@ Caffe2Backend::get_special_operators() const { {"MatMul", &Caffe2Backend::CreateMatMul}, {"Upsample", &Caffe2Backend::CreateUpsample}, {"Dropout", &Caffe2Backend::CreateDropout}, - {"LRN", &Caffe2Backend::CreateLRN}}; + {"LRN", &Caffe2Backend::CreateLRN}, + {"DynamicSlice", &Caffe2Backend::CreateDynamicSlice}}; return kSpecialOperators; } @@ -899,7 +901,6 @@ Caffe2Ops Caffe2Backend::CreateSlice( auto starts_vals_tensor = dummy_->NewDummyName(); auto starts_tensor = dummy_->NewDummyName(); - auto casted_starts_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_starts; @@ -936,12 +937,9 @@ Caffe2Ops Caffe2Backend::CreateSlice( caffe2::Argument to; to.set_name("to"); to.set_i(static_cast(caffe2::TensorProto::INT32)); - c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {starts_tensor}, {casted_starts_tensor}, {to}); auto ends_vals_tensor = dummy_->NewDummyName(); auto ends_tensor = dummy_->NewDummyName(); - auto casted_ends_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_ends; @@ -965,17 +963,168 @@ Caffe2Ops Caffe2Backend::CreateSlice( "ScatterAssign", {ends_tensor, axes_tensor, ends_vals_tensor}, {ends_tensor}); - // Slice only accepts ends as int + + // attach the original op at the end + c2_op = ret.ops.Add(); + c2_op->CopyFrom(*op); + c2_op->mutable_input()->Clear(); + c2_op->add_input(data); + c2_op->add_input(starts_tensor); + c2_op->add_input(ends_tensor); + c2_op->mutable_arg()->Clear(); + for (const auto& kv : args) { + c2_op->add_arg()->CopyFrom(*kv.second); + } + + return ret; +} + +// Do the following: +// for a given index tensor (i.e. `starts` or `ends`): +// 1) Hilariously subtract 1 from the value if it is negative. This due to +// the behavior of Caffe2's slice operator not matching that of ONNX's slice +// 2) Fully expand the index tensor out to the rank of the data tensor. 
+// pseudocode: indices_full = zeros(rank); indices_full[axes] = indices.int() +std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value) { + auto indices_tensor_full = dummy_->NewDummyName(); + + { + caffe2::Argument value; + value.set_name("value"); + value.set_i(default_value); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument input_as_shape; + input_as_shape.set_name("input_as_shape"); + input_as_shape.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ConstantFill", {rank_tensor}, {indices_tensor_full}, + {value, dtype, input_as_shape}); + } + + // Subtract 1 from each element of the indices tensor that is negative + auto lt_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "LT", {indices_tensor, zero_tensor}, {lt_tensor}, {broadcast}); + } + + auto sub_one_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Sub", {indices_tensor, one_tensor}, {sub_one_tensor}, {broadcast}); + } + + auto indices_tensor_adjusted = dummy_->NewDummyName(); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Conditional", {lt_tensor, sub_one_tensor, indices_tensor}, {indices_tensor_adjusted}, {}); + + // Fill in values specified from the partially-specified ONNX indices tensor + c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ScatterAssign", + {indices_tensor_full, axes_tensor, indices_tensor_adjusted}, + {indices_tensor_full}); + + return indices_tensor_full; +} + +Caffe2Ops Caffe2Backend::CreateDynamicSlice( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); + CAFFE_ENFORCE_EQ(op_tmp.ops.size(), 1); + auto* op = op_tmp.ops.Mutable(0); + std::unordered_map args; + for (auto& arg : *op->mutable_arg()) { + args.emplace(arg.name(), &arg); + } + + CAFFE_ENFORCE_GE(op->input_size(), 1); + auto data = op->input(0); + Caffe2Ops ret; + + // First get the shape of the input tensor + auto* c2_op = ret.ops.Add(); + auto size_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {data}, {size_tensor}); + + // Now get the rank of the tensor by getting the shape of the shape of + // the input tensor c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {ends_tensor}, {casted_ends_tensor}, {to}); + auto rank_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {size_tensor}, {rank_tensor}); + + // Axes tensor will be used to populate the fully-specified starts and ends + // arguments to the caffe2 Slice operator. 
+ std::string axes_tensor; + if (onnx_node->node.input_size() > 2) { + axes_tensor = onnx_node->node.input(3); + } else { + axes_tensor = dummy_->NewDummyName(); + auto* c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Range", {rank_tensor}, {axes_tensor}, {}); + } + + // Useful int tensors + auto define_integer_constant = [this, &ret](int val) { + caffe2::Argument value; + value.set_name("value"); + value.set_i(val); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument shape; + shape.set_name("shape"); + shape.add_ints(1); + auto c2_op = ret.ops.Add(); + auto name = dummy_->NewDummyName(); + BuildOperator(c2_op, "ConstantFill", {}, {name}, + {value, dtype, shape}); + return name; + }; + + auto zero_tensor = define_integer_constant(0); + auto one_tensor = define_integer_constant(1); + + auto starts_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(1), // starts + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + 0); + + auto ends_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(2), // ends + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + -1); // attach the original op at the end c2_op = ret.ops.Add(); c2_op->CopyFrom(*op); c2_op->mutable_input()->Clear(); c2_op->add_input(data); - c2_op->add_input(casted_starts_tensor); - c2_op->add_input(casted_ends_tensor); + c2_op->add_input(starts_tensor_full); + c2_op->add_input(ends_tensor_full); c2_op->mutable_arg()->Clear(); for (const auto& kv : args) { c2_op->add_arg()->CopyFrom(*kv.second); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index eab0e2f7e1f131..2b74dec1e3ccea 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,7 +11,7 @@ #include #include -constexpr int kKnownOpsetVersion = 7; +constexpr int kKnownOpsetVersion = 9; namespace caffe2 { namespace onnx { @@ -212,6 +212,17 @@ class CAFFE2_API Caffe2Backend { Caffe2Ops CreateSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + std::string PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value); + + Caffe2Ops CreateDynamicSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + Caffe2Ops CreateSplit(OnnxNode* onnx_node, const ConversionContext& ctx); Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, const ConversionContext& ctx); diff --git a/caffe2/operators/slice_op.cc b/caffe2/operators/slice_op.cc index 529394b9092555..93c039c965f448 100644 --- a/caffe2/operators/slice_op.cc +++ b/caffe2/operators/slice_op.cc @@ -3,8 +3,8 @@ namespace caffe2 { -REGISTER_CPU_OPERATOR(Slice, SliceOp); -REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); +REGISTER_CPU_OPERATOR(Slice, SliceOp); +REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); OPERATOR_SCHEMA(Slice) .NumInputs(1, 3) diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index e2523ad7cbf3fc..5de302814ba2aa 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -231,79 +231,133 @@ bool SliceImplGpu( } // namespace -template <> -bool SliceOp::RunOnDevice() { - auto* output = Output(0); - auto& data = Input(0); +template<> +class SliceOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template 
GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + bool RunOnDevice() override { + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() > 1) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * ends_.size()); - statically_inited_ = true; + template + bool DoRunWithType() { + auto* output = Output(0); + auto& data = Input(0); + + if (InputSize() > 1) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + statically_inited_ = true; + } } + + return SliceImplGpu( + output, data, starts_host_, ends_host_, &context_); } + private: + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; - return SliceImplGpu( - output, data, starts_host_, ends_host_, &context_); -} +}; // class SliceOp -REGISTER_CUDA_OPERATOR(Slice, SliceOp); +REGISTER_CUDA_OPERATOR(Slice, SliceOp); template <> -bool SliceGradientOp::RunOnDevice() { - auto* gdata = Output(0); - auto& data = Input(0); +class SliceGradientOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() == 4) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); + template + bool DoRunWithType() { + auto* gdata = Output(0); + auto& data = Input(0); - auto& go = Input(3); + if (InputSize() == 4) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * ends_.size()); - - statically_inited_ = true; - } - auto& go = Input(1); + auto& go = Input(3); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, 
&context_, gdata, &go); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + + statically_inited_ = true; + } + auto& go = Input(1); + + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + } } -} -REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); + private: + + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; +}; // class SliceGradientOp +REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); } // namespace caffe2 diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index 8f291affb8e8d0..aa8d4e50f0f9d9 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -198,22 +198,26 @@ bool SliceImpl( } // namespace -template +template class SliceOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} bool RunOnDevice() override { - return RunOnDeviceImpl(Input(0), Output(0)); + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } } - protected: - bool RunOnDeviceImpl(const Tensor& data, Tensor* output) { + template + bool DoRunWithType() { if (InputSize() > 1) { starts_host_.CopyFrom(Input(1)); ends_host_.CopyFrom(Input(2)); @@ -238,31 +242,45 @@ class SliceOp : public Operator { } } + auto data = Input(0); + auto output = Output(0); + return SliceImpl( output, data, starts_host_, ends_host_, &context_); } AT_DISABLE_COPY_AND_ASSIGN(SliceOp); - private: - std::vector starts_; - std::vector ends_; + protected: + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; }; -template +template class SliceGradientOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } + + template + bool DoRunWithType() { auto* gdata = Output(0); auto& data = Input(0); @@ -301,11 +319,10 @@ class SliceGradientOp : public Operator { } } - AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); - private: - std::vector starts_; - std::vector ends_; + + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 79dadb091488e6..3d9239c8b5c92b 100644 --- 
a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -140,7 +140,7 @@ class Caffe2Backend(Backend): # If you increase this, make SURE you cross-reference all BC-breaking # changes from one version to the next, and any that you did not # implement, mark as broken in _broken_operators - _known_opset_version = 7 + _known_opset_version = 9 # This dictionary will record operators which are KNOWN to be # broken, so we give a good error message rather than do something diff --git a/test/expect/TestScript.test_export_dynamic_slice.expect b/test/expect/TestScript.test_export_dynamic_slice.expect new file mode 100644 index 00000000000000..7182fec515ae74 --- /dev/null +++ b/test/expect/TestScript.test_export_dynamic_slice.expect @@ -0,0 +1,40 @@ +ModelProto { + producer_name: "pytorch" + domain: "" + doc_string: "" + graph: + GraphProto { + name: "torch-jit-export" + inputs: [{name: "x", type:Tensor dims: 3 4 5}] + outputs: [{name: "7", type:Tensor dims: 4 5}] + initializers: [] + nodes: [ + Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Gather", inputs: [x,2], outputs: [3], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Shape", inputs: [x], outputs: [4], attributes: []}, + Node {type: "Gather", inputs: [4,1], outputs: [5], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Loop", inputs: [5,6,3], outputs: [7], attributes: [{ name: 'body', type: graph, value: + GraphProto { + name: "torch-jit-export1" + inputs: [{name: "i", type:Tensor dims: },{name: "cond", type:Tensor dims: },{name: "10", type:Tensor dims: }] + outputs: [{name: "18", type:Tensor dims: },{name: "17", type:Tensor dims: }] + initializers: [] + nodes: [ + Node {type: "Constant", inputs: [], outputs: [11], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Unsqueeze", inputs: [2], outputs: [12], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [i], outputs: [13], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [11], outputs: [14], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "DynamicSlice", inputs: [x,12,13,14], outputs: [15], attributes: []}, + Node {type: "ReduceSum", inputs: [15], outputs: [16], attributes: [{ name: 'axes', type: ints, values: [0]},{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Add", inputs: [10,16], outputs: [17], attributes: []}, + Node {type: "Constant", inputs: [], outputs: [18], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]} + ] + } + + }]} + ] + } + opset_import: [OperatorSetIdProto { domain: }], +} diff --git a/test/onnx/expect/TestOperators.test_acos.expect b/test/onnx/expect/TestOperators.test_acos.expect index 219f74f6b2fcb1..acecfeba820a25 100644 --- a/test/onnx/expect/TestOperators.test_acos.expect +++ b/test/onnx/expect/TestOperators.test_acos.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_broadcast.expect b/test/onnx/expect/TestOperators.test_add_broadcast.expect index 097cd600278781..4062b9ed9fe87b 100644 --- 
a/test/onnx/expect/TestOperators.test_add_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_broadcast.expect @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect index 5b6ade0101acfc..b78cf56b5d4723 100644 --- a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect index edad6530a8083f..181800f7bba55e 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect index 097cd600278781..4062b9ed9fe87b 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect index 832e076c2e3ad4..19557dc56aff51 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_addconstant.expect b/test/onnx/expect/TestOperators.test_addconstant.expect index 6aa04d81a0cbe0..5657e671e47653 100644 --- a/test/onnx/expect/TestOperators.test_addconstant.expect +++ b/test/onnx/expect/TestOperators.test_addconstant.expect @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_addmm.expect b/test/onnx/expect/TestOperators.test_addmm.expect index 972e2b07217a7d..deb017b257f321 100644 --- a/test/onnx/expect/TestOperators.test_addmm.expect +++ b/test/onnx/expect/TestOperators.test_addmm.expect @@ -100,5 +100,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_asin.expect b/test/onnx/expect/TestOperators.test_asin.expect index 2f9438488521da..230374f5c9b857 100644 --- a/test/onnx/expect/TestOperators.test_asin.expect +++ b/test/onnx/expect/TestOperators.test_asin.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_at_op.expect b/test/onnx/expect/TestOperators.test_at_op.expect index 61478b558dcfaf..b3e06b0be8fb9b 100644 --- a/test/onnx/expect/TestOperators.test_at_op.expect +++ b/test/onnx/expect/TestOperators.test_at_op.expect @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_atan.expect b/test/onnx/expect/TestOperators.test_atan.expect index b265c17a1a4a5b..6b5f71786aea5b 100644 --- a/test/onnx/expect/TestOperators.test_atan.expect +++ b/test/onnx/expect/TestOperators.test_atan.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 
} diff --git a/test/onnx/expect/TestOperators.test_basic.expect b/test/onnx/expect/TestOperators.test_basic.expect index 94b569e8f1b925..be9eb4647779a6 100644 --- a/test/onnx/expect/TestOperators.test_basic.expect +++ b/test/onnx/expect/TestOperators.test_basic.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm.expect b/test/onnx/expect/TestOperators.test_batchnorm.expect index 18d9e5d56ac6c2..0ddd04cd86141e 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm.expect @@ -159,5 +159,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect index c4be699a7111ac..88af65817b710b 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect @@ -167,5 +167,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect index d807b30800d398..a96638c7bf4c8f 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect @@ -147,5 +147,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_training.expect b/test/onnx/expect/TestOperators.test_batchnorm_training.expect index 79211b2b3cc3fa..1c53cfe0aa4a93 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_training.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_training.expect @@ -163,5 +163,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_chunk.expect b/test/onnx/expect/TestOperators.test_chunk.expect index ddd8ba28ec6517..cb31cfcbeb7cb9 100644 --- a/test/onnx/expect/TestOperators.test_chunk.expect +++ b/test/onnx/expect/TestOperators.test_chunk.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_clip.expect b/test/onnx/expect/TestOperators.test_clip.expect index 9581da849754a0..6a2d3e544b1e44 100644 --- a/test/onnx/expect/TestOperators.test_clip.expect +++ b/test/onnx/expect/TestOperators.test_clip.expect @@ -52,5 +52,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_clip_max.expect b/test/onnx/expect/TestOperators.test_clip_max.expect index 30160f26432abe..48f3ddc37a4e69 100644 --- a/test/onnx/expect/TestOperators.test_clip_max.expect +++ b/test/onnx/expect/TestOperators.test_clip_max.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_clip_min.expect b/test/onnx/expect/TestOperators.test_clip_min.expect index 015c474ebed9f4..55516c3f8f1aba 100644 --- a/test/onnx/expect/TestOperators.test_clip_min.expect +++ b/test/onnx/expect/TestOperators.test_clip_min.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_concat2.expect b/test/onnx/expect/TestOperators.test_concat2.expect index af84517fcdd867..3efe305e422abd 100644 --- a/test/onnx/expect/TestOperators.test_concat2.expect +++ b/test/onnx/expect/TestOperators.test_concat2.expect @@ -64,5 +64,5 @@ graph { } } 
opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_conv.expect b/test/onnx/expect/TestOperators.test_conv.expect index 9686a55318160e..bc07077b8a9535 100644 --- a/test/onnx/expect/TestOperators.test_conv.expect +++ b/test/onnx/expect/TestOperators.test_conv.expect @@ -117,5 +117,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_convtranspose.expect b/test/onnx/expect/TestOperators.test_convtranspose.expect index cdaf80988335a2..323dd426a4c510 100644 --- a/test/onnx/expect/TestOperators.test_convtranspose.expect +++ b/test/onnx/expect/TestOperators.test_convtranspose.expect @@ -123,5 +123,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_cos.expect b/test/onnx/expect/TestOperators.test_cos.expect index fc4d799ec0e87d..db4ddebecf2e6a 100644 --- a/test/onnx/expect/TestOperators.test_cos.expect +++ b/test/onnx/expect/TestOperators.test_cos.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect index 6bb3b42496c5c8..f80dabe1cdbb56 100644 --- a/test/onnx/expect/TestOperators.test_elu.expect +++ b/test/onnx/expect/TestOperators.test_elu.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index eb66a1149599f6..1577dae0fdd20d 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_exp.expect b/test/onnx/expect/TestOperators.test_exp.expect index 326273b565ac7a..5536e52b300402 100644 --- a/test/onnx/expect/TestOperators.test_exp.expect +++ b/test/onnx/expect/TestOperators.test_exp.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_flatten.expect b/test/onnx/expect/TestOperators.test_flatten.expect index 87075f3aeb1134..f127eebbbee288 100644 --- a/test/onnx/expect/TestOperators.test_flatten.expect +++ b/test/onnx/expect/TestOperators.test_flatten.expect @@ -192,5 +192,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 9e02d7d92e1bd6..63f1f3cc563951 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index a1958aeb7d1f1a..ab107f9e0233c5 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_hardtanh.expect b/test/onnx/expect/TestOperators.test_hardtanh.expect index 9581da849754a0..6a2d3e544b1e44 100644 --- a/test/onnx/expect/TestOperators.test_hardtanh.expect +++ b/test/onnx/expect/TestOperators.test_hardtanh.expect @@ -52,5 +52,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_index.expect 
b/test/onnx/expect/TestOperators.test_index.expect index 6d2a95332cbb94..577244747d9aef 100644 --- a/test/onnx/expect/TestOperators.test_index.expect +++ b/test/onnx/expect/TestOperators.test_index.expect @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index 881c199d949f3c..fb36f3449f2664 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_logsoftmax.expect b/test/onnx/expect/TestOperators.test_logsoftmax.expect index 650662a76685af..fcfa594b8ea881 100644 --- a/test/onnx/expect/TestOperators.test_logsoftmax.expect +++ b/test/onnx/expect/TestOperators.test_logsoftmax.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index 4edf94b2aa4e6c..901aa61666c7df 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_max.expect b/test/onnx/expect/TestOperators.test_max.expect index fda649076cb375..62378fbae6afe6 100644 --- a/test/onnx/expect/TestOperators.test_max.expect +++ b/test/onnx/expect/TestOperators.test_max.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_maxpool.expect b/test/onnx/expect/TestOperators.test_maxpool.expect index e788300b089586..3e7ce68b8d42c8 100644 --- a/test/onnx/expect/TestOperators.test_maxpool.expect +++ b/test/onnx/expect/TestOperators.test_maxpool.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_mean.expect b/test/onnx/expect/TestOperators.test_mean.expect index c575109faa8d86..5476ca2298cba6 100644 --- a/test/onnx/expect/TestOperators.test_mean.expect +++ b/test/onnx/expect/TestOperators.test_mean.expect @@ -47,5 +47,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_min.expect b/test/onnx/expect/TestOperators.test_min.expect index c0a51ca2bb7bf6..7b66f3661bd3d0 100644 --- a/test/onnx/expect/TestOperators.test_min.expect +++ b/test/onnx/expect/TestOperators.test_min.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_mm.expect b/test/onnx/expect/TestOperators.test_mm.expect index 129981ea61a1d6..0dcede4964c410 100644 --- a/test/onnx/expect/TestOperators.test_mm.expect +++ b/test/onnx/expect/TestOperators.test_mm.expect @@ -83,5 +83,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_norm.expect b/test/onnx/expect/TestOperators.test_norm.expect index c53938f94a8fca..6b6ff1846fdeb4 100644 --- a/test/onnx/expect/TestOperators.test_norm.expect +++ b/test/onnx/expect/TestOperators.test_norm.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_pad.expect b/test/onnx/expect/TestOperators.test_pad.expect index f94b67f3616057..343230d4c02a66 100644 --- a/test/onnx/expect/TestOperators.test_pad.expect +++ 
b/test/onnx/expect/TestOperators.test_pad.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_params.expect b/test/onnx/expect/TestOperators.test_params.expect index c4a718f940944e..91b64310a3006c 100644 --- a/test/onnx/expect/TestOperators.test_params.expect +++ b/test/onnx/expect/TestOperators.test_params.expect @@ -87,5 +87,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_permute2.expect b/test/onnx/expect/TestOperators.test_permute2.expect index 4e2e7e19d27b6f..d20f35b1077e04 100644 --- a/test/onnx/expect/TestOperators.test_permute2.expect +++ b/test/onnx/expect/TestOperators.test_permute2.expect @@ -76,5 +76,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_pow.expect b/test/onnx/expect/TestOperators.test_pow.expect index 595c51dae740d8..49bec073317701 100644 --- a/test/onnx/expect/TestOperators.test_pow.expect +++ b/test/onnx/expect/TestOperators.test_pow.expect @@ -77,5 +77,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_prod.expect b/test/onnx/expect/TestOperators.test_prod.expect index 2a01bf76bfbe1b..235dde0d4462de 100644 --- a/test/onnx/expect/TestOperators.test_prod.expect +++ b/test/onnx/expect/TestOperators.test_prod.expect @@ -47,5 +47,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean.expect b/test/onnx/expect/TestOperators.test_reduced_mean.expect index 7449a2a53c6fbb..6c846801887ee8 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect index 45d4eaa4663882..dd4cbb90efbba7 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod.expect b/test/onnx/expect/TestOperators.test_reduced_prod.expect index d835c77b86c209..3dbabb1fcb0890 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect index 138e39ec9b5134..fe612211c7c8cb 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum.expect b/test/onnx/expect/TestOperators.test_reduced_sum.expect index b602edbb2fc451..b42d577db7f534 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect index 
0e4bf5da96be3a..9e238ad24da393 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_repeat.expect b/test/onnx/expect/TestOperators.test_repeat.expect index eb95f268857c5d..d8e9337890f3f1 100644 --- a/test/onnx/expect/TestOperators.test_repeat.expect +++ b/test/onnx/expect/TestOperators.test_repeat.expect @@ -68,5 +68,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index 33b15851531c09..235a5a13d42d36 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -81,5 +81,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_rsub.expect b/test/onnx/expect/TestOperators.test_rsub.expect index 9e6c112c8768c4..49fa976a5e1bd6 100644 --- a/test/onnx/expect/TestOperators.test_rsub.expect +++ b/test/onnx/expect/TestOperators.test_rsub.expect @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_selu.expect b/test/onnx/expect/TestOperators.test_selu.expect index 92af508521a538..117d25a0d19192 100644 --- a/test/onnx/expect/TestOperators.test_selu.expect +++ b/test/onnx/expect/TestOperators.test_selu.expect @@ -54,5 +54,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_sin.expect b/test/onnx/expect/TestOperators.test_sin.expect index a6d733d7b7e031..32ceff124dde7d 100644 --- a/test/onnx/expect/TestOperators.test_sin.expect +++ b/test/onnx/expect/TestOperators.test_sin.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_slice.expect b/test/onnx/expect/TestOperators.test_slice.expect index 5965137e8de6ec..0e24cafa6e5c97 100644 --- a/test/onnx/expect/TestOperators.test_slice.expect +++ b/test/onnx/expect/TestOperators.test_slice.expect @@ -77,5 +77,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_sqrt.expect b/test/onnx/expect/TestOperators.test_sqrt.expect index 5b56bea7eb24b3..5344a0aff20c46 100644 --- a/test/onnx/expect/TestOperators.test_sqrt.expect +++ b/test/onnx/expect/TestOperators.test_sqrt.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_sum.expect b/test/onnx/expect/TestOperators.test_sum.expect index dff021eb26af16..caba3c16ba70a0 100644 --- a/test/onnx/expect/TestOperators.test_sum.expect +++ b/test/onnx/expect/TestOperators.test_sum.expect @@ -47,5 +47,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_symbolic_override.expect b/test/onnx/expect/TestOperators.test_symbolic_override.expect index f4e6965a6cafbb..6ccb4ff03f8d5d 100644 --- a/test/onnx/expect/TestOperators.test_symbolic_override.expect +++ b/test/onnx/expect/TestOperators.test_symbolic_override.expect @@ -99,5 +99,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect b/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect index 
757609e63740f2..6bec0381a55fa0 100644 --- a/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect +++ b/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect @@ -100,5 +100,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_tan.expect b/test/onnx/expect/TestOperators.test_tan.expect index 58b7ff689b9930..e36c718bb00bc4 100644 --- a/test/onnx/expect/TestOperators.test_tan.expect +++ b/test/onnx/expect/TestOperators.test_tan.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_transpose.expect b/test/onnx/expect/TestOperators.test_transpose.expect index f2c634daebf589..394bb253f4e3f6 100644 --- a/test/onnx/expect/TestOperators.test_transpose.expect +++ b/test/onnx/expect/TestOperators.test_transpose.expect @@ -37,5 +37,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_type_as.expect b/test/onnx/expect/TestOperators.test_type_as.expect index 5107617395e3cb..bdc6dd44a3f97b 100644 --- a/test/onnx/expect/TestOperators.test_type_as.expect +++ b/test/onnx/expect/TestOperators.test_type_as.expect @@ -31,5 +31,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect index 3a8e01092f8d0b..8ecb4314d347f0 100644 --- a/test/onnx/expect/TestOperators.test_unsqueeze.expect +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -50,5 +50,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_upsample.expect b/test/onnx/expect/TestOperators.test_upsample.expect index 8e7a6a8fa1f7a7..7cc176ff57b8dc 100644 --- a/test/onnx/expect/TestOperators.test_upsample.expect +++ b/test/onnx/expect/TestOperators.test_upsample.expect @@ -67,5 +67,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_view.expect b/test/onnx/expect/TestOperators.test_view.expect index 1354b4271067af..be69ffb4cdab95 100644 --- a/test/onnx/expect/TestOperators.test_view.expect +++ b/test/onnx/expect/TestOperators.test_view.expect @@ -44,5 +44,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 1a9e9cdbff1442..a889e7b1fa0f89 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -127,13 +127,14 @@ def convert_cuda(self, model, input): return cuda_model, cuda_input def run_debug_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True): + input=None, use_gpu=True, example_outputs=None): """ # TODO: remove this from the final release version This test is for our debugging only for the case where embed_params=False """ - model.train(train) + if not isinstance(model, torch.jit.ScriptModule): + model.train(train) if state_dict is not None: model.load_state_dict(state_dict) @@ -144,7 +145,8 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, if use_gpu: model, input = self.convert_cuda(model, input) - onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False) + onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False, + example_outputs=example_outputs) if isinstance(torch_out, torch.autograd.Variable): torch_out = (torch_out,) @@ -153,12 +155,14 
@@ def run_debug_test(self, model, train, batch_size, state_dict=None, np.testing.assert_almost_equal(x.data.cpu().numpy(), y, decimal=3) def run_actual_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7): + input=None, use_gpu=True, rtol=0.001, atol=1e-7, + example_outputs=None): """ This is what the user facing version will look like """ # set the training/test mode for the model - model.train(train) + if not isinstance(model, torch.jit.ScriptModule): + model.train(train) # use the pre-trained model params if available if state_dict is not None: model.load_state_dict(state_dict) @@ -175,14 +179,16 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, verify.verify(model, input, c2, rtol=rtol, atol=atol) def run_model_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7): + input=None, use_gpu=True, rtol=0.001, atol=1e-7, + example_outputs=None): use_gpu_ = torch.cuda.is_available() and use_gpu if self.embed_params: self.run_actual_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, rtol=rtol, atol=atol) + use_gpu=use_gpu_, rtol=rtol, atol=atol, + example_outputs=example_outputs) else: self.run_debug_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_) + use_gpu=use_gpu_, example_outputs=example_outputs) def test_linear(self): model = nn.Linear(1, 1) @@ -346,11 +352,11 @@ def test_rnn_init_predict_split(self): mp = onnx.ModelProto.FromString(do_export(model, input, export_params=self.embed_params)[0]) prepared = c2.prepare(mp, device='CPU') if self.embed_params: - assert len(prepared.init_net.op) == 1019 - assert len(prepared.predict_net.op) == 142 + assert len(prepared.init_net.op) == 875 + assert len(prepared.predict_net.op) == 130 else: assert len(prepared.init_net.op) == 8 - assert len(prepared.predict_net.op) == 1153 + assert len(prepared.predict_net.op) == 997 def test_alexnet(self): state_dict = model_zoo.load_url(model_urls['alexnet'], progress=False) @@ -891,6 +897,28 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + def test_dynamic_slice(self): + class DynamicSliceExportMod(torch.nn.Module): + def forward(self, x): + results = [] + for i in range(4): + results.append(x[:x.size(0) - i, i:x.size(2), i:3]) + return tuple(results) + + x = torch.rand(5, 5, 5) + self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + + def test_dynamic_slice_to_the_end(self): + class DynamicSliceExportMod(torch.nn.Module): + def forward(self, x): + results = [] + for i in range(4): + results.append(x[:, i:, x.size(2) - 5]) + return tuple(results) + + x = torch.rand(5, 5, 5) + self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + # a bit of metaprogramming to set up all the rnn tests diff --git a/test/test_jit.py b/test/test_jit.py index f6a4347eae86cb..54bc33e8542e43 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6870,6 +6870,25 @@ def foo(x): self.assertEqual(foo(input), input) + def test_export_dynamic_slice(self): + class DynamicSliceExportMod(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + retval = x[0] + for i in range(x.size(1)): + retval += torch.sum(x[0:i], dim=0) + return retval + + mod = DynamicSliceExportMod() + + input = torch.rand(3, 4, 5) + example_outs = mod(input) + + f = 
io.BytesIO() + exported = torch.onnx.export_to_pretty_string( + DynamicSliceExportMod(), (input,), f, example_outputs=example_outs) + self.assertExpected(exported) + class MnistNet(nn.Module): def __init__(self): diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 30e8672be601fd..d5b586c3846d6f 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -151,7 +151,7 @@ def _unimplemented(op, msg): # increasing this number. This includes symbolic definitions NOT in this # file, so grep for "OpName" (with quotes) -_onnx_opset_version = 7 +_onnx_opset_version = 9 # --------------------------------------------------------------------- @@ -981,11 +981,21 @@ def full_like(g, input, fill_value): return add(g, zeros_like(g, input), fill_value, g.op("Constant", value_t=torch.tensor(1))) -@parse_args('v', 'i', 'i', 'i', 'i') +@parse_args('v', 'v', 'v', 'v', 'i') def slice(g, self, dim, start, end, step): if step != 1: _unimplemented("slice", "step!=1 is currently not supported") - return g.op("Slice", self, axes_i=[dim], starts_i=[start], ends_i=[end]) + if start.node().kind() != 'onnx::Constant' or \ + end.node().kind() != 'onnx::Constant' or dim.node().kind() != 'onnx::Constant': + start_unsqueezed = g.op("Unsqueeze", start, axes_i=[0]) + end_unsqueezed = g.op("Unsqueeze", end, axes_i=[0]) + dim_unsqueezed = g.op("Unsqueeze", dim, axes_i=[0]) + return g.op("DynamicSlice", self, start_unsqueezed, end_unsqueezed, dim_unsqueezed) + else: + start = _parse_arg(start, 'i') + end = _parse_arg(end, 'i') + dim = _parse_arg(dim, 'i') + return g.op("Slice", self, axes_i=[dim], starts_i=[start], ends_i=[end]) @parse_args('v', 'f', 'f') From 90537289a04ef5d572496240e2ac3a881be518d2 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 13 Sep 2018 12:41:11 -0700 Subject: [PATCH 129/237] Constexpr std::move / std::forward for C++11 (#11396) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11396 std::move and std::forward in C++11 aren't constexpr (they are in C++14). This caused a build issue orionr was working on. 
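As an aside, a minimal sketch (not part of this patch; the helper name below is illustrative, the patch's real implementations live under c10::guts) of the C++11 limitation being worked around:

```
// C++11: std::move is not declared constexpr, so a constexpr function that
// calls it cannot be used in a constant expression. A hand-rolled cast-based
// "move" sidesteps that; the patch adds such helpers under c10::guts.
#include <type_traits>

template <class T>
constexpr typename std::remove_reference<T>::type&& constexpr_move(T&& t) noexcept {
  return static_cast<typename std::remove_reference<T>::type&&>(t);
}

// Usable in constant expressions even under -std=c++11.
constexpr int pass_through(int x) {
  return constexpr_move(x);
}

static_assert(pass_through(42) == 42, "evaluated at compile time");
```

This is also why the Array_test.cpp hunk below can turn the gtest ASSERT_EQ checks for prepend into static_asserts.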
It should be fixed by this diff Reviewed By: orionr Differential Revision: D9724805 fbshipit-source-id: 0d9047dce611385d659cc71a6c04cc7a6a40a5ae --- aten/src/ATen/core/C++17.h | 82 ++++++++++++++++++++++++------------- caffe2/utils/Array.h | 6 +-- caffe2/utils/Array_test.cpp | 8 ++-- 3 files changed, 60 insertions(+), 36 deletions(-) diff --git a/aten/src/ATen/core/C++17.h b/aten/src/ATen/core/C++17.h index 5964a9dea36b2f..ac5b3022ed5ff1 100644 --- a/aten/src/ATen/core/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -12,6 +12,53 @@ namespace c10 { namespace guts { + + +#ifdef __cpp_lib_transformation_trait_aliases +template using conditional_t = std::conditional_t; +template using enable_if_t = std::enable_if_t; +template using add_lvalue_reference_t = std::add_lvalue_reference_t; +template using remove_reference_t = std::remove_reference_t; +template using remove_cv_t = std::remove_cv_t; +template using result_of_t = std::result_of_t; +template using decay_t = std::decay_t; +template using remove_const_t = std::remove_const_t; +template using remove_pointer_t = std::remove_pointer_t; +#else +template using conditional_t = typename std::conditional::type; +template using enable_if_t = typename std::enable_if::type; +template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; +template using remove_reference_t = typename std::remove_reference::type; +template using remove_cv_t = typename std::remove_cv::type; +template using result_of_t = typename std::result_of::type; +template using decay_t = typename std::decay::type; +template using remove_const_t = typename std::remove_const::type; +template using remove_pointer_t = typename std::remove_pointer::type; +#endif + + + + +// C++11 doesn't have constexpr std::move / std::forward. +// Implementation taken from libc++. +template +constexpr inline guts::remove_reference_t&& move(T&& t) noexcept { + return static_cast&&>(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t& t) noexcept { + return static_cast(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, + "can not forward an rvalue as an lvalue."); + return static_cast(t); +} + + + + #if __cplusplus >= 201402L || defined(__cpp_lib_make_unique) && __cpp_lib_make_unique >= 201304L || \ (defined(__ANDROID__) && __ANDROID__ && __cplusplus >= 201300L) || defined(_MSC_VER) && _MSC_VER >= 1900 @@ -23,7 +70,7 @@ namespace c10 { namespace guts { template typename std::enable_if::value, std::unique_ptr>::type make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); + return std::unique_ptr(new T(forward(args)...)); } // Allows 'make_unique(10)'. (N3690 s20.9.1.4 p3-4) template @@ -39,6 +86,7 @@ make_unique(Args&&...) 
= delete; #endif + #ifdef __cpp_lib_integer_sequence template using integer_sequence = std::integer_sequence; @@ -73,28 +121,6 @@ template using index_sequence_for = make_index_sequence using conditional_t = std::conditional_t; -template using enable_if_t = std::enable_if_t; -template using add_lvalue_reference_t = std::add_lvalue_reference_t; -template using remove_reference_t = std::remove_reference_t; -template using remove_cv_t = std::remove_cv_t; -template using result_of_t = std::result_of_t; -template using decay_t = std::decay_t; -template using remove_const_t = std::remove_const_t; -template using remove_pointer_t = std::remove_pointer_t; -#else -template using conditional_t = typename std::conditional::type; -template using enable_if_t = typename std::enable_if::type; -template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; -template using remove_reference_t = typename std::remove_reference::type; -template using remove_cv_t = typename std::remove_cv::type; -template using result_of_t = typename std::result_of::type; -template using decay_t = typename std::decay::type; -template using remove_const_t = typename std::remove_const::type; -template using remove_pointer_t = typename std::remove_pointer::type; -#endif - #ifdef __cpp_lib_logical_traits @@ -155,7 +181,7 @@ template using void_t = typename make_void::type; template inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { - return std::apply(std::forward(f), std::forward(t)); + return std::apply(forward(f), forward(t)); } #else @@ -164,19 +190,19 @@ inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { // TODO This is an incomplete implementation of std::apply, not working for member functions. namespace detail { template -constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(std::forward(f)(std::get(std::forward(t))...)) +constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(forward(f)(std::get(forward(t))...)) { - return std::forward(f)(std::get(std::forward(t))...); + return forward(f)(std::get(forward(t))...); } } // namespace detail template constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{})) { return detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{}); } diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index 55c4a6a6e50af7..3217198dd8cdf3 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -259,7 +259,7 @@ template constexpr _Tp&& get(array<_Tp, _Nm>&& __arr) noexcept { static_assert(_Int < _Nm, "array index is within bounds"); - return std::move(get<_Int>(__arr)); + return guts::move(get<_Int>(__arr)); } template @@ -292,12 +292,12 @@ constexpr inline array tail(const array& arg) { namespace detail { template constexpr inline array prepend_(T&& head, const array& tail, guts::index_sequence) { - return {{std::forward(head), get(tail)...}}; + return {{guts::forward(head), get(tail)...}}; } } template constexpr inline array prepend(T&& head, const array& tail) { - return detail::prepend_(std::forward(head), tail, guts::make_index_sequence()); + return detail::prepend_(guts::forward(head), tail, guts::make_index_sequence()); } /** diff --git a/caffe2/utils/Array_test.cpp b/caffe2/utils/Array_test.cpp index 1d8c290b8a2249..1f3171ebe88eb0 100644 --- a/caffe2/utils/Array_test.cpp +++ b/caffe2/utils/Array_test.cpp @@ -78,11 +78,9 @@ namespace 
test_tail { static_assert(array < int, 0 > {{}} == tail(array < int, 1 > {{3}}), ""); } -TEST(ArrayTest, TestPrepend) { - // Some compilers can't handle move results as constexpr, so use - // gtest assert for this test - ASSERT_EQ((array {{2, 3, 4}}), (prepend(2, array {{3, 4}}))); - ASSERT_EQ((array {{3}}), (prepend(3, array {{}}))); +namespace test_prepend { + static_assert(array < int, 3 > {{2, 3, 4}} == prepend(2, array < int, 2 > {{3, 4}}), ""); + static_assert(array < int, 1 > {{3}} == prepend(3, array < int, 0 > {{}}), ""); } namespace test_to_std_array { From f129da1a47274615bce094e949b77c8791de9a3b Mon Sep 17 00:00:00 2001 From: Marc Ferradou Date: Thu, 13 Sep 2018 14:38:17 -0700 Subject: [PATCH 130/237] Add max to the ValueError for EmbeddingBag mode check (#11655) Summary: Related to #11624 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11655 Differential Revision: D9815454 Pulled By: SsnL fbshipit-source-id: 8dd82e0c0aa68362e12b301e095a85af7d7fd71a --- torch/nn/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 690112424db560..243974e152bb27 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1316,7 +1316,7 @@ def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2, raise ValueError("max mode does not support sparse weights") else: - raise ValueError("mode has to be one of sum or mean") + raise ValueError("mode has to be one of sum, mean or max") if max_norm is not None: with torch.no_grad(): From 29e29ca6ee82ae40da6142faeba3bb6f78a17b63 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Thu, 13 Sep 2018 14:53:23 -0700 Subject: [PATCH 131/237] Use MPI_Isend/MPI_Irecv to back send/recv (#11630) Summary: The isCompleted function is changed to be non-const to accommodate setting some internal status on the work object in the case of completion. Previously, it was only checking a member field, but for the MPI backend it calls MPI_Test to poll for completion of an asynchronous request. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11630 Reviewed By: SsnL Differential Revision: D9808008 Pulled By: pietern fbshipit-source-id: 18b70825b1fb4d561a552fa75e9475a522852cd4 --- torch/lib/c10d/ProcessGroup.hpp | 2 +- torch/lib/c10d/ProcessGroupGloo.cpp | 6 +- torch/lib/c10d/ProcessGroupGloo.hpp | 6 +- torch/lib/c10d/ProcessGroupMPI.cpp | 175 ++++++++++++++++++++-------- torch/lib/c10d/ProcessGroupMPI.hpp | 24 +++- torch/lib/c10d/ProcessGroupNCCL.cpp | 2 +- torch/lib/c10d/ProcessGroupNCCL.hpp | 2 +- 7 files changed, 156 insertions(+), 61 deletions(-) diff --git a/torch/lib/c10d/ProcessGroup.hpp b/torch/lib/c10d/ProcessGroup.hpp index f312ed779e43ae..2ac2948457a7f6 100644 --- a/torch/lib/c10d/ProcessGroup.hpp +++ b/torch/lib/c10d/ProcessGroup.hpp @@ -38,7 +38,7 @@ class ProcessGroup { virtual ~Work(); // Checks if request has completed. Non-blocking operation. - virtual bool isCompleted() const = 0; + virtual bool isCompleted() = 0; // Returns if the work completed successfully. // If false, the exception function can be called to get details.
diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 124b4d4a9d0cc4..e110a78001c134 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -145,7 +145,7 @@ ProcessGroupGloo::WorkGloo::WorkGloo() ProcessGroupGloo::WorkGloo::~WorkGloo() {} -bool ProcessGroupGloo::WorkGloo::isCompleted() const { +bool ProcessGroupGloo::WorkGloo::isCompleted() { return completed_; } @@ -222,7 +222,7 @@ ProcessGroupGloo::SendWork::SendWork( std::unique_ptr<::gloo::transport::UnboundBuffer> buffer) : tensor_(tensor), buffer_(std::move(buffer)) {} -bool ProcessGroupGloo::SendWork::isCompleted() const { +bool ProcessGroupGloo::SendWork::isCompleted() { // No way to poll for completion yet return true; } @@ -252,7 +252,7 @@ ProcessGroupGloo::RecvWork::RecvWork( int* srcRank) : tensor_(tensor), buffer_(std::move(buffer)), srcRank_(srcRank) {} -bool ProcessGroupGloo::RecvWork::isCompleted() const { +bool ProcessGroupGloo::RecvWork::isCompleted() { // No way to poll for completion yet return true; } diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index 0fe69250b31528..e78b1e36f8b94e 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -177,7 +177,7 @@ class ProcessGroupGloo : public ProcessGroup { explicit WorkGloo(); virtual ~WorkGloo(); - bool isCompleted() const override; + bool isCompleted() override; bool isSuccess() const override; void synchronize() override; bool wait() override; @@ -235,7 +235,7 @@ class ProcessGroupGloo : public ProcessGroup { virtual ~SendWork() = default; - bool isCompleted() const override; + bool isCompleted() override; bool isSuccess() const override; @@ -259,7 +259,7 @@ class ProcessGroupGloo : public ProcessGroup { virtual ~RecvWork() = default; - bool isCompleted() const override; + bool isCompleted() override; bool isSuccess() const override; diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 31578089677acd..39e5e5b609c1e8 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -93,7 +93,7 @@ ProcessGroupMPI::WorkMPI::WorkMPI() : completed_(false) {} ProcessGroupMPI::WorkMPI::~WorkMPI() {} -bool ProcessGroupMPI::WorkMPI::isCompleted() const { +bool ProcessGroupMPI::WorkMPI::isCompleted() { return completed_; } @@ -137,6 +137,78 @@ const std::exception& ProcessGroupMPI::WorkMPI::exception() const { } } +ProcessGroupMPI::AsyncWork::AsyncWork( + at::Tensor tensor, + MPI_Request request, + int* srcRank) + : tensor_(std::move(tensor)), request_(request), srcRank_(srcRank) { + memset(&status_, 0, sizeof(status_)); +} + +ProcessGroupMPI::AsyncWork::~AsyncWork() { + if (request_ != MPI_REQUEST_NULL) { + throw std::runtime_error( + "Attempted destruction of AsyncWork before work has completed"); + } +} + +bool ProcessGroupMPI::AsyncWork::isCompleted() { + if (request_ == MPI_REQUEST_NULL) { + return true; + } + + std::unique_lock globalLock(pgGlobalMutex_); + int flag = 0; + MPI_CHECK(MPI_Test(&request_, &flag, &status_)); + if (request_ != MPI_REQUEST_NULL) { + return false; + } + + // request_ == MPI_REQUEST_NULL; the work has completed + if (srcRank_ != nullptr) { + *srcRank_ = status_.MPI_SOURCE; + } + + return true; +} + +bool ProcessGroupMPI::AsyncWork::isSuccess() const { + if (request_ != MPI_REQUEST_NULL) { + throw std::runtime_error( + "Invalid call to AsyncWork::isSuccess before work has completed"); + } + + return status_.MPI_ERROR == MPI_SUCCESS; 
+} + +void ProcessGroupMPI::AsyncWork::synchronize() {} + +bool ProcessGroupMPI::AsyncWork::wait() { + if (request_ == MPI_REQUEST_NULL) { + return true; + } + + std::unique_lock globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Wait(&request_, &status_)); + if (srcRank_ != nullptr && status_.MPI_ERROR == MPI_SUCCESS) { + *srcRank_ = status_.MPI_SOURCE; + } + + return status_.MPI_ERROR == MPI_SUCCESS; +} + +const std::exception& ProcessGroupMPI::AsyncWork::exception() const { + if (request_ != MPI_REQUEST_NULL) { + throw std::runtime_error( + "Invalid call to AsyncWork::exception before work has completed"); + } + + std::array buf; + int len = buf.size(); + MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len)); + return std::runtime_error(std::string(buf.data(), len)); +} + // Static global states int ProcessGroupMPI::numProcessGroups_ = 0; int ProcessGroupMPI::mpiThreadSupport_ = 0; @@ -581,23 +653,25 @@ std::shared_ptr ProcessGroupMPI::send( if (pgComm_ == MPI_COMM_NULL) { return nullptr; } + checkSingleTensor(tensors); - std::function&)> runFunc = - [dstRank, this](std::unique_ptr& entry) { - auto data = (entry->src)[0]; - std::unique_lock globalLock(pgGlobalMutex_); - MPI_CHECK(MPI_Send( - data.data_ptr(), - data.numel(), - mpiDatatype.at(data.type().scalarType()), - dstRank, - 0, - pgComm_)); - }; - auto entry = std::unique_ptr( - new WorkEntry(&tensors, nullptr, std::move(runFunc))); - return enqueue(std::move(entry)); + auto& tensor = tensors[0]; + MPI_Request request = MPI_REQUEST_NULL; + + { + std::unique_lock globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Isend( + tensor.data_ptr(), + tensor.numel(), + mpiDatatype.at(tensor.type().scalarType()), + dstRank, + 0, + pgComm_, + &request)); + } + + return std::make_shared(tensor, request); } std::shared_ptr ProcessGroupMPI::recv( @@ -606,24 +680,25 @@ std::shared_ptr ProcessGroupMPI::recv( if (pgComm_ == MPI_COMM_NULL) { return nullptr; } + checkSingleTensor(tensors); - std::function&)> runFunc = - [srcRank, this](std::unique_ptr& entry) { - auto data = (entry->src)[0]; - std::unique_lock globalLock(pgGlobalMutex_); - MPI_CHECK(MPI_Recv( - data.data_ptr(), - data.numel(), - mpiDatatype.at(data.type().scalarType()), - srcRank, - 0, - pgComm_, - MPI_STATUS_IGNORE)); - }; - auto entry = std::unique_ptr( - new WorkEntry(&tensors, nullptr, std::move(runFunc))); - return enqueue(std::move(entry)); + auto& tensor = tensors[0]; + MPI_Request request = MPI_REQUEST_NULL; + + { + std::unique_lock globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Irecv( + tensor.data_ptr(), + tensor.numel(), + mpiDatatype.at(tensor.type().scalarType()), + srcRank, + 0, + pgComm_, + &request)); + } + + return std::make_shared(tensor, request); } std::shared_ptr ProcessGroupMPI::recvAnysource( @@ -632,27 +707,25 @@ std::shared_ptr ProcessGroupMPI::recvAnysource( if (pgComm_ == MPI_COMM_NULL) { return nullptr; } + checkSingleTensor(tensors); - std::function&)> runFunc = - [srcRank, this](std::unique_ptr& entry) { - auto data = (entry->src)[0]; - MPI_Status status; - std::unique_lock globalLock(pgGlobalMutex_); - MPI_CHECK(MPI_Recv( - data.data_ptr(), - data.numel(), - mpiDatatype.at(data.type().scalarType()), - MPI_ANY_SOURCE, - 0, - pgComm_, - &status)); - *(entry->srcRank) = status.MPI_SOURCE; - }; - auto entry = std::unique_ptr( - new WorkEntry(&tensors, nullptr, std::move(runFunc))); - entry->srcRank = srcRank; - return enqueue(std::move(entry)); + auto& tensor = tensors[0]; + MPI_Request request = MPI_REQUEST_NULL; + + { + std::unique_lock 
globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Irecv( + tensor.data_ptr(), + tensor.numel(), + mpiDatatype.at(tensor.type().scalarType()), + MPI_ANY_SOURCE, + 0, + pgComm_, + &request)); + } + + return std::make_shared(tensor, request, srcRank); } std::shared_ptr ProcessGroupMPI::barrier() { diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index b7989aeba69e32..4a72570759ce91 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -78,7 +78,7 @@ class ProcessGroupMPI : public ProcessGroup { virtual ~WorkMPI(); // Checks if request has completed. Non-blocking operation. - bool isCompleted() const override; + bool isCompleted() override; // Returns if the work completed successfully // if false, the exception function can be called to get details. @@ -107,6 +107,28 @@ class ProcessGroupMPI : public ProcessGroup { friend class ProcessGroupMPI; }; + class AsyncWork : public ProcessGroup::Work { + public: + AsyncWork(at::Tensor tensor, MPI_Request request, int* srcRank = nullptr); + virtual ~AsyncWork(); + + bool isCompleted() override; + + bool isSuccess() const override; + + void synchronize() override; + + bool wait() override; + + const std::exception& exception() const override; + + protected: + at::Tensor tensor_; + MPI_Request request_; + int* const srcRank_; + MPI_Status status_; + }; + // Constructor will spawn up the worker thread loop explicit ProcessGroupMPI(int rank, int size, MPI_Comm pgComm); diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index a28757ede24c3e..157a9917f6803e 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -100,7 +100,7 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL(const std::vector& devices) ProcessGroupNCCL::WorkNCCL::~WorkNCCL() {} // Check if the NCCL kernels are queued on the GPUs -bool ProcessGroupNCCL::WorkNCCL::isCompleted() const { +bool ProcessGroupNCCL::WorkNCCL::isCompleted() { return true; } diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 282c679f0dd1e8..d3862fc044321d 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -66,7 +66,7 @@ class ProcessGroupNCCL : public ProcessGroup { // Checks if request has completed. In this specific case of NCCL, it checks // if the NCCL operation has completed on the GPU in its own NCCL stream. // Non-blocking operation. - bool isCompleted() const override; + bool isCompleted() override; // Let current THC stream wait on the completing of the NCCL work // always return true and will throw if there are exceptions From 05e06f7de2c84b8dd770c2f5008f5b7620e97d39 Mon Sep 17 00:00:00 2001 From: Jeff Smith Date: Thu, 13 Sep 2018 15:04:43 -0700 Subject: [PATCH 132/237] migrating deprecated calls without abc module for containers (#11515) Summary: Implementing #10540. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11515 Reviewed By: apaszke Differential Revision: D9771045 Pulled By: jeffreyksmithjr fbshipit-source-id: 85ea39abaa9b465805a969f122b626b11fc85ef6 --- caffe2/python/compatibility.py | 8 ++++++++ caffe2/python/onnx/backend.py | 3 ++- caffe2/python/onnx/frontend.py | 4 ++-- caffe2/python/utils.py | 4 ++-- test/test_legacy_nn.py | 4 ++-- torch/_six.py | 7 +++++++ torch/autograd/gradcheck.py | 8 ++++---- torch/jit/__init__.py | 2 +- torch/nn/modules/container.py | 19 ++++++++++--------- torch/nn/modules/utils.py | 4 ++-- torch/onnx/utils.py | 4 ++-- torch/optim/optimizer.py | 5 +++-- torch/utils/data/dataloader.py | 10 +++++----- 13 files changed, 50 insertions(+), 32 deletions(-) create mode 100644 caffe2/python/compatibility.py diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py new file mode 100644 index 00000000000000..9d615a30833371 --- /dev/null +++ b/caffe2/python/compatibility.py @@ -0,0 +1,8 @@ +from six import PY2, PY3 + +if PY2: + import collections + container_abcs = collections +elif PY3: + import collections.abc + container_abcs = collections.abc diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 3d9239c8b5c92b..7eacaf327ad264 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -25,6 +25,7 @@ import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell +from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils @@ -778,7 +779,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, collections.Iterable): + if not isinstance(ops, container_abcs.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 5fd470c932ac59..379ef65af904a6 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -12,11 +12,11 @@ from __future__ import unicode_literals import itertools -import collections import logging import re from caffe2.python import core as caffe2_core +from caffe2.python.compatibility import container_abcs from caffe2.proto import caffe2_legacy_pb2 from enum import Enum from onnx import (defs, checker, helper, numpy_helper, mapping, @@ -156,7 +156,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, collections.Iterable): + if not isinstance(nodes, container_abcs.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 75124add41cecd..5e87df8058e017 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,13 +6,13 @@ from __future__ import unicode_literals from caffe2.proto import caffe2_pb2 +from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys import copy -import collections import functools import numpy as np from six import integer_types, binary_type, text_type, string_types @@ -120,7 +120,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name 
= key - iterable = isinstance(value, collections.Iterable) + iterable = isinstance(value, container_abcs.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. The entire array is guaranteed to have the same diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index b446920c4fec65..f3a807a0a6d640 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -1,10 +1,10 @@ import math import random import unittest -import collections from copy import deepcopy import torch +from torch._six import container_abcs import torch.legacy.nn as nn from common import to_gpu, freeze_rng_state, run_tests, skipIfRocm, TEST_WITH_ROCM from common_nn import NNTestCase, ModuleTest, CriterionTest, iter_tensors, \ @@ -701,7 +701,7 @@ def require_grad(input): input = input.detach() input.requires_grad = True return input - elif isinstance(input, collections.Iterable): + elif isinstance(input, container_abcs.Iterable): return type(input)(require_grad(e) for e in input) return input diff --git a/torch/_six.py b/torch/_six.py index 1d70df51830d5e..84ba9a464891bb 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -108,3 +108,10 @@ def exec_(_code_, _globs_=None, _locs_=None): else: def raise_from(value, from_value): raise value + +if PY2: + import collections + container_abcs = collections +elif PY3: + import collections.abc + container_abcs = collections.abc diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 2cc4ebbfacd4a3..26dc9daf4a7350 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -1,5 +1,5 @@ import torch -from collections import Iterable +from torch._six import container_abcs import torch.testing import sys from itertools import product @@ -11,7 +11,7 @@ def zero_gradients(x): if x.grad is not None: x.grad.detach_() x.grad.data.zero_() - elif isinstance(x, Iterable): + elif isinstance(x, container_abcs.Iterable): for elem in x: zero_gradients(elem) @@ -23,7 +23,7 @@ def make_jacobian(input, num_out): if not input.requires_grad: return None return torch.zeros(input.nelement(), num_out, dtype=input.dtype) - elif isinstance(input, Iterable): + elif isinstance(input, container_abcs.Iterable): jacobians = list(filter( lambda x: x is not None, (make_jacobian(elem, num_out) for elem in input))) if not jacobians: @@ -37,7 +37,7 @@ def iter_tensors(x, only_requiring_grad=False): if isinstance(x, torch.Tensor): if x.requires_grad or not only_requiring_grad: yield x - elif isinstance(x, Iterable): + elif isinstance(x, container_abcs.Iterable): for elem in x: for result in iter_tensors(elem, only_requiring_grad): yield result diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index b32edf715a6c00..e4543a6be12b87 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -6,7 +6,7 @@ import torch.jit.annotations from torch._six import raise_from, with_metaclass import torch.testing -from collections import defaultdict, OrderedDict, namedtuple, Iterable +from collections import defaultdict, OrderedDict, namedtuple import sys import warnings import itertools diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index ef79b1ffb6a289..01e12e621ba4ac 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -1,5 +1,6 @@ import warnings -from collections import OrderedDict, Iterable, Mapping +from collections import OrderedDict +from torch._six import container_abcs from itertools import islice import operator @@ -178,7 +179,7 @@ def 
extend(self, modules): Arguments: modules (iterable): iterable of modules to append """ - if not isinstance(modules, Iterable): + if not isinstance(modules, container_abcs.Iterable): raise TypeError("ModuleList.extend should be called with an " "iterable, but got " + type(modules).__name__) offset = len(self) @@ -278,12 +279,12 @@ def update(self, modules): modules (iterable): a mapping (dictionary) of (string: :class:`~torch.nn.Module``) or an iterable of key/value pairs of type (string, :class:`~torch.nn.Module``) """ - if not isinstance(modules, Iterable): + if not isinstance(modules, container_abcs.Iterable): raise TypeError("ModuleDict.update should be called with an " "iterable of key/value pairs, but got " + type(modules).__name__) - if isinstance(modules, Mapping): + if isinstance(modules, container_abcs.Mapping): if isinstance(modules, OrderedDict): for key, module in modules.items(): self[key] = module @@ -292,7 +293,7 @@ def update(self, modules): self[key] = module else: for j, m in enumerate(modules): - if not isinstance(m, Iterable): + if not isinstance(m, container_abcs.Iterable): raise TypeError("ModuleDict update sequence element " "#" + str(j) + " should be Iterable; is" + type(m).__name__) @@ -375,7 +376,7 @@ def extend(self, parameters): Arguments: parameters (iterable): iterable of parameters to append """ - if not isinstance(parameters, Iterable): + if not isinstance(parameters, container_abcs.Iterable): raise TypeError("ParameterList.extend should be called with an " "iterable, but got " + type(parameters).__name__) offset = len(self) @@ -483,12 +484,12 @@ def update(self, parameters): (string : :class:`~torch.nn.Parameter`) or an iterable of key/value pairs of type (string, :class:`~torch.nn.Parameter`) """ - if not isinstance(parameters, Iterable): + if not isinstance(parameters, container_abcs.Iterable): raise TypeError("ParametersDict.update should be called with an " "iterable of key/value pairs, but got " + type(parameters).__name__) - if isinstance(parameters, Mapping): + if isinstance(parameters, container_abcs.Mapping): if isinstance(parameters, OrderedDict): for key, parameter in parameters.items(): self[key] = parameter @@ -497,7 +498,7 @@ def update(self, parameters): self[key] = parameter else: for j, p in enumerate(parameters): - if not isinstance(p, Iterable): + if not isinstance(p, container_abcs.Iterable): raise TypeError("ParameterDict update sequence element " "#" + str(j) + " should be Iterable; is" + type(p).__name__) diff --git a/torch/nn/modules/utils.py b/torch/nn/modules/utils.py index 3cff6a9e9ffba9..2b8ebd642b000a 100644 --- a/torch/nn/modules/utils.py +++ b/torch/nn/modules/utils.py @@ -1,10 +1,10 @@ -import collections +from torch._six import container_abcs from itertools import repeat def _ntuple(n): def parse(x): - if isinstance(x, collections.Iterable): + if isinstance(x, container_abcs.Iterable): return x return tuple(repeat(x, n)) return parse diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 963e0bc9591255..d027267053052a 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -9,7 +9,7 @@ import torch.autograd import torch.serialization import re -import collections +from torch._six import container_abcs import contextlib import numbers import warnings @@ -354,7 +354,7 @@ def _run_symbolic_method(op_name, symbolic_fn, args): def _is_onnx_list(value): if not isinstance(value, string_classes) and \ not isinstance(value, torch.Tensor) and \ - isinstance(value, collections.Iterable): + isinstance(value, 
container_abcs.Iterable): return True return False diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 21f69bb82804d3..41c1e916f4d8d1 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -1,4 +1,5 @@ -from collections import defaultdict, Iterable +from collections import defaultdict +from torch._six import container_abcs import torch from copy import deepcopy @@ -123,7 +124,7 @@ def cast(param, value): return value elif isinstance(value, dict): return {k: cast(param, v) for k, v in value.items()} - elif isinstance(value, Iterable): + elif isinstance(value, container_abcs.Iterable): return type(value)(cast(param, v) for v in value) else: return value diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 9d69ab4daf0fc8..0874fd1185028d 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -6,7 +6,7 @@ from . import SequentialSampler, RandomSampler, BatchSampler import signal import functools -import collections +from torch._six import container_abcs import re import sys import threading @@ -187,9 +187,9 @@ def default_collate(batch): return torch.DoubleTensor(batch) elif isinstance(batch[0], string_classes): return batch - elif isinstance(batch[0], collections.Mapping): + elif isinstance(batch[0], container_abcs.Mapping): return {key: default_collate([d[key] for d in batch]) for key in batch[0]} - elif isinstance(batch[0], collections.Sequence): + elif isinstance(batch[0], container_abcs.Sequence): transposed = zip(*batch) return [default_collate(samples) for samples in transposed] @@ -201,9 +201,9 @@ def pin_memory_batch(batch): return batch.pin_memory() elif isinstance(batch, string_classes): return batch - elif isinstance(batch, collections.Mapping): + elif isinstance(batch, container_abcs.Mapping): return {k: pin_memory_batch(sample) for k, sample in batch.items()} - elif isinstance(batch, collections.Sequence): + elif isinstance(batch, container_abcs.Sequence): return [pin_memory_batch(sample) for sample in batch] else: return batch From 4672280b550928e9d4c41adc8a7cecf652b0b0c4 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 13 Sep 2018 15:13:05 -0700 Subject: [PATCH 133/237] Pass Storage by value (#11546) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11546 - Reviewed By: ezyang Differential Revision: D9778043 fbshipit-source-id: b1381cd60a826055ce8771d6c67eac4cc375b3b4 --- aten/src/ATen/core/TensorImpl.cpp | 36 +++++++++++++++++-------------- aten/src/ATen/core/TensorImpl.h | 5 +---- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 1bf6802c7d4d8e..577959c57a65c2 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -17,28 +17,32 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl({}, type_id, scalar_type, is_variable) { +namespace { +Storage createStorage_(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator) { // UndefinedTensors and SparseTensors don't have storages. 
if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); + return Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); + } else { + return Storage(); } } +} + +TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) + : TensorImpl(createStorage_(type_id, scalar_type, allocator), type_id, is_variable) { +} -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} - -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) - : storage_(std::move(storage)), - storage_offset_(0), - sizes_{0}, - strides_{1}, - is_contiguous_(true), - numel_(0), - type_id_(type_id), - scalar_type_(scalar_type), - is_variable_(is_variable) {} +TensorImpl::TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable) + :storage_(std::move(storage)), + storage_offset_(0), + sizes_{0}, + strides_{1}, + is_contiguous_(true), + numel_(0), + type_id_(type_id), + scalar_type_(storage_ ? dataTypeToScalarType(storage_.dtype().id()) : ScalarType::Undefined), + is_variable_(is_variable) {} IntList TensorImpl::sizes() const { return sizes_; diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 831c53e23b6d49..acc8efbf86c986 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -23,7 +23,7 @@ namespace at { struct AT_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable); virtual void release_resources() override; @@ -209,8 +209,5 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { ScalarType scalar_type_; bool is_variable_ = false; bool is_wrapped_number_ = false; - - private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); }; } // namespace at From 85ff72348d92c9e4b38dd00bbe42f0ca3e66667f Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Thu, 13 Sep 2018 16:15:56 -0700 Subject: [PATCH 134/237] Only involve tensor device in CUDA -> CPU copy, not current device. (#11592) Summary: This also unifies the device usage between the async and sync case. Fixes https://github.com/pytorch/pytorch/issues/10832. 
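As an aside, a rough sketch (not from this patch; the helper name is made up and error checking is omitted) of the rule the fix enforces: issue the device-to-host copy with the source tensor's device active, not whatever device happens to be current, then restore the caller's device. The real change is the THCTensorCopy.cpp hunk below, and the new test_cuda_primary_ctx.py test asserts that copying a tensor living on cuda:1 to the CPU does not create a CUDA primary context on device 0.

```
#include <cuda_runtime.h>
#include <cstddef>

// Sketch only: save/restore the current device around a D2H copy, keyed off
// the device the source tensor lives on.
void copy_to_host(void* dst, const void* src, size_t nbytes,
                  int tensor_device, cudaStream_t stream) {
  int current_device = 0;
  cudaGetDevice(&current_device);
  if (current_device != tensor_device) {
    cudaSetDevice(tensor_device);   // switch to the device the tensor lives on
  }
  cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);    // keep the synchronous copy semantics
  if (current_device != tensor_device) {
    cudaSetDevice(current_device);  // restore the previously current device
  }
}
```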
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11592 Differential Revision: D9797355 Pulled By: gchanan fbshipit-source-id: e496cd371111cfaf9a6c664167967b395e3d72e9 --- aten/src/THC/generic/THCTensorCopy.cpp | 11 +++++ test/run_test.py | 1 + test/test_cuda_primary_ctx.py | 58 ++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 test/test_cuda_primary_ctx.py diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp index 96ab307182639c..0c20edfbd9fd36 100644 --- a/aten/src/THC/generic/THCTensorCopy.cpp +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -58,6 +58,13 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) { THTensor *selfc = THTensor_(newContiguous)(self); + int tensorDevice = THCTensor_(getDevice)(state, src); + int currentDevice; + THCudaCheck(cudaGetDevice(&currentDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } src = THCTensor_(newContiguous)(state, src); cudaStream_t stream = THCState_getCurrentStream(state); @@ -68,6 +75,10 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) stream)); THCudaCheck(cudaStreamSynchronize(stream)); + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } + THCTensor_(free)(state, src); THTensor_(freeCopyTo)(selfc, self); } diff --git a/test/run_test.py b/test/run_test.py index d7af8e47ab8767..1e3c2f60e1df34 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -21,6 +21,7 @@ 'cpp_extensions', 'c10d', 'cuda', + 'cuda_primary_ctx', 'dataloader', 'distributed', 'distributions', diff --git a/test/test_cuda_primary_ctx.py b/test/test_cuda_primary_ctx.py new file mode 100644 index 00000000000000..2006b340aa22de --- /dev/null +++ b/test/test_cuda_primary_ctx.py @@ -0,0 +1,58 @@ +import ctypes +import torch +from common import TestCase, run_tests, skipIfRocm +import unittest + +# NOTE: this needs to be run in a brand new process + +# We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here, +# because if we do that, the TEST_CUDNN line from common_cuda will be executed +# multiple times as well during the execution of this test suite, and it will +# cause CUDA OOM error on Windows.
+TEST_CUDA = torch.cuda.is_available() +TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 + +if not TEST_CUDA: + print('CUDA not available, skipping tests') + TestCase = object # noqa: F811 + + +def get_is_primary_context_created(device): + flags = ctypes.cast((ctypes.c_uint * 1)(), ctypes.POINTER(ctypes.c_uint)) + active = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int)) + result = torch.cuda.cudart().cuDevicePrimaryCtxGetState(ctypes.c_int(device), flags, active) + assert result == 0, 'cuDevicePrimaryCtxGetState failed' + return bool(active[0]) + + +class TestCudaPrimaryCtx(TestCase): + @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm + def test_cuda_primary_ctx(self): + # Ensure context has not been created beforehand + self.assertFalse(get_is_primary_context_created(0)) + self.assertFalse(get_is_primary_context_created(1)) + + x = torch.randn(1, device='cuda:1') + + # We should have only created context on 'cuda:1' + self.assertFalse(get_is_primary_context_created(0)) + self.assertTrue(get_is_primary_context_created(1)) + + print(x) + + # We should still have only created context on 'cuda:1' + self.assertFalse(get_is_primary_context_created(0)) + self.assertTrue(get_is_primary_context_created(1)) + + y = torch.randn(1, device='cpu') + y.copy_(x) + + # We should still have only created context on 'cuda:1' + self.assertFalse(get_is_primary_context_created(0)) + self.assertTrue(get_is_primary_context_created(1)) + + # DO NOT ADD ANY OTHER TESTS HERE! ABOVE TEST REQUIRES FRESH PROCESS + +if __name__ == '__main__': + run_tests() From 8402fde27946d56bf2236f9415864f99b989936c Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:26:34 -0700 Subject: [PATCH 135/237] Revert D9778043: Pass Storage by value Differential Revision: D9778043 Original commit changeset: b1381cd60a82 fbshipit-source-id: 40f1de67e939cb41605978d632105a48a91e7629 --- aten/src/ATen/core/TensorImpl.cpp | 36 ++++++++++++++----------------- aten/src/ATen/core/TensorImpl.h | 5 ++++- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 577959c57a65c2..1bf6802c7d4d8e 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -17,32 +17,28 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -namespace { -Storage createStorage_(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator) { +TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) + : TensorImpl({}, type_id, scalar_type, is_variable) { // UndefinedTensors and SparseTensors don't have storages. if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - return Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); - } else { - return Storage(); + storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); } } -} - -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl(createStorage_(type_id, scalar_type, allocator), type_id, is_variable) { -} -TensorImpl::TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable) - :storage_(std::move(storage)), - storage_offset_(0), - sizes_{0}, - strides_{1}, - is_contiguous_(true), - numel_(0), - type_id_(type_id), - scalar_type_(storage_ ? 
dataTypeToScalarType(storage_.dtype().id()) : ScalarType::Undefined), - is_variable_(is_variable) {} +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) + : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} + +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) + : storage_(std::move(storage)), + storage_offset_(0), + sizes_{0}, + strides_{1}, + is_contiguous_(true), + numel_(0), + type_id_(type_id), + scalar_type_(scalar_type), + is_variable_(is_variable) {} IntList TensorImpl::sizes() const { return sizes_; diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index acc8efbf86c986..831c53e23b6d49 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -23,7 +23,7 @@ namespace at { struct AT_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); virtual void release_resources() override; @@ -209,5 +209,8 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { ScalarType scalar_type_; bool is_variable_ = false; bool is_wrapped_number_ = false; + + private: + TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); }; } // namespace at From c18510463b36b2c825ae29a120f2481b097f06ed Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:02 -0700 Subject: [PATCH 136/237] Reduce includes in tensor_impl.h (#11643) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11643 - Reduce the tensor_impl.h includes to the bare minimum necessary - Explicitly namespace std:: Reviewed By: jerryzh168 Differential Revision: D9811028 fbshipit-source-id: 44e32720962b35c12a7b2c93605721b9f6c5b254 --- caffe2/core/tensor_impl.h | 70 +++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index ffa3ef61c4cc27..2b07537d43774b 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -1,8 +1,14 @@ #pragma once -#include "caffe2/core/storage.h" +#include -#include +#include + +#include "caffe2/core/allocator.h" +#include "caffe2/core/common.h" +#include "caffe2/core/flags.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/context_base.h" // A global boolean variable to control whether we free memory when a Tensor // is shrinked to a smaller size. As a result, a Tensor is always going to @@ -19,14 +25,14 @@ namespace caffe2 { /** * A utility function to convert vector to vector. 
*/ -inline vector ToVectorTIndex(const std::vector& src) { - return vector(src.begin(), src.end()); +inline std::vector ToVectorTIndex(const std::vector& src) { + return std::vector(src.begin(), src.end()); } /** * Return product of all dimensions starting from k */ -inline TIndex size_from_dim_(int k, const vector& dims) { +inline TIndex size_from_dim_(int k, const std::vector& dims) { TIndex r = 1; for (size_t i = k; i < dims.size(); ++i) { r *= dims[i]; @@ -35,7 +41,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { } // Product of all dims up to k (not including dims[k]) -inline TIndex size_to_dim_(int k, const vector& dims) { +inline TIndex size_to_dim_(int k, const std::vector& dims) { CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; for (int i = 0; i < k; ++i) { @@ -45,7 +51,7 @@ inline TIndex size_to_dim_(int k, const vector& dims) { } // Product of all dims between k and l (not including dims[k] and dims[l]) -inline TIndex size_between_dim_(int k, int l, const vector& dims) { +inline TIndex size_between_dim_(int k, int l, const std::vector& dims) { CAFFE_ENFORCE((unsigned)l < dims.size()); TIndex r = 1; if (k < l) { @@ -94,12 +100,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // and immediately discard it in Resize() since // reset_tensor will be true and FreeMemory will be called, // we might want to avoid creating Storage twice? - explicit TensorImpl(const vector& dims, at::DeviceType device_type) + explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) : storage_(device_type) { Resize(dims); } - explicit TensorImpl(const vector& dims, at::DeviceType device_type) + explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) : storage_(device_type) { Resize(dims); } @@ -109,7 +115,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ TensorImpl( const TensorImpl& src, - BaseContext* context_for_copy, + at::BaseContext* context_for_copy, at::DeviceType device_type) : storage_(device_type) { CopyFrom(src, context_for_copy); @@ -130,9 +136,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ template TensorImpl( - const vector& dims, - const vector& values, - BaseContext* context) + const std::vector& dims, + const std::vector& values, + at::BaseContext* context) : storage_(context->GetDevicetype(), TypeMeta::Make()) { Resize(dims); CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); @@ -147,9 +153,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template < typename T, typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, BaseContext* context) + TensorImpl(const T& value, at::BaseContext* context) : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(vector{}); + Resize(std::vector{}); context->CopyItemsFromCPU( storage_.dtype(), numel_, &value, mutable_data()); } @@ -174,7 +180,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * Since we removed template from tensor, we now store a static * context pointer in tensor, which indicates the type of the tensor. 
*/ - BaseStaticContext* GetStaticContext() const { + at::BaseStaticContext* GetStaticContext() const { return get_static_context(GetDeviceType()); } @@ -185,7 +191,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * TODO(jerryzh): move this to a global registry * that can create context for us */ - std::unique_ptr CreateContext() const { + std::unique_ptr CreateContext() const { return GetStaticContext()->CreateContext(); } @@ -197,12 +203,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * @brief Copies the data from a source tensor, with a contex provided to * carry out the underlying memcpy operation. */ - void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { + void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { if ((void*)&src == (void*)this) { return; } if (storage_.dtype() != src.meta()) { - storage_ = Storage(GetDeviceType(), src.meta()); + storage_ = at::Storage(GetDeviceType(), src.meta()); } if (src.size() == -1) { dims_.clear(); @@ -250,7 +256,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * @brief Extend the outer-most dimension of this tensor * to dimension of `num`. */ - void ExtendTo(TIndex num, float growthPct, BaseContext* context) { + void ExtendTo(TIndex num, float growthPct, at::BaseContext* context) { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); CAFFE_ENFORCE(context != nullptr, "Context must be provided."); @@ -266,7 +272,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * growthPct. This ensures that Extend runs on an amortized O(1) time * complexity. */ - void Extend(TIndex num, float growthPct, BaseContext* context) { + void Extend(TIndex num, float growthPct, at::BaseContext* context) { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); @@ -412,7 +418,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * Resizes the tensor without touching underlying storage. * This requires the total size of the tensor to remains constant. */ - inline void Reshape(const vector& dims) { + inline void Reshape(const std::vector& dims) { TIndex new_size = 1; for (auto d : dims) { CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); @@ -430,7 +436,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_ = dims; } - inline void Reshape(const vector& dims) { + inline void Reshape(const std::vector& dims) { Reshape(ToVectorTIndex(dims)); } @@ -441,7 +447,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void FreeMemory() { // We'll detach from the old Storage and create a new one - storage_ = Storage(storage_.device_type(), storage_.dtype()); + storage_ = at::Storage(storage_.device_type(), storage_.dtype()); } /** @@ -449,7 +455,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * is very slow since it involves quite some string operations, so do not use * it in your performance-critical code. 
*/ - string DebugString() const { + std::string DebugString() const { std::stringstream ss; ss << "A Tensor of item size " << storage_.itemsize() << " and type " << storage_.dtype().name() << " and dimension ("; @@ -552,7 +558,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } else { int64_t numel = capacity / data_type.itemsize(); // Create a new Storage - storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); + storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); } } @@ -613,7 +619,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { storage_.set_dtype(meta); } else { if (storage_.dtype() != meta) { - storage_ = Storage(storage_.device_type(), meta); + storage_ = at::Storage(storage_.device_type(), meta); } } @@ -735,7 +741,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { /** * Returns the dimensions of the tensor as a vector. */ - inline const vector& dims() const { + inline const std::vector& dims() const { return dims_; } @@ -813,11 +819,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { GetStaticContext()->ExtractDeviceOption(device, raw_data()); } - const Storage& storage() { + const at::Storage& storage() { return storage_; } - const Storage& storage() const { + const at::Storage& storage() const { return storage_; } @@ -830,14 +836,14 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // The logic is that if Extend() or ReserveSpace() were ever called, // then subsequent Resize()s will not free up Storage. bool reserved_ = false; - Storage storage_; + at::Storage storage_; // int64_t storage_offset_; private: template < typename T, typename = typename std::enable_if::value>::type> - bool SetDims(const vector& src) { + bool SetDims(const std::vector& src) { auto old_numel = numel_; dims_.resize(src.size()); TIndex new_numel = 1; From 7607b495383397cb89920f7cd794933dfe9a89fa Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:04 -0700 Subject: [PATCH 137/237] s/GetDevicetype/device_type/ (#11656) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11656 The mis-capitalization really sticks up my craw. I know why (we already have a static function named GetDeviceType), but let's name it differently. ``` codemod -d . 
--extensions cc,cpp,cu,cuh,h,py,hpp,TARGETS GetDevicetype device_type ``` Reviewed By: jerryzh168 Differential Revision: D9813544 fbshipit-source-id: fe462f4bc40b03e74921f8cf5ebd9cfc52e7e636 --- aten/src/ATen/core/context_base.h | 6 +++--- caffe2/core/context.h | 2 +- caffe2/core/context_gpu.h | 2 +- caffe2/core/hip/context_hip.h | 2 +- caffe2/core/tensor_impl.h | 6 +++--- caffe2/ideep/utils/ideep_context.h | 2 +- caffe2/mkl/utils/mkl_context.h | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 2ca9a7f6851102..7cf1b7cc174980 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -61,7 +61,7 @@ class AT_CORE_API BaseContext { virtual BaseStaticContext* GetStaticContext() const = 0; /* Sorry for the naming, will get rid of this in future diff */ - virtual DeviceType GetDevicetype() const = 0; + virtual DeviceType device_type() const = 0; virtual void SwitchToDevice(int /*stream_id*/) = 0; @@ -96,13 +96,13 @@ class AT_CORE_API BaseContext { DeviceType type) { if (type == DeviceType::CPU) { CopyBytesToCPU(nbytes, src, dst); - } else if (type == GetDevicetype()) { + } else if (type == device_type()) { CopyBytesSameDevice(nbytes, src, dst); } else { AT_ERROR( "CopyBytesToDevice can only copy to CPU or between same " "device. Can't copy from: ", - GetDevicetype(), + device_type(), " to", type); } diff --git a/caffe2/core/context.h b/caffe2/core/context.h index 4faaea93c6da12..a4e87e8110dc83 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -153,7 +153,7 @@ class CAFFE2_API CPUContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CPU; } diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index cca1b8be6500a8..987c9ffe35299d 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -285,7 +285,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { return cudaStreamQuery(stream) == cudaSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CUDA; } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index cd309e6473be4f..5a7613cf934fd0 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -269,7 +269,7 @@ class HIPContext final : public BaseContext { return hipStreamQuery(stream) == hipSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return HIP; } diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 2b07537d43774b..4abe8c910e3361 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -139,7 +139,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { const std::vector& dims, const std::vector& values, at::BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { + : storage_(context->device_type(), TypeMeta::Make()) { Resize(dims); CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); context->CopyItemsFromCPU( @@ -154,7 +154,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { typename T, typename = typename std::enable_if::value>::type> TensorImpl(const T& value, at::BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { + : storage_(context->device_type(), TypeMeta::Make()) { Resize(std::vector{}); context->CopyItemsFromCPU( storage_.dtype(), 
numel_, &value, mutable_data()); @@ -236,7 +236,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); } else { CAFFE_ENFORCE( - context->GetDevicetype() == src.GetDeviceType(), + context->device_type() == src.GetDeviceType(), "Type for provided context does not match the type of source"); context->CopyBytesToDevice( nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index c8657728c57e76..f50a4f34c66789 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -119,7 +119,7 @@ class IDEEPContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return IDEEP; } diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 636ebf2217eac8..0a7b5808a446be 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -127,7 +127,7 @@ class MKLContext : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return MKLDNN; } From 02980d7f8ca707be9f111d5b34d7654dafdefb03 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:06 -0700 Subject: [PATCH 138/237] Refactor Tensor/TensorImpl constructors. (#11657) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11657 Previously, we had a constructor in TensorImpl for every constructor in Tensor. This was unnecessary and wordy: Tensor is the user-visible class, so it deserves the constructors, but TensorImpl is internal and doesn't need it. So I replaced TensorImpl with a single, Storage accepting constructor, and then rewrote Tensor to use that constructor. Reviewed By: jerryzh168 Differential Revision: D9813742 fbshipit-source-id: 7501b54fe5f39180f1bc07573fd7c1640b0f4e89 --- caffe2/core/tensor.h | 76 ++++++++++++++++++++++++++++----------- caffe2/core/tensor_impl.h | 71 +----------------------------------- 2 files changed, 56 insertions(+), 91 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 5564034e545837..f3ecfae765a615 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -50,45 +50,75 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(DeviceType type) - : impl_(c10::make_intrusive(type)) {} - + explicit Tensor(Storage storage) + : impl_(c10::make_intrusive(std::move(storage))) {} + + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? 
+ Resize(dims); + } explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + Resize(dims); + } + /** + * context_for_copy is required to have the same DeviceType as src + */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - context_for_copy, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src, context_for_copy); + } + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ Tensor(const Tensor& src, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src); + } + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : impl_(c10::make_intrusive( - dims, - values, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); + context->CopyItemsFromCPU( + storage().dtype(), size(), values.data(), mutable_data()); + } + /** + * @brief Creates a scalar tensor, and fills its content with the given value. + * The type of tensor will be decided by the context parameter + */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : impl_(c10::make_intrusive( - value, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage().dtype(), size(), &value, mutable_data()); + } Tensor Clone() const { Tensor x(GetDeviceType()); @@ -275,6 +305,10 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } + + const Storage& storage() { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 4abe8c910e3361..6771612022302b 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -88,77 +88,8 @@ inline int canonical_axis_index_(int axis_index, int ndims) { class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? 
- explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - at::BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const std::vector& dims, - const std::vector& values, - at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. - * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(std::vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} /** * @brief Delete the copy constructor and use Clone explicitly From e1cd220b901c7e1bb37bf822c356ba3f8e6b3134 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:09 -0700 Subject: [PATCH 139/237] Reimplement swap() using default move constructor. (#11659) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11659 This is less error-prone and less code. Reviewed By: smessmer Differential Revision: D9814536 fbshipit-source-id: 028510e31e2fa7a9fa11c1398b0743c5cd085dd5 --- caffe2/core/tensor.h | 3 ++- caffe2/core/tensor_impl.h | 21 +++------------------ 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index f3ecfae765a615..a167658060e7fc 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -188,7 +188,8 @@ class CAFFE2_API Tensor final { // swap method swaps the CONTENTS of the tensors, while std::swap // swaps the POINTERS. void swap(const Tensor& other) const noexcept { - impl_.get()->swap(*other.impl_.get()); + // NB: use get() to get a non-const pointer! 
+ std::swap(*impl_.get(), *other.impl_.get()); } void ShareData(const Tensor& src) const { diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 6771612022302b..4c146e0d257530 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -91,19 +91,10 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} - /** - * @brief Delete the copy constructor and use Clone explicitly - */ - TensorImpl(const TensorImpl& src) = delete; - - TensorImpl(TensorImpl&& src) noexcept { - swap(src); - } - + TensorImpl(const TensorImpl&) = default; + TensorImpl& operator=(const TensorImpl&) = default; + TensorImpl(TensorImpl&&) = default; TensorImpl& operator=(TensorImpl&&) = default; - // Note(jiayq): possibly a rule-of-three violation, but we explicitly - // discourage the use of = for Tensors. - TensorImpl& operator=(const TensorImpl& src) = delete; virtual ~TensorImpl() noexcept {} @@ -397,12 +388,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return ss.str(); } - void swap(TensorImpl& other) noexcept { - std::swap(dims_, other.dims_); - std::swap(numel_, other.numel_); - std::swap(storage_, other.storage_); - } - /** * @brief Shares the data with another tensor. * From 760679352e8a6fda08b55c7e0cf2c916c7ed6d53 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Thu, 13 Sep 2018 18:19:24 -0700 Subject: [PATCH 140/237] Move Pixel Shuffle to ATen (#9721) Summary: #9692 Pull Request resolved: https://github.com/pytorch/pytorch/pull/9721 Differential Revision: D8955829 Pulled By: SsnL fbshipit-source-id: 4f4d1c7720b6f757fbef9a10f70209ae76f61399 --- aten/src/ATen/native/PixelShuffle.cpp | 34 +++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 2 ++ torch/nn/functional.py | 39 ++++++++-------------- torch/nn/modules/pixelshuffle.py | 16 ++++----- 4 files changed, 57 insertions(+), 34 deletions(-) create mode 100644 aten/src/ATen/native/PixelShuffle.cpp diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp new file mode 100644 index 00000000000000..1f93ecbc8235ab --- /dev/null +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -0,0 +1,34 @@ +#include "ATen/native/TensorTransformations.h" + +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { + AT_ASSERTM(self.dim() == 4, + "pixel_shuffle expects 4D input, but got input with sizes ",self.sizes()); + int64_t b = self.size(0); + int64_t c = self.size(1); + int64_t h = self.size(2); + int64_t w = self.size(3); + int64_t upscale_factor_squared = upscale_factor * upscale_factor; + AT_ASSERTM(c % upscale_factor_squared == 0, + "pixel_shuffle expects input channel to be divisible by square of " + "upscale_factor, but got input with sizes ", self.sizes(), + ", upscale_factor=", upscale_factor, + ", and self.size(1)=", c, " is not divisible by ", upscale_factor_squared); + int64_t oc = c / upscale_factor_squared; + int64_t oh = h * upscale_factor; + int64_t ow = w * upscale_factor; + + auto input_reshaped = self.reshape({b, oc, upscale_factor, upscale_factor, h, w}); + return input_reshaped.permute({0 /* b */, 1 /* oc */, 4 /* h */, 2 /* 1st upscale_factor */, 5 /* w */, 3 /* 2nd upscale_factor */}) + .reshape({b, oc, oh, ow}); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c15aefa2f671d6..5c99d7c97e9b3e 
100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1213,6 +1213,8 @@ - func: permute(Tensor self, IntList dims) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. +- func: pixel_shuffle(Tensor self, int64_t upscale_factor) -> Tensor + - func: pin_memory(Tensor self) -> Tensor variants: function, method diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 243974e152bb27..01d3775d5f0927 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1918,36 +1918,23 @@ def multi_margin_loss(input, target, p=1, margin=1, weight=None, size_average=No return torch._C._nn.multi_margin_loss(input, target, p, margin, weight, reduction) -def pixel_shuffle(input, upscale_factor): - r"""Rearranges elements in a tensor of shape :math:`[*, C*r^2, H, W]` to a - tensor of shape :math:`[C, H*r, W*r]`. +pixel_shuffle = _add_docstr(torch.pixel_shuffle, r""" +Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` to a +tensor of shape :math:`(C, H \times r, W \times r)`. - See :class:`~torch.nn.PixelShuffle` for details. +See :class:`~torch.nn.PixelShuffle` for details. - Args: - input (Tensor): Input - upscale_factor (int): factor to increase spatial resolution by - - Examples:: - - >>> ps = nn.PixelShuffle(3) - >>> input = torch.empty(1, 9, 4, 4) - >>> output = ps(input) - >>> print(output.size()) - torch.Size([1, 1, 12, 12]) - """ - batch_size, channels, in_height, in_width = input.size() - channels //= upscale_factor ** 2 - - out_height = in_height * upscale_factor - out_width = in_width * upscale_factor +Args: + input (Tensor): the input tensor + upscale_factor (int): factor to increase spatial resolution by - input_view = input.contiguous().view( - batch_size, channels, upscale_factor, upscale_factor, - in_height, in_width) +Examples:: - shuffle_out = input_view.permute(0, 1, 4, 2, 5, 3).contiguous() - return shuffle_out.view(batch_size, channels, out_height, out_width) + >>> input = torch.randn(1, 9, 4, 4) + >>> output = torch.nn.functional.pixel_shuffle(input, 3) + >>> print(output.size()) + torch.Size([1, 1, 12, 12]) +""") def upsample(input, size=None, scale_factor=None, mode='nearest', align_corners=None): diff --git a/torch/nn/modules/pixelshuffle.py b/torch/nn/modules/pixelshuffle.py index e68946d7c7fe90..0e3883120ca352 100644 --- a/torch/nn/modules/pixelshuffle.py +++ b/torch/nn/modules/pixelshuffle.py @@ -3,28 +3,28 @@ class PixelShuffle(Module): - r"""Rearranges elements in a Tensor of shape :math:`(*, r^2C, H, W)` to a - tensor of shape :math:`(C, rH, rW)`. + r"""Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` + to a tensor of shape :math:`(C, H \times r, W \times r)`. This is useful for implementing efficient sub-pixel convolution with a stride of :math:`1/r`. Look at the paper: `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ - by Shi et. al (2016) for more details + by Shi et. al (2016) for more details. 
Args: upscale_factor (int): factor to increase spatial resolution by Shape: - - Input: :math:`(N, C * \text{upscale\_factor}^2, H, W)` - - Output: :math:`(N, C, H * \text{upscale\_factor}, W * \text{upscale\_factor})` + - Input: :math:`(N, C \times \text{upscale_factor}^2, H, W)` + - Output: :math:`(N, C, H \times \text{upscale_factor}, W \times \text{upscale_factor})` Examples:: - >>> ps = nn.PixelShuffle(3) - >>> input = torch.tensor(1, 9, 4, 4) - >>> output = ps(input) + >>> pixel_shuffle = nn.PixelShuffle(3) + >>> input = torch.randn(1, 9, 4, 4) + >>> output = pixel_shuffle(input) >>> print(output.size()) torch.Size([1, 1, 12, 12]) From 513fd3dd36f146cc0ed45437e58372a4b46b93b2 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Thu, 13 Sep 2018 19:15:15 -0700 Subject: [PATCH 141/237] Improve doc of `torch.nn.functional.pad` (#11623) Summary: I'm reading the doc of `torch.nn.functional.pad` and it looks a bit confusing to me. Hopefully this PR makes it clearer. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11623 Differential Revision: D9818255 Pulled By: soumith fbshipit-source-id: 4f6b17b0211c6927007f44bfdf42df5f84d47536 --- torch/nn/functional.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 01d3775d5f0927..61c0fdce5fe048 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -2227,22 +2227,22 @@ def affine_grid(theta, size): def pad(input, pad, mode='constant', value=0): r"""Pads tensor. - `Nd` constant padding: The number of dimensions to pad is - :math:`\left\lfloor\frac{len(padding)}{2}\right\rfloor` and the dimensions that get padded begins with the - last dimension and moves forward. See below for examples. - - `1D`, `2D` and `3D` "reflect" / "replicate" padding: - for 1D: - 3D input tensor with padding of the form `(padLeft, padRight)` - for 2D: - 4D input tensor with padding of the form `(padLeft, padRight, padTop, padBottom)`. - for 3D: - 5D input tensor with padding of the form - `(padLeft, padRight, padTop, padBottom, padFront, padBack)`. No "reflect" implementation. - - See :class:`torch.nn.ConstantPad2d`, :class:`torch.nn.ReflectionPad2d`, and - :class:`torch.nn.ReplicationPad2d` for concrete examples on how each of the - padding modes works. + Pading size: + The number of dimensions to pad is :math:`\left\lfloor\frac{\text{len(pad)}}{2}\right\rfloor` + and the dimensions that get padded begins with the last dimension and moves forward. + For example, to pad the last dimension of the input tensor, then `pad` has form + `(padLeft, padRight)`; to pad the last 2 dimensions of the input tensor, then use + `(padLeft, padRight, padTop, padBottom)`; to pad the last 3 dimensions, use + `(padLeft, padRight, padTop, padBottom, padFront, padBack)`. + + Padding mode: + See :class:`torch.nn.ConstantPad2d`, :class:`torch.nn.ReflectionPad2d`, and + :class:`torch.nn.ReplicationPad2d` for concrete examples on how each of the + padding modes works. Constant padding is implemented for arbitrary dimensions. + Replicate padding is implemented for padding the last 3 dimensions of 5D input + tensor, or the last 2 dimensions of 4D input tensor, or the last dimension of + 3D input tensor. Reflect padding is only implemented for padding the last 2 + dimensions of 4D input tensor, or the last dimension of 3D input tensor. 
Args: input (Tensor): `Nd` tensor From 98e04db955f0d142a132ab615d3188db78c18608 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Thu, 13 Sep 2018 19:23:06 -0700 Subject: [PATCH 142/237] Implement requires_grad propagation in the JIT (#11586) Summary: Previously, we would pretty much assume that all floating point tensors do require grad, which might result in some unnecessary compute. I don't really like the fact that `TensorType` uses `tensor.is_variable() && tensor.requires_grad()` to infer the value of `requires_grad`, but changing constants to keep variables turns out to be pretty hard. I got halfway there, but it would still need some more work. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11586 Reviewed By: ezyang Differential Revision: D9813648 Pulled By: apaszke fbshipit-source-id: 77f77756d18ff7632fca3aa68ce855e1d7f3bdb8 --- test/expect/TestJit.test_cpp_cuda.expect | 36 ++--- ...pt.test_milstm_fusion_cuda-backward.expect | 91 ++++++------- test/test_jit.py | 6 +- torch/CMakeLists.txt | 1 + torch/csrc/jit/autodiff.cpp | 78 ++++------- torch/csrc/jit/autodiff.h | 4 +- torch/csrc/jit/graph_executor.cpp | 14 +- torch/csrc/jit/init.cpp | 4 +- torch/csrc/jit/ir.h | 3 + .../jit/passes/requires_grad_analysis.cpp | 128 ++++++++++++++++++ .../csrc/jit/passes/requires_grad_analysis.h | 15 ++ torch/csrc/jit/test_jit.cpp | 23 ++-- torch/csrc/jit/type.h | 44 ++++-- 13 files changed, 287 insertions(+), 160 deletions(-) create mode 100644 torch/csrc/jit/passes/requires_grad_analysis.cpp create mode 100644 torch/csrc/jit/passes/requires_grad_analysis.h diff --git a/test/expect/TestJit.test_cpp_cuda.expect b/test/expect/TestJit.test_cpp_cuda.expect index 50d2cbe963833c..451f1f9329601c 100644 --- a/test/expect/TestJit.test_cpp_cuda.expect +++ b/test/expect/TestJit.test_cpp_cuda.expect @@ -129,38 +129,38 @@ graph(%0 : Float(2, 3, 4) } testDifferentiateWithRequiresGrad -graph(%0 : Float(2, 3, 4) - %1 : Float(2, 3, 4)) { - %2 : Float(2, 3, 4) = aten::mul(%1, %1) +graph(%0 : Float(*) + %1 : Float(*)) { + %2 : Float(*) = aten::mul(%1, %1) %3 : int = prim::Constant[value=1]() - %4 : Float(2, 3, 4) = aten::add(%2, %1, %3) + %4 : Float(*) = aten::add(%2, %1, %3) %5 : int = prim::Constant[value=1]() - %6 : Float(2, 3, 4) = aten::add(%4, %0, %5) - %7 : Float(2, 3, 4) = aten::mul(%6, %0) + %6 : Float(*) = aten::add(%4, %0, %5) + %7 : Float(*) = aten::mul(%6, %0) %8 : int = prim::Constant[value=1]() - %9 : Float(2, 3, 4) = aten::add(%7, %1, %8) + %9 : Float(*) = aten::add(%7, %1, %8) return (%4, %9, %6); } -graph(%0 : Float(2, 3, 4) - %1 : Float(2, 3, 4) - %2 : Float(2, 3, 4) - %3 : Float(2, 3, 4)) { +graph(%0 : Float(*) + %1 : Float(*) + %2 : Float(*) + %3 : Float(*)) { %4 : int = prim::Constant[value=1]() - %5 : Float(2, 3, 4), %6 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%0) + %5 : Float(*), %6 : Dynamic = prim::GradOf[name="aten::add"](%0) block0() { - %7 : Float(2, 3, 4) = aten::mul(%0, %4) + %7 : Dynamic = aten::mul(%0, %4) -> (%0, %7) } - %8 : Float(2, 3, 4), %9 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%5) + %8 : Dynamic, %9 : Dynamic = prim::GradOf[name="aten::mul"](%5) block0() { - %10 : Float(2, 3, 4) = aten::mul(%5, %2) - %11 : Float(2, 3, 4) = aten::mul(%5, %3) + %10 : Dynamic = aten::mul(%5, %2) + %11 : Dynamic = aten::mul(%5, %3) -> (%10, %11) } %12 : Dynamic = prim::AutogradAdd(%1, %8) - %13 : Float(2, 3, 4), %14 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%12) + %13 : Dynamic, %14 : Dynamic = prim::GradOf[name="aten::add"](%12) block0() { - %15 : Float(2, 
3, 4) = aten::mul(%12, %4) + %15 : Dynamic = aten::mul(%12, %4) -> (%12, %15) } %16 : Dynamic = prim::AutogradAdd(%9, %14) diff --git a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect index 5a099e3b3e6903..fb41a574947080 100644 --- a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect @@ -26,19 +26,15 @@ graph(%0 : Float(*, *) %cellgate : Float(*, *) %outgate : Float(*, *) %27 : Float(*, *)) { - %28 : Float(*, *), %29 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %17, %1, %27, %0) - %30 : Float(*, *), %31 : Float(*, *), %32 : Float(*, *), %33 : Float(*, *), %34 : Float(*, *), %35 : Float(*, *) = prim::FusionGroup_1[device=0](%14, %15, %Wx, %28, %Uz, %22, %16) - %36 : Float(*, *) = aten::t(%20) - %37 : Float(*, *) = aten::mm(%32, %36) - %38 : Float(*, *) = aten::t(%13) - %39 : Float(*, *) = aten::mm(%38, %32) + %28 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %17, %0, %27, %1) + %29 : Float(*, *), %30 : Float(*, *), %31 : Float(*, *), %32 : Float(*, *), %33 : Float(*, *), %34 : Float(*, *) = prim::FusionGroup_1[device=0](%14, %15, %Wx, %28, %Uz, %22, %16) + %35 : Float(*, *) = aten::t(%13) + %36 : Float(*, *) = aten::mm(%35, %31) + %37 : Float(*, *) = aten::t(%36) + %38 : Float(*, *) = aten::t(%12) + %39 : Float(*, *) = aten::mm(%38, %29) %40 : Float(*, *) = aten::t(%39) - %41 : Float(*, *) = aten::t(%18) - %42 : Float(*, *) = aten::mm(%30, %41) - %43 : Float(*, *) = aten::t(%12) - %44 : Float(*, *) = aten::mm(%43, %30) - %45 : Float(*, *) = aten::t(%44) - return (%45, %42, %40, %37, %31, %33, %34, %35, %29); + return (%40, %37, %30, %32, %33, %34); } with prim::FusionGroup_0 = graph(%0 : Float(*, *) %1 : Float(*, *) @@ -48,45 +44,42 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %5 : Float(*, *) %6 : Float(*, *) %7 : Float(*, *)) { - %8 : Float(*, *) = aten::mul(%5, %3) - %9 : Float(*, *) = aten::mul(%6, %6) - %10 : Float(*, *) = aten::neg(%9) - %11 : int = prim::Constant[value=1]() + %8 : Float(*, *) = aten::mul(%7, %6) + %9 : Float(*, *) = aten::mul(%7, %3) + %10 : Float(*, *) = aten::mul(%6, %6) + %11 : Float(*, *) = aten::neg(%10) %12 : int = prim::Constant[value=1]() - %13 : Float(*, *) = aten::add(%10, %12, %12) - %14 : Float(*, *) = aten::mul(%8, %13) + %13 : Float(*, *) = aten::add(%11, %12, %12) + %14 : Float(*, *) = aten::mul(%9, %13) %15 : int = prim::Constant[value=1]() - %16 : int = prim::Constant[value=1]() - %17 : Float(*, *) = aten::add(%7, %14, %16) - %18 : Float(*, *) = aten::mul(%17, %1) - %19 : Float(*, *) = aten::mul(%5, %6) - %20 : int = prim::Constant[value=1]() - %21 : Float(*, *) = aten::mul(%17, %20) - %22 : Float(*, *) = aten::mul(%21, %2) - %23 : Float(*, *) = aten::mul(%21, %0) - %24 : Float(*, *) = aten::mul(%17, %4) - %25 : Float(*, *) = aten::neg(%3) - %26 : int = prim::Constant[value=1]() - %27 : Float(*, *) = aten::add(%25, %26, %26) - %28 : Float(*, *) = aten::mul(%19, %3) - %29 : Float(*, *) = aten::mul(%28, %27) - %30 : Float(*, *) = aten::mul(%2, %2) - %31 : Float(*, *) = aten::neg(%30) - %32 : int = prim::Constant[value=1]() - %33 : Float(*, *) = aten::add(%31, %32, %32) - %34 : Float(*, *) = aten::mul(%23, %33) - %35 : Float(*, *) = aten::neg(%1) - %36 : int = prim::Constant[value=1]() - %37 : Float(*, *) = aten::add(%35, %36, %36) - %38 : Float(*, *) = aten::mul(%24, %1) - %39 : Float(*, *) = 
aten::mul(%38, %37) - %40 : Float(*, *) = aten::neg(%0) - %41 : int = prim::Constant[value=1]() - %42 : Float(*, *) = aten::add(%40, %41, %41) - %43 : Float(*, *) = aten::mul(%22, %0) - %44 : Float(*, *) = aten::mul(%43, %42) - %45 : Float(*, *) = prim::FusedConcat[dim=1](%44, %39, %34, %29) - return (%45, %18); + %16 : Float(*, *) = aten::add(%5, %14, %15) + %17 : int = prim::Constant[value=1]() + %18 : Float(*, *) = aten::mul(%16, %17) + %19 : Float(*, *) = aten::mul(%18, %2) + %20 : Float(*, *) = aten::mul(%18, %0) + %21 : Float(*, *) = aten::mul(%16, %4) + %22 : Float(*, *) = aten::neg(%3) + %23 : int = prim::Constant[value=1]() + %24 : Float(*, *) = aten::add(%22, %23, %23) + %25 : Float(*, *) = aten::mul(%8, %3) + %26 : Float(*, *) = aten::mul(%25, %24) + %27 : Float(*, *) = aten::mul(%2, %2) + %28 : Float(*, *) = aten::neg(%27) + %29 : int = prim::Constant[value=1]() + %30 : Float(*, *) = aten::add(%28, %29, %29) + %31 : Float(*, *) = aten::mul(%20, %30) + %32 : Float(*, *) = aten::neg(%1) + %33 : int = prim::Constant[value=1]() + %34 : Float(*, *) = aten::add(%32, %33, %33) + %35 : Float(*, *) = aten::mul(%21, %1) + %36 : Float(*, *) = aten::mul(%35, %34) + %37 : Float(*, *) = aten::neg(%0) + %38 : int = prim::Constant[value=1]() + %39 : Float(*, *) = aten::add(%37, %38, %38) + %40 : Float(*, *) = aten::mul(%19, %0) + %41 : Float(*, *) = aten::mul(%40, %39) + %42 : Float(*, *) = prim::FusedConcat[dim=1](%41, %36, %31, %26) + return (%42); } with prim::FusionGroup_1 = graph(%0 : Float(*) %1 : Float(*) diff --git a/test/test_jit.py b/test/test_jit.py index 54bc33e8542e43..985a4f760466b7 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -119,9 +119,9 @@ def canonical(graph): def get_lstm_inputs(device, training=False): - input = torch.randn(3, 10, dtype=torch.float, device=device) - hx = torch.randn(3, 20, dtype=torch.float, device=device) - cx = torch.randn(3, 20, dtype=torch.float, device=device) + input = torch.randn(3, 10, dtype=torch.float, device=device, requires_grad=training) + hx = torch.randn(3, 20, dtype=torch.float, device=device, requires_grad=training) + cx = torch.randn(3, 20, dtype=torch.float, device=device, requires_grad=training) module = nn.LSTMCell(10, 20).to(device, torch.float) # Just to allocate weights with correct sizes if training: params = tuple(module.parameters()) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 427270d2a89fa1..04a72819717bad 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -174,6 +174,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 3d53ad3967c8cd..ab6a07cbb1d959 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -72,25 +72,15 @@ bool isDifferentiable(Graph & g) { } -bool outputRequiresGrad(Node* node, std::function requires_grad) { - switch (node->kind()) { - case aten::le: - case aten::ge: - case aten::lt: - case aten::gt: - case aten::ne: - case aten::eq: - return false; - case aten::type_as: - // type_as has two inputs, the second of which (setting type) might require grad, - // but it still won't affect the output of type_as requiring grad. 
- return requires_grad(node->inputs().at(0)); - default: - return std::any_of(node->inputs().begin(), node->inputs().end(), requires_grad); - } -} - static std::vector gradientForNode(Node* node, ArrayRef grad_values) { + static const OperatorSet comparison_ops = { + "aten::lt(Tensor self, Tensor other) -> Tensor", + "aten::le(Tensor self, Tensor other) -> Tensor", + "aten::gt(Tensor self, Tensor other) -> Tensor", + "aten::ge(Tensor self, Tensor other) -> Tensor", + "aten::eq(Tensor self, Tensor other) -> Tensor", + "aten::ne(Tensor self, Tensor other) -> Tensor" + }; const auto build_sym_grad = [node](const std::vector& grads) -> std::vector { auto inputs = fmap(node->inputs()); auto outputs = fmap(node->outputs()); @@ -212,6 +202,8 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val tensor_grads.push_back(nullptr); // for attr::dim return tensor_grads; } + } else if (comparison_ops.find(node)) { + return {nullptr, nullptr}; } else if (node->kind() == prim::Constant) { return {}; @@ -226,28 +218,6 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val return fmap(sym_grads, [](const SymbolicVariable &v) { return v.value(); }); } -static value_set findAllRequiresGradNodes( - Graph& graph, const std::vector& input_requires_grad) { - JIT_ASSERT(graph.inputs().size() == input_requires_grad.size()); - std::unordered_set requires_grad_set; - const auto requires_grad = [&](Value *v) { return requires_grad_set.count(v) > 0; }; - - auto inputs = graph.inputs(); - for (size_t i = 0, num_inputs = inputs.size(); i < num_inputs; ++i) { - if (!input_requires_grad[i]) continue; - requires_grad_set.emplace(inputs[i]); - } - - for (Node * node : graph.nodes()) { - if (!outputRequiresGrad(node, requires_grad)) continue; - for (Value * output : node->outputs()) - requires_grad_set.emplace(output); - } - - return requires_grad_set; -} - - // If we have a function y = f(x) with jacobian J, the backwards of f is dx = J^t dy. // Note that because the backwards always implements this matrix multiply, // we know that it maps an input vector of zeros to an output vector of zero @@ -277,13 +247,11 @@ static std::vector linearGradientForNode(Node* node, ArrayRef gr } struct ReverseDetails { - ReverseDetails(value_map&& grad_map, value_set&& requires_grad_set, Block * reverse_block) + ReverseDetails(value_map&& grad_map, Block * reverse_block) : grad_map(std::move(grad_map)) - , requires_grad_set(std::move(requires_grad_set)) , reverse_block(reverse_block) {} value_map grad_map; - value_set requires_grad_set; Block * reverse_block; }; @@ -305,8 +273,7 @@ static Value* createAutogradAdd(Value* a, Value* b) { // and vjp outputs for all primal inputs that require_grad // - grad_desc has df_input_vjps and df_output_vjps set // (but df_input_vjps will be modified later as well) -static ReverseDetails addReverseInline(Gradient& grad_desc, - const std::vector& input_requires_grad) { +static ReverseDetails addReverseInline(Gradient& grad_desc) { auto & graph = *grad_desc.f; // note: reverse_node is intentionally not inserted to avoid // accidentally acting on it (e.g. 
in elminate dead code), @@ -314,8 +281,6 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, auto reverse_node = graph.create(prim::Reverse, 0); auto reverse_block = reverse_node->addBlock(); WithInsertPoint guard(reverse_block); - auto requires_grad_set = findAllRequiresGradNodes(graph, input_requires_grad); - const auto requires_grad = [&](Value *v) { return requires_grad_set.count(v) > 0; }; value_map grad_map; // x -> dx mapping const auto get_grad = [&](Value* v) -> Value* { @@ -337,7 +302,7 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, auto outputs = graph.outputs(); for (size_t i = 0, num_outputs = outputs.size(); i < num_outputs; ++i) { Value * output = outputs[i]; - if (!requires_grad(output)) + if (!output->requires_grad()) continue; Value * output_grad = reverse_block->addInput()->setType(output->type()); set_grad(output, output_grad); @@ -347,12 +312,15 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, for (auto it = graph.nodes().rbegin(), end = graph.nodes().rend(); it != end; ++it) { Node *node = *it; auto inputs = node->inputs(); - if (!outputRequiresGrad(node, requires_grad)) continue; + auto outputs = node->outputs(); + if (std::all_of(outputs.begin(), outputs.end(), [](Value *v) { return !v->requires_grad(); })) { + continue; + } value_list grad_inputs = linearGradientForNode(node, fmap(node->outputs(), get_grad)); JIT_ASSERT(grad_inputs.size() == node->inputs().size()); for (size_t i = 0, num_inputs = grad_inputs.size(); i < num_inputs; ++i) { - if (!requires_grad(inputs[i])) continue; + if (!inputs[i]->requires_grad()) continue; // NB: Not returning a gradient w.r.t. a value that requires grad is normal if the // input is non-differentiable. This happens e.g. in the aten::type_as case. if (!grad_inputs[i]) continue; @@ -363,7 +331,7 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, auto inputs = graph.inputs(); for (size_t i = 0, num_inputs = inputs.size(); i < num_inputs; ++i) { Value * input = inputs[i]; - if (!requires_grad(input)) + if (!input->requires_grad()) continue; // NB: Not having a gradient defined w.r.t. an input to the graph which requires grad // can happen and is not an error. It might have been used only in non-differentiable @@ -373,7 +341,7 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, reverse_block->registerOutput(get_grad(input)); grad_desc.df_output_vjps.push_back(i); } - return ReverseDetails(std::move(grad_map), std::move(requires_grad_set), reverse_block); + return ReverseDetails(std::move(grad_map), reverse_block); } // Any temporary value from the primal graphs needs to be captured for later use in the @@ -500,7 +468,7 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { for (size_t i = grad_desc.f_real_outputs; i < graph.outputs().size(); ++i) { Value * tmp = graph.outputs().at(i); // Add VJP inputs only for intermediates that actually required grad. 
- if (rev_info.requires_grad_set.count(tmp) == 0) continue; + if (!tmp->requires_grad()) continue; Value * tmp_vjp_in = reverse_block->addInput()->setType(tmp->type()); Value * tmp_vjp_prev = rev_info.grad_map.at(tmp); // This is quite weird because we can't first make a sum and then replace all uses @@ -536,7 +504,7 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& graph) { Gradient grad_desc; // Take ownership of the graph JIT_ASSERTM(graph.use_count() == 1, @@ -547,7 +515,7 @@ Gradient differentiate(std::shared_ptr& graph, const std::vector& r WithInsertPoint guard(grad_desc.f->block()); // Fills in df_input_vjps and df_output_vjps - auto rev_info = addReverseInline(grad_desc, requires_grad); + auto rev_info = addReverseInline(grad_desc); // Lift constants captured for the reverse graph into it liftConstants(grad_desc, rev_info); // addReverseInline has to call gradientForNode if *any* of the outputs diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index ea2b7a1170efeb..ffb07a767083cc 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -85,9 +85,7 @@ struct Gradient { // - Interpret df // - Wrap outputs of df into Variables (that don't require grad) }; -// XXX: When calling this function, graph should have complete type information. -// Use the shape analysis pass to fill in the gaps if it doesn't. -TORCH_API Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad); +TORCH_API Gradient differentiate(std::shared_ptr& graph); // can we take a derivative of this node symbolically? TORCH_API bool isDifferentiable(Node * n); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index d77f1b2be1aa28..1202713121362d 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -23,6 +23,7 @@ #include "torch/csrc/jit/passes/lower_grad_of.h" #include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/inline_autodiff_subgraphs.h" +#include "torch/csrc/jit/passes/requires_grad_analysis.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" #include "torch/csrc/jit/custom_operator.h" @@ -384,6 +385,7 @@ struct GraphExecutorImpl { // anyway, so it's better to run it first. ConstantPropagation(opt_graph); PropagateInputShapes(*opt_graph, spec); + PropagateRequiresGrad(opt_graph, spec); // Phase 3. Run differentiable optimizations (i.e. simple graph rewrites that // we can still execute using autograd). @@ -396,18 +398,8 @@ struct GraphExecutorImpl { if (needsGradient(opt_graph, spec)) { auto diff_nodes = CreateAutodiffSubgraphs(*opt_graph); for (Node * dnode : diff_nodes) { - // XXX: we don't have requires_grad information on the intermediate values, - // so we conservatively assume it's always true (on tensor inputs). auto diff_graph = std::move(dnode->g(attr::Subgraph)); - auto requires_grads = fmap(diff_graph->inputs(), [](Value* v) { - // NB: only floating-point inputs can have requires_grad=True. If we - // don't have type information, we have to assume that it's true. 
- if (auto tensor_type = v->type()->cast()) { - return at::isFloatingType(tensor_type->scalarType()); - } - return v->type()->isSubtypeOf(DynamicType::get()); - }); - Gradient gradient = differentiate(diff_graph, requires_grads); + Gradient gradient = differentiate(diff_graph); runNondiffOptimization(gradient.f); packGradient(gradient, dnode); } diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 0d607c3c364a80..5484c04716c406 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -116,12 +116,12 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_fixup_onnx_loops", FixupONNXLoops) .def("_jit_pass_canonicalize_ops", CanonicalizeOps) .def("_jit_pass_specialize_undef", specializeUndef) - .def("_jit_differentiate", [](Graph &g, const std::vector& requires_grad) { + .def("_jit_differentiate", [](Graph &g) { // the python binding slightly differs in semantics // it makes a copy of the input Graph, and works on that // jit::differentiate mutates the input Graph auto g_clone = g.copy(); - return differentiate(g_clone, requires_grad); + return differentiate(g_clone); }); py::class_(m, "CompleteArgumentSpec") diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 5e70adfe400916..58e702b771acaf 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -190,6 +190,9 @@ struct Value { JIT_ASSERT(type_ != nullptr); return type_; } + bool requires_grad() const { + return type()->requires_grad(); + } bool isTensor() const { return type()->kind() == TypeKind::CompleteTensorType; } diff --git a/torch/csrc/jit/passes/requires_grad_analysis.cpp b/torch/csrc/jit/passes/requires_grad_analysis.cpp new file mode 100644 index 00000000000000..8b17859af02c07 --- /dev/null +++ b/torch/csrc/jit/passes/requires_grad_analysis.cpp @@ -0,0 +1,128 @@ +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/type.h" +#include "torch/csrc/jit/argument_spec.h" +#include "torch/csrc/jit/operator.h" + +#include + +namespace torch { namespace jit { + +namespace { + +bool getRequiresGrad(Value * value) { + return value->requires_grad(); +} + +void setRequiresGrad(Value * value, bool req_value) { + if (auto type = value->type()->cast()) { + value->setType(type->withRequiresGrad(req_value)); + } +} + +void setRequiresGrad(at::ArrayRef outputs, const std::vector& values) { + JIT_ASSERT(outputs.size() == values.size()); + for (size_t i = 0; i < values.size(); ++i) { + setRequiresGrad(outputs[i], values[i]); + } +} + +void setRequiresGrad(Node * node, const std::vector& values) { + setRequiresGrad(node->outputs(), values); +} + +std::vector bitwiseOr(std::vector a, const std::vector& b) { + JIT_ASSERT(a.size() == b.size()); + for (size_t i = 0; i < a.size(); ++i) { + a[i] = a[i] || b[i]; + } + return a; +} + + +void PropagateRequiresGradSimpleNode(Node* node) { + static const OperatorSet comparison_ops = { + "aten::lt(Tensor self, Tensor other) -> Tensor", + "aten::le(Tensor self, Tensor other) -> Tensor", + "aten::gt(Tensor self, Tensor other) -> Tensor", + "aten::ge(Tensor self, Tensor other) -> Tensor", + "aten::eq(Tensor self, Tensor other) -> Tensor", + "aten::ne(Tensor self, Tensor other) -> Tensor", + "aten::lt(Tensor self, Scalar other) -> Tensor", + "aten::le(Tensor self, Scalar other) -> Tensor", + "aten::gt(Tensor self, Scalar other) -> Tensor", + "aten::ge(Tensor self, Scalar other) -> Tensor", + "aten::eq(Tensor self, Scalar other) -> Tensor", + "aten::ne(Tensor self, Scalar other) -> Tensor", + }; + + if (comparison_ops.find(node)) { + return 
setRequiresGrad(node->output(), false); + } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + return setRequiresGrad(node->output(), node->input(0)->requires_grad()); + } else if (node->matches("aten::detach(Tensor self) -> Tensor")) { + return setRequiresGrad(node->output(), false); + } + + auto inputs = node->inputs(); + auto outputs = node->outputs(); + bool should_require = std::any_of(inputs.begin(), inputs.end(), getRequiresGrad); + for (size_t i = 0; i < outputs.size(); ++i) { + if (auto type = outputs[i]->type()->cast()) { + setRequiresGrad(outputs[i], should_require && at::isFloatingType(type->scalarType())); + } + } +} + +void PropagateRequiresGrad(Block * block); + +void PropagateRequiresGrad(Node * node) { + if (node->kind() == prim::If) { + auto blocks = node->blocks(); + auto true_block = blocks.at(0); + auto false_block = blocks.at(1); + + PropagateRequiresGrad(true_block); + PropagateRequiresGrad(false_block); + + auto outputs_require = + bitwiseOr(fmap(true_block->outputs(), getRequiresGrad), + fmap(false_block->outputs(), getRequiresGrad)); + setRequiresGrad(node, outputs_require); + } else if (node->kind() == prim::Loop) { + auto body = node->blocks().at(0); + std::vector body_inputs_require = fmap(node->inputs().slice(2), getRequiresGrad); + std::vector body_outputs_require (node->outputs().size(), false); + + while (body_inputs_require != body_outputs_require) { + body_inputs_require = bitwiseOr(body_inputs_require, body_outputs_require); + setRequiresGrad(body->param_node()->outputs().slice(1), body_inputs_require); + PropagateRequiresGrad(body); + body_outputs_require = fmap(body->return_node()->inputs().slice(1), getRequiresGrad); + } + + setRequiresGrad(node, body_outputs_require); + } else { + PropagateRequiresGradSimpleNode(node); + } +} + +void PropagateRequiresGrad(Block * block) { + for (Node * node : block->nodes()) { + PropagateRequiresGrad(node); + } +} + +} // anonymous namespace + +void PropagateRequiresGrad(std::shared_ptr& graph, const ArgumentSpec & spec) { + auto inputs = graph->inputs(); + JIT_ASSERT(spec.size() == inputs.size()); + for (size_t i = 0; i < spec.size(); ++i) { + auto & arg = spec.at(i); + if (!arg.isTensor()) continue; + setRequiresGrad(inputs[i], arg.requires_grad()); + } + PropagateRequiresGrad(graph->block()); +} + +}} // namespace torch::jit diff --git a/torch/csrc/jit/passes/requires_grad_analysis.h b/torch/csrc/jit/passes/requires_grad_analysis.h new file mode 100644 index 00000000000000..37e53461ef4b47 --- /dev/null +++ b/torch/csrc/jit/passes/requires_grad_analysis.h @@ -0,0 +1,15 @@ +#pragma once + +#include "torch/csrc/WindowsTorchApiMacro.h" + +#include + +namespace torch { namespace jit { + +struct Graph; +struct ArgumentSpec; + +TORCH_API void PropagateRequiresGrad(std::shared_ptr& graph, const ArgumentSpec & spec); + +}} + diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 4ffd16ff24c95a..7a652970243f34 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -26,6 +26,7 @@ using Catch::StartsWith; #include "torch/csrc/utils/hash.h" #include "torch/csrc/jit/argument_spec.h" #include "torch/csrc/jit/passes/shape_analysis.h" +#include "torch/csrc/jit/passes/requires_grad_analysis.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/passes/lower_grad_of.h" #include "torch/csrc/jit/operator.h" @@ -34,7 +35,6 @@ using Catch::StartsWith; #include "torch/csrc/autograd/variable.h" #include "torch/csrc/autograd/engine.h" 
-#include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/script/compiler.h" @@ -618,7 +618,7 @@ void testADFormulas() { // Trace and differentiate the op auto graph = trace(test, vars_in); EliminateDeadCode(graph); // Tracing of some ops depends on the DCE trick - auto grad_spec = differentiate(graph, std::vector(vars_in.size(), true)); + auto grad_spec = differentiate(graph); LowerGradOf(*grad_spec.df); // Get outputs from the interpreter auto tensors_in = fmap(vars_in, unwrap); @@ -651,7 +651,7 @@ void testDifferentiate(std::ostream & out) { auto c = a * b * a + b; graph->registerOutput(c.value()); - auto grad_spec = differentiate(graph, {true, true}); + auto grad_spec = differentiate(graph); std::vector expected_captured_inputs = {0, 1}; std::vector expected_captured_outputs = {1}; std::vector expected_input_vjps = {0, 1}; @@ -668,19 +668,22 @@ void testDifferentiate(std::ostream & out) { } void testDifferentiateWithRequiresGrad(std::ostream & out) { - auto graph = std::make_shared(); - at::ScalarType s = at::ScalarType::Float; - auto type = CompleteTensorType::create(s, -1, {2, 3, 4}, {12, 4, 1}); - // Build up a fake graph - auto a = SymbolicVariable::asNewInput(*graph, type); - auto b = SymbolicVariable::asNewInput(*graph, type); + auto graph = std::make_shared(); + auto a = SymbolicVariable::asNewInput(*graph); + auto b = SymbolicVariable::asNewInput(*graph); auto d = b * b + b; auto e = (d + a) * a + b; graph->registerOutput(d.value()); graph->registerOutput(e.value()); - auto grad_spec = differentiate(graph, {true, false}); + auto a_var = autograd::make_variable(at::CPU(at::kFloat).tensor(2, 2), true); + auto b_var = autograd::make_variable(at::CPU(at::kFloat).tensor(2, 2), false); + ArgumentSpec spec (true, {a_var, b_var}); + PropagateInputShapes(*graph, spec); + PropagateRequiresGrad(graph, spec); + + auto grad_spec = differentiate(graph); std::vector expected_input_vjps = {1, 2}; // for e and %4 = (d + a) std::vector expected_output_vjps = {0}; // only a requires grad REQUIRE(grad_spec.f_real_outputs == 2); // we need one temporary %4 = (d + a) diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index e812f2e02fdee5..166dfd6e06b7d3 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -10,6 +10,7 @@ #include #include +#include namespace torch { namespace jit { @@ -51,13 +52,20 @@ struct cloneType { template struct cloneType { std::shared_ptr operator()(std::shared_ptr ptr) const { - return std::make_shared(*ptr); + auto result = std::make_shared::type>(*ptr); + // XXX: the line above will correctly slice the struct, and make its runtype + // type exactly equal to T. However, kind_ is a field of Type, so it will simply + // be copied, and we need to fix it in here to match the dynamic type. + result->kind_ = T::Kind; + return result; } }; struct TORCH_API Type : std::enable_shared_from_this { private: TypeKind kind_; + template + friend struct cloneType; protected: Type(TypeKind kind) @@ -85,6 +93,8 @@ struct TORCH_API Type : std::enable_shared_from_this { return kind_; } + virtual bool requires_grad() const { return false; } + // Dynamically cast this object to the subclass indicated by the // template variable, returning nullptr if the cast is invalid. // NOTE: if the cast succeeds, but the casted kind is not the @@ -138,6 +148,8 @@ struct TORCH_API DynamicType : public Type { return DynamicTypePtr(new DynamicType( std::forward(all)... 
)); // NOLINT(modernize-make-shared) } + bool requires_grad() const override { return true; } + bool operator==(const Type& rhs) const override { return rhs.kind() == kind(); } @@ -168,6 +180,7 @@ struct TORCH_API TensorType : public Type { at::ScalarType scalarType() const { return scalar_type_; } int device() const { return device_; } int dim() const { return dim_; } + bool requires_grad() const override { return requires_grad_; } TensorTypePtr toScalarType(at::ScalarType type){ auto t = TensorType::create(*this); @@ -179,6 +192,11 @@ struct TORCH_API TensorType : public Type { t->dim_ = new_dim; return t; } + TensorTypePtr withRequiresGrad(bool req) { + auto t = TensorType::create(*this); + t->requires_grad_ = req; + return t; + } bool operator==(const Type& rhs) const override { if (rhs.kind() != TypeKind::TensorType) @@ -201,14 +219,20 @@ struct TORCH_API TensorType : public Type { protected: TensorType(const at::Tensor& tensor, TypeKind kind=TypeKind::TensorType) - : TensorType(tensor.type().scalarType(), tensor.type().is_cuda() ? tensor.get_device() : -1, tensor.dim(), kind) {} - TensorType(at::ScalarType scalar_type, int device, int dim, TypeKind kind=TypeKind::TensorType) + : TensorType(tensor.type().scalarType(), + tensor.type().is_cuda() ? tensor.get_device() : -1, + tensor.dim(), + tensor.is_variable() && tensor.requires_grad(), + kind) {} + TensorType(at::ScalarType scalar_type, int device, int dim, bool requires_grad=true, TypeKind kind=TypeKind::TensorType) : Type(kind) , scalar_type_(scalar_type) + , requires_grad_(at::isFloatingType(scalar_type) && requires_grad) , device_(device) , dim_(dim) {} at::ScalarType scalar_type_; + bool requires_grad_; int device_; int dim_; }; @@ -292,10 +316,10 @@ struct TORCH_API CompleteTensorType : public TensorType { : TensorType(tensor, TypeKind::CompleteTensorType) , sizes_(tensor.sizes().vec()) , strides_(tensor.strides().vec()) {} - CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes) - : CompleteTensorType(scalar_type, device, sizes, CompleteTensorType::contiguousStridesOf(sizes)) {} - CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides) - : TensorType(scalar_type, device, sizes.size(), TypeKind::CompleteTensorType) + CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes, bool requires_grad=true) + : CompleteTensorType(scalar_type, device, sizes, CompleteTensorType::contiguousStridesOf(sizes), requires_grad) {} + CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides, bool requires_grad=true) + : TensorType(scalar_type, device, sizes.size(), requires_grad, TypeKind::CompleteTensorType) , sizes_(sizes.vec()) , strides_(strides.vec()) {} @@ -349,10 +373,11 @@ struct TORCH_API ListType : public Type { static ListTypePtr ofTensors(); static ListTypePtr ofInts(); static ListTypePtr ofFloats(); + + static const TypeKind Kind = TypeKind::ListType; private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(std::move(elem)) {} - static const TypeKind Kind = TypeKind::ListType; TypePtr elem; }; @@ -401,11 +426,12 @@ struct TORCH_API TupleType : public Type { ss << "]"; return ss.str(); } + + static const TypeKind Kind = TypeKind::TupleType; private: TupleType(std::vector elements_) : Type(TypeKind::TupleType) , elements_(std::move(elements_)) {} - static const TypeKind Kind = TypeKind::TupleType; bool compare(const Type& rhs, std::function fn) const { if(rhs.kind() != kind()) From 
99c0b96f6891ddb62799466a9ab5c8068d2fe1d8 Mon Sep 17 00:00:00 2001 From: xhzhao Date: Thu, 13 Sep 2018 19:25:43 -0700 Subject: [PATCH 143/237] optimize norm on ATen CPU backend (#11565) Summary: current torch.norm() runs sequentially on CPU. This PR did parallelization and vectorization of torch.norm() on ATen CPU path, roughly provide 2 order of magnitude performance boost. Performance is benchmarks on Xeon skylake 8180, 2*28 cores 2.5GHz, using the following script: ```python import torch from time import time count = 1000 size = 1000*1000 def test_norm(p=2): a = torch.randn(size) tstart = time() for i in range(count): torch.norm(a, p) tend = time() print("norm on size %d tensor p = %d: %f s" % (size, p, (tend-tstart))) for p in range(4): test_norm(p) ``` without this optimization, ``` (intel-pytorch) [mingfeim@mlt-skx065 unit_tests]$ python test_norm.py norm on size 1000000 tensor p = 0: 1.071235 s norm on size 1000000 tensor p = 1: 1.069149 s norm on size 1000000 tensor p = 2: 1.068212 s norm on size 1000000 tensor p = 3: 69.735312 s ``` and with this optimization, ``` (pytorch-tf) [mingfeim@mlt-skx053 unit_tests]$ python test_norm.py norm on size 1000000 tensor p = 0: 0.127507 s norm on size 1000000 tensor p = 1: 0.011867 s norm on size 1000000 tensor p = 2: 0.011907 s norm on size 1000000 tensor p = 3: 0.014470 s ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11565 Differential Revision: D9804484 Pulled By: ezyang fbshipit-source-id: 52899f30ac26139d00684d07edfb47cb9b25d871 --- aten/src/ATen/native/LegacyBridge.cpp | 8 - aten/src/ATen/native/ReduceOps.cpp | 56 ++++++- aten/src/ATen/native/cpu/ReduceOpsKernel.cpp | 154 +++++++++++++++++++ aten/src/ATen/native/cpu/ReduceOpsKernel.h | 3 + 4 files changed, 208 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 1364c0cceb1847..2803112b7692bb 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -22,14 +22,6 @@ namespace { // TODO: Maybe the foo_ variants should call th_foo_ -Tensor norm(const Tensor & self, Scalar p) { - if (_has_native(self)) { - return native_norm(self, p); - } else { - return th_norm(self, p); - } -} - Tensor clone(const Tensor& self) { if (_has_native(self)) { return native_clone(self); diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 2c7e641dcbe843..c976121e77ae3f 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -19,6 +19,7 @@ namespace native { DEFINE_DISPATCH(sum_kernel); DEFINE_DISPATCH(prod_kernel); +DEFINE_DISPATCH(norm_kernel); static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); @@ -584,12 +585,23 @@ Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim) return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim); } -Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); - return at::native::norm_out(result, self, p, dim, keepdim); +Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + norm_kernel(kCPU, result, self, p, dim); + if (!keepdim) { + 
result.squeeze_(dim); + } + return result; + } else { + return at::_th_norm_out(result, self, p, dim, keepdim); + } } -Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { +Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); @@ -597,10 +609,44 @@ Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { return result; } else { - return at::_th_norm_out(result, self, p, dim, keepdim); + if (self.is_cuda()) { + return at::_th_norm_out(result, self, p, dim, keepdim); + } else { + return _norm_out_cpu(result, self, p, dim, keepdim); + } + } +} + +Tensor _norm(const Tensor &self, Scalar p) { + if (self.type().is_sparse()) { + return at::native_norm(self, p); + } else { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); + if (self.is_cuda()) { + return at::th_norm(self, p); + } else { + if (self.is_contiguous()) { + Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type()); + norm_kernel(kCPU, result, self, p, nullopt); + return result; + } else { + return at::th_norm(self, p); + } + } } } +Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::norm_out(result, self, p, dim, keepdim); +} + +Tensor norm(const Tensor& self, Scalar p) { + return at::native::_norm(self, p); +} + Tensor all(const Tensor& self, int64_t dim, bool keepdim) { Tensor result = self.type().tensor(); return at::native::all_out(result, self, dim, keepdim); diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index feea350fd08306..3be4ed4e06c28d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -183,9 +183,163 @@ static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional +struct NormReduction { + // reduction width in number of scalar elements + static constexpr int WIDTH = 128 / sizeof(scalar_t); + using Vec = Vec256; + + static void apply(Tensor& res, const Tensor& self, Scalar p, at::optional dim) { + auto out_ = res.data(); + auto data_ = self.data(); + auto numel = self.numel(); + float pval = 0.0; + if (p.isIntegral()){ + pval = p.to(); + } else if (p.isFloatingPoint()) { + pval = p.to(); + } + if (!dim.has_value()) { + *out_ = reduce_all(data_, numel, pval); + return; + } + int64_t n = self.size(*dim); + int64_t stride = self.stride(*dim); + // A contiguous tensor does not need to hold a meaningful stride + // if the corresponding size is 1 + if (n == 1) { + stride = 1; + for (int64_t i = self.ndimension() - 1; i > *dim; i--) { + stride *= self.size(i); + } + } + int64_t batch = numel / n; + parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) { + for (int64_t bi = begin; bi < end; bi++) { + int64_t b = bi / stride; + int64_t i = bi % stride; + const scalar_t* data = &data_[b * n * stride + i]; + out_[bi] = norm_reduce(data, n, stride, 
pval); + } + }); + } + + static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) { + scalar_t sum = parallel_reduce( + 0, + size, + internal::GRAIN_SIZE, + (scalar_t)0, + [=](int64_t begin, int64_t end, scalar_t init) { + const scalar_t* data = &data_[begin]; + int64_t n = end - begin; + scalar_t result = norm_reduce(data, n, 1, pval); + return result; + }, + std::plus()); + return sum; + } + + static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) { + int64_t n_rounded = round_down(n, WIDTH); + scalar_t result1 = norm_reduce128(data, n_rounded, pval); + scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval); + result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval); + } else { + result = norm_reduce_sequential(data, n, stride, pval); + } + return result; + } + + static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (pval == 0) { + for (int64_t k = 0; k < n; k++) { + result += (data[k * stride] != 0.0); + } + } else if (pval == 1) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride]); + } + } else if (pval == 2) { + for (int64_t k = 0; k < n; k++) { + result += data[k * stride] * data[k * stride]; + } + result = std::sqrt(result); + } else if (pval == 3) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]); + } + result = std::pow(result, 1.0/3); + } else if (std::isinf(pval)) { + for (int64_t k = 0; k < n; k++) { + result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result; + } + result = result; + } else { + for (int64_t k = 0; k < n; k++) { + result += std::pow(std::abs(data[k * stride]), pval); + } + result = std::pow(result, 1.0/pval); + } + return result; + } + + // Reduce down a column of WIDTH elements (128 bytes) with the given number n + // n is already rounded by 128 + static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) { + scalar_t result = 0.0; + Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines) + static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); + int64_t rows = n / WIDTH; + if (pval == 1){ + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val.abs(); + } + } + } + else if (pval == 2) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val * val; + } + } + } + else if (pval == 3) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + (val * val * val).abs(); + } + } + } + scalar_t buf[WIDTH] = {0}; + for (int j = 0; j != 4; j++) { + acc[j].store(&buf[j * Vec::size]); + } + for (int i = 0; i < WIDTH; i++) { + result += buf[i]; + } + result = std::pow(result, 1.0/pval); + return result; + } +}; + +static void norm_kernel_impl(Tensor& result, const Tensor& self, Scalar p, at::optional dim) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] { + NormReduction::apply(result, self, p, dim); + }); +} + } // anonymous namespace REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl); REGISTER_DISPATCH(prod_kernel, 
&prod_kernel_impl);
+REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);
 
 }} // namespace at::native
diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h
index 4c5c8c15149a1b..5fc7c60ff2803d 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h
@@ -11,4 +11,7 @@ using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional<int64_t>);
 DECLARE_DISPATCH(reduce_fn, sum_kernel);
 DECLARE_DISPATCH(reduce_fn, prod_kernel);
 
+using reduce_norm_fn = void(*)(Tensor &, const Tensor &, Scalar, at::optional<int64_t>);
+DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);
+
 }} // namespace at::native

From 2431eac7c011afe42d4c22b8b3f46dedae65e7c0 Mon Sep 17 00:00:00 2001
From: Fritz Obermeyer
Date: Thu, 13 Sep 2018 19:45:02 -0700
Subject: [PATCH 144/237] Ensure most Distribution methods are jittable (#11560)

Summary:
This adds tests in test/test_distributions.py to ensure that all methods of
`Distribution` objects are jittable.

I've replaced a few samplers with jittable versions (sketched below):
- `.uniform_()` -> `torch.rand()`
- `.exponential_()` -> `-(-torch.rand()).log1p()`
- `.normal_()` -> `torch.normal(torch.zeros(...), torch.ones(...), ...)`

Some jit failures remain and are marked in test_distributions.py:
- `Cauchy` and `HalfCauchy` do not support sampling due to missing `.cauchy_()`
- `Binomial` does not support `.enumerate_support()` due to `arange` ignoring its first arg.
- `MultivariateNormal`, `LowRankMultivariateNormal` do not support `.mean`, `.entropy`

- [x] Currently some tests fail (I've skipped those) due to unavailability of `aten::uniform` and `aten::cauchy` in the jit. Can someone suggest how to add these? I tried to add declarations to `torch/csrc/ir.cpp` and `torch/csrc/passes/shape_analysis.cpp`, but that resulted in "Couldn't find operator" errors.
- [x] There are still lots of `TracerWarning`s that something doesn't match something. I'm not sure whether these are real.
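A rough sketch of the three sampler rewrites listed above (illustrative only, not part of this patch; `shape`, `dtype` and `device` stand in for a distribution's own attributes):

```python
import torch

shape, dtype, device = (3,), torch.float32, torch.device("cpu")

# .uniform_() replacement: out-of-place U(0, 1) draw that the tracer can record
u = torch.rand(shape, dtype=dtype, device=device)

# .exponential_() replacement: inverse-CDF transform of a uniform draw;
# log1p(-u) = log(1 - u) keeps precision when u is close to 0
e = -(-u).log1p()

# .normal_() replacement: out-of-place standard normal draw
n = torch.normal(torch.zeros(shape, dtype=dtype, device=device),
                 torch.ones(shape, dtype=dtype, device=device))
```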
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11560 Differential Revision: D9816327 Pulled By: apaszke fbshipit-source-id: 72ec998ea13fc4c76d1ed003d9502e0fbaf728b8 --- test/test_distributions.py | 288 +++++++++++++++++- torch/distributions/binomial.py | 6 +- torch/distributions/constraint_registry.py | 2 + torch/distributions/distribution.py | 4 +- torch/distributions/exponential.py | 4 + torch/distributions/geometric.py | 7 +- torch/distributions/laplace.py | 4 + .../lowrank_multivariate_normal.py | 7 +- torch/distributions/multivariate_normal.py | 4 +- torch/distributions/normal.py | 4 +- torch/distributions/one_hot_categorical.py | 8 +- torch/distributions/relaxed_bernoulli.py | 2 +- torch/distributions/relaxed_categorical.py | 6 +- torch/distributions/studentT.py | 10 +- torch/distributions/uniform.py | 2 +- torch/distributions/utils.py | 8 + 16 files changed, 343 insertions(+), 23 deletions(-) diff --git a/test/test_distributions.py b/test/test_distributions.py index 60bcec1605b6e5..61d5e57d910b54 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -647,7 +647,7 @@ def _gradcheck_log_prob(self, dist_ctor, ctor_params): distribution = dist_ctor(*ctor_params) s = distribution.sample() if s.is_floating_point(): - s.detach_().requires_grad_() + s = s.detach().requires_grad_() expected_shape = distribution.batch_shape + distribution.event_shape self.assertEqual(s.size(), expected_shape) @@ -3971,6 +3971,55 @@ def test_transformed_distribution_shapes(self): except NotImplementedError: continue + def test_jit_fwd(self): + for transform in self.unique_transforms: + x = torch.tensor(self._generate_data(transform), requires_grad=True) + + def f(x): + return transform(x) + + try: + traced_f = torch.jit.trace(f, (x,)) + except NotImplementedError: + continue + + # check on different inputs + x = torch.tensor(self._generate_data(transform), requires_grad=True) + self.assertEqual(f(x), traced_f(x)) + + def test_jit_inv(self): + for transform in self.unique_transforms: + y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + + def f(y): + return transform.inv(y) + + try: + traced_f = torch.jit.trace(f, (y,)) + except NotImplementedError: + continue + + # check on different inputs + y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + self.assertEqual(f(y), traced_f(y)) + + def test_jit_jacobian(self): + for transform in self.unique_transforms: + x = torch.tensor(self._generate_data(transform), requires_grad=True) + + def f(x): + y = transform(x) + return transform.log_abs_det_jacobian(x, y) + + try: + traced_f = torch.jit.trace(f, (x,)) + except NotImplementedError: + continue + + # check on different inputs + x = torch.tensor(self._generate_data(transform), requires_grad=True) + self.assertEqual(f(x), traced_f(x)) + class TestConstraintRegistry(TestCase): def get_constraints(self, is_cuda=False): @@ -4092,5 +4141,242 @@ def tearDown(self): super(TestCase, self).tearDown() Distribution.set_default_validate_args(False) + +class TestJit(TestCase): + def _examples(self): + for Dist, params in EXAMPLES: + for param in params: + keys = param.keys() + values = tuple(param[key] for key in keys) + if not all(isinstance(x, torch.Tensor) for x in values): + continue + sample = Dist(**param).sample() + yield Dist, keys, values, sample + + def _perturb_tensor(self, value, constraint): + if isinstance(constraint, constraints._IntegerGreaterThan): + return value + 1 + if isinstance(constraint, constraints._PositiveDefinite): + 
return value + torch.eye(value.shape[-1]) + if value.dtype in [torch.float, torch.double]: + transform = transform_to(constraint) + delta = value.new(value.shape).normal_() + return transform(transform.inv(value) + delta) + if value.dtype == torch.long: + result = value.clone() + result[value == 0] = 1 + result[value == 1] = 0 + return result + raise NotImplementedError + + def _perturb(self, Dist, keys, values, sample): + with torch.no_grad(): + if Dist is Uniform: + param = dict(zip(keys, values)) + param['low'] = param['low'] - torch.rand(param['low'].shape) + param['high'] = param['high'] + torch.rand(param['high'].shape) + values = [param[key] for key in keys] + else: + values = [self._perturb_tensor(value, Dist.arg_constraints.get(key, constraints.real)) + for key, value in zip(keys, values)] + param = dict(zip(keys, values)) + sample = Dist(**param).sample() + return values, sample + + def test_sample(self): + for Dist, keys, values, sample in self._examples(): + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.sample() + + traced_f = torch.jit.trace(f, values, check_trace=False) + + # FIXME Schema not found for node + xfail = [ + Cauchy, # aten::cauchy(Double(2,1), float, float, Generator) + HalfCauchy, # aten::cauchy(Double(2, 1), float, float, Generator) + ] + if Dist in xfail: + continue + + with torch.random.fork_rng(): + sample = f(*values) + traced_sample = traced_f(*values) + self.assertEqual(sample, traced_sample) + + # FIXME no nondeterministic nodes found in trace + xfail = [Beta, Dirichlet] + if Dist not in xfail: + self.assertTrue(any(n.isNondeterministic() for n in traced_f.graph.nodes())) + + def test_rsample(self): + for Dist, keys, values, sample in self._examples(): + if not Dist.has_rsample: + continue + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.rsample() + + traced_f = torch.jit.trace(f, values, check_trace=False) + + # FIXME Schema not found for node + xfail = [ + Cauchy, # aten::cauchy(Double(2,1), float, float, Generator) + HalfCauchy, # aten::cauchy(Double(2, 1), float, float, Generator) + ] + if Dist in xfail: + continue + + with torch.random.fork_rng(): + sample = f(*values) + traced_sample = traced_f(*values) + self.assertEqual(sample, traced_sample) + + # FIXME no nondeterministic nodes found in trace + xfail = [Beta, Dirichlet] + if Dist not in xfail: + self.assertTrue(any(n.isNondeterministic() for n in traced_f.graph.nodes())) + + def test_log_prob(self): + for Dist, keys, values, sample in self._examples(): + # FIXME traced functions produce incorrect results + xfail = [LowRankMultivariateNormal, MultivariateNormal] + if Dist in xfail: + continue + + def f(sample, *values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.log_prob(sample) + + traced_f = torch.jit.trace(f, (sample,) + values) + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(sample, *values) + actual = traced_f(sample, *values) + self.assertEqual(expected, actual, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_enumerate_support(self): + for Dist, keys, values, sample in self._examples(): + # FIXME traced functions produce incorrect results + xfail = [Binomial] + if Dist in xfail: + continue + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.enumerate_support() + + try: + traced_f = torch.jit.trace(f, values) + except 
NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + self.assertEqual(expected, actual, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_mean(self): + for Dist, keys, values, sample in self._examples(): + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.mean + + try: + traced_f = torch.jit.trace(f, values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + expected[expected == float('inf')] = 0. + actual[actual == float('inf')] = 0. + self.assertEqual(expected, actual, allow_inf=True, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_variance(self): + for Dist, keys, values, sample in self._examples(): + if Dist in [Cauchy, HalfCauchy]: + continue # infinite variance + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.variance + + try: + traced_f = torch.jit.trace(f, values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + expected[expected == float('inf')] = 0. + actual[actual == float('inf')] = 0. + self.assertEqual(expected, actual, allow_inf=True, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_entropy(self): + for Dist, keys, values, sample in self._examples(): + # FIXME traced functions produce incorrect results + xfail = [LowRankMultivariateNormal, MultivariateNormal] + if Dist in xfail: + continue + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.entropy() + + try: + traced_f = torch.jit.trace(f, values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + self.assertEqual(expected, actual, allow_inf=True, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_cdf(self): + for Dist, keys, values, sample in self._examples(): + + def f(sample, *values): + param = dict(zip(keys, values)) + dist = Dist(**param) + cdf = dist.cdf(sample) + return dist.icdf(cdf) + + try: + traced_f = torch.jit.trace(f, (sample,) + values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(sample, *values) + actual = traced_f(sample, *values) + self.assertEqual(expected, actual, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + if __name__ == '__main__': run_tests() diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py index 5e168ffe1d7a2f..e1763c1c6ecbfd 100644 --- a/torch/distributions/binomial.py +++ b/torch/distributions/binomial.py @@ -100,7 +100,11 @@ def sample(self, sample_shape=torch.Size()): if self.total_count.min() != max_count: arange = torch.arange(max_count, dtype=self._param.dtype, device=self._param.device) mask = arange >= self.total_count.unsqueeze(-1) - bernoullis.masked_fill_(mask, 0.) 
+ if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .masked_fill_() + bernoullis[mask.expand(shape)] = 0. + else: + bernoullis.masked_fill_(mask, 0.) return bernoullis.sum(dim=-1) def log_prob(self, value): diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index f8688af3f3a392..0981a3ce85b3b6 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -152,7 +152,9 @@ def __call__(self, constraint): ################################################################################ @biject_to.register(constraints.real) +@biject_to.register(constraints.real_vector) @transform_to.register(constraints.real) +@transform_to.register(constraints.real_vector) def _transform_to_real(constraint): return transforms.identity_transform diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index 8d357a4e9371a0..726158b8694432 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -215,7 +215,9 @@ def _extended_shape(self, sample_shape=torch.Size()): Args: sample_shape (torch.Size): the size of the sample to be drawn. """ - return torch.Size(sample_shape + self._batch_shape + self._event_shape) + if not isinstance(sample_shape, torch.Size): + sample_shape = torch.Size(sample_shape) + return sample_shape + self._batch_shape + self._event_shape def _validate_sample(self, value): """ diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index a2fde7b382593c..41d7cd9f9787a5 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -51,6 +51,10 @@ def expand(self, batch_shape, _instance=None): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for ._exponential() + u = torch.rand(shape, dtype=self.rate.dtype, device=self.rate.device) + return -(-u).log1p() / self.rate return self.rate.new(shape).exponential_() / self.rate def log_prob(self, value): diff --git a/torch/distributions/geometric.py b/torch/distributions/geometric.py index a84963664b4cf3..7b9e796b0a9ac2 100644 --- a/torch/distributions/geometric.py +++ b/torch/distributions/geometric.py @@ -75,7 +75,12 @@ def probs(self): def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) with torch.no_grad(): - u = self.probs.new(shape).uniform_(_finfo(self.probs).tiny, 1) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .uniform_() + u = torch.rand(shape, dtype=self.probs.dtype, device=self.probs.device) + u = u.clamp(min=_finfo(self.probs).tiny) + else: + u = self.probs.new(shape).uniform_(_finfo(self.probs).tiny, 1) return (u.log() / (-self.probs).log1p()).floor() def log_prob(self, value): diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index cc5b079fbdbdcb..c15a35ce7c7db1 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -54,6 +54,10 @@ def expand(self, batch_shape, _instance=None): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .uniform_() + u = torch.rand(shape, dtype=self.loc.dtype, device=self.loc.device) * 2 - 1 + return self.loc - self.scale * u.sign() * torch.log1p(-u.abs().clamp(min=_finfo(self.loc).tiny)) u = 
self.loc.new(shape).uniform_(_finfo(self.loc).eps - 1, 1) # TODO: If we ever implement tensor.nextafter, below is what we want ideally. # u = self.loc.new(shape).uniform_(self.loc.nextafter(-.5, 0), .5) diff --git a/torch/distributions/lowrank_multivariate_normal.py b/torch/distributions/lowrank_multivariate_normal.py index 5e6239a69d0d37..4cc3d986c7a6c1 100644 --- a/torch/distributions/lowrank_multivariate_normal.py +++ b/torch/distributions/lowrank_multivariate_normal.py @@ -5,7 +5,7 @@ from torch.distributions.distribution import Distribution from torch.distributions.multivariate_normal import (_batch_diag, _batch_mahalanobis, _batch_mv, _batch_potrf_lower, _batch_trtrs_lower) -from torch.distributions.utils import lazy_property +from torch.distributions.utils import _standard_normal, lazy_property def _batch_vector_diag(bvec): @@ -169,8 +169,9 @@ def precision_matrix(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - eps_W = self.loc.new_empty(shape[:-1] + (self.cov_factor.size(-1),)).normal_() - eps_D = self.loc.new_empty(shape).normal_() + W_shape = shape[:-1] + self.cov_factor.shape[-1:] + eps_W = _standard_normal(W_shape, dtype=self.loc.dtype, device=self.loc.device) + eps_D = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) return self.loc + _batch_mv(self.cov_factor, eps_W) + self.cov_diag.sqrt() * eps_D def log_prob(self, value): diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index 855c727dc9b951..014a07e53c9532 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -3,7 +3,7 @@ import torch from torch.distributions import constraints from torch.distributions.distribution import Distribution -from torch.distributions.utils import lazy_property +from torch.distributions.utils import _standard_normal, lazy_property def _batch_mv(bmat, bvec): @@ -194,7 +194,7 @@ def variance(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - eps = self.loc.new_empty(shape).normal_() + eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) return self.loc + _batch_mv(self._unbroadcasted_scale_tril, eps) def log_prob(self, value): diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 98a5318d7b9829..a125806108e8cd 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -4,7 +4,7 @@ import torch from torch.distributions import constraints from torch.distributions.exp_family import ExponentialFamily -from torch.distributions.utils import broadcast_all +from torch.distributions.utils import _standard_normal, broadcast_all class Normal(ExponentialFamily): @@ -64,7 +64,7 @@ def sample(self, sample_shape=torch.Size()): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - eps = self.loc.new(shape).normal_() + eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) return self.loc + eps * self.scale def log_prob(self, value): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index dc37b4c7fabb8e..5165ed6768bd89 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -76,11 +76,15 @@ def param_shape(self): def sample(self, sample_shape=torch.Size()): sample_shape = torch.Size(sample_shape) probs = self._categorical.probs - one_hot = 
probs.new(self._extended_shape(sample_shape)).zero_() indices = self._categorical.sample(sample_shape) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .scatter_() + eye = torch.eye(self.event_shape[-1], dtype=self._param.dtype, device=self._param.device) + return eye[indices] + one_hot = probs.new_zeros(self._extended_shape(sample_shape)) if indices.dim() < one_hot.dim(): indices = indices.unsqueeze(-1) - return one_hot.scatter_(-1, indices, 1) + return one_hot.scatter_(-1, indices, 1.) def log_prob(self, value): if self._validate_args: diff --git a/torch/distributions/relaxed_bernoulli.py b/torch/distributions/relaxed_bernoulli.py index 643fddc2c96b68..f885bfea2bc422 100644 --- a/torch/distributions/relaxed_bernoulli.py +++ b/torch/distributions/relaxed_bernoulli.py @@ -78,7 +78,7 @@ def param_shape(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) probs = clamp_probs(self.probs.expand(shape)) - uniforms = clamp_probs(self.probs.new(shape).uniform_()) + uniforms = clamp_probs(torch.rand(shape, dtype=probs.dtype, device=probs.device)) return (uniforms.log() - (-uniforms).log1p() + probs.log() - (-probs).log1p()) / self.temperature def log_prob(self, value): diff --git a/torch/distributions/relaxed_categorical.py b/torch/distributions/relaxed_categorical.py index a43d631bec9b2b..59f5b5998a12d3 100644 --- a/torch/distributions/relaxed_categorical.py +++ b/torch/distributions/relaxed_categorical.py @@ -66,8 +66,8 @@ def probs(self): return self._categorical.probs def rsample(self, sample_shape=torch.Size()): - sample_shape = torch.Size(sample_shape) - uniforms = clamp_probs(self.logits.new(self._extended_shape(sample_shape)).uniform_()) + shape = self._extended_shape(sample_shape) + uniforms = clamp_probs(torch.rand(shape, dtype=self.logits.dtype, device=self.logits.device)) gumbels = -((-(uniforms.log())).log()) scores = (self.logits + gumbels) / self.temperature return scores - scores.logsumexp(dim=-1, keepdim=True) @@ -77,7 +77,7 @@ def log_prob(self, value): if self._validate_args: self._validate_sample(value) logits, value = broadcast_all(self.logits, value) - log_scale = (self.temperature.new(self.temperature.shape).fill_(K).lgamma() - + log_scale = (self.temperature.new_tensor(float(K)).lgamma() - self.temperature.log().mul(-(K - 1))) score = logits - value.mul(self.temperature) score = (score - score.logsumexp(dim=-1, keepdim=True)).sum(-1) diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py index cefa4684298261..6530940b328e7f 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -1,11 +1,11 @@ +import math from numbers import Number + import torch from torch._six import inf, nan -import math -from torch.distributions import constraints +from torch.distributions import Chi2, constraints from torch.distributions.distribution import Distribution -from torch.distributions import Chi2 -from torch.distributions.utils import broadcast_all +from torch.distributions.utils import _standard_normal, broadcast_all class StudentT(Distribution): @@ -65,7 +65,7 @@ def rsample(self, sample_shape=torch.Size()): # Z ~ Chi2(df) # Y = X / sqrt(Z / df) ~ StudentT(df) shape = self._extended_shape(sample_shape) - X = self.df.new(shape).normal_() + X = _standard_normal(shape, dtype=self.df.dtype, device=self.df.device) Z = self._chi2.rsample(sample_shape) Y = X * torch.rsqrt(Z / self.df) return self.loc + self.scale * Y diff --git a/torch/distributions/uniform.py 
b/torch/distributions/uniform.py index ce9cf496656346..071f086d782673 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -64,7 +64,7 @@ def support(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - rand = self.low.new(shape).uniform_() + rand = torch.rand(shape, dtype=self.low.dtype, device=self.low.device) return self.low + rand * (self.high - self.low) def log_prob(self, value): diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 0219942aac155a..1fcc7278e3f481 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -65,6 +65,14 @@ def broadcast_all(*values): return torch.broadcast_tensors(*values) +def _standard_normal(shape, dtype, device): + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .normal_() + return torch.normal(torch.zeros(shape, dtype=dtype, device=device), + torch.ones(shape, dtype=dtype, device=device)) + return torch.empty(shape, dtype=dtype, device=device).normal_() + + def _sum_rightmost(value, dim): r""" Sum out ``dim`` many rightmost dimensions of a given tensor. From e6fe8d9cf55c37fdbf1b1aa50eb6a2b448b6d3ee Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 20:10:21 -0700 Subject: [PATCH 145/237] Try to delete codeowners for ATen/core (#10693) Summary: Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/10693 Reviewed By: soumith Differential Revision: D9772210 Pulled By: ezyang fbshipit-source-id: 14560eaf77441980e9784536acd0ffe20b15c5b8 --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 113be035c9b99f..5723d6ebe5c058 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ # Each line is a file pattern followed by one or more owners. /aten/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang +/aten/src/ATen/core/ /torch/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang /docs/source @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ssnl @zou3519 /docs/cpp @goldsborough @ebetica @apaszke @soumith @colesbury @gchanan @zdevito @ezyang From 1637729620599706a8647f32fd375f623c4b7d9a Mon Sep 17 00:00:00 2001 From: zrphercule Date: Thu, 13 Sep 2018 20:15:55 -0700 Subject: [PATCH 146/237] Fix ci by skipping some tests (#11668) Summary: scalar_tensor_test skipped Pull Request resolved: https://github.com/pytorch/pytorch/pull/11668 Differential Revision: D9825819 Pulled By: zrphercule fbshipit-source-id: 6e62a001bcde49be8f7af1501b303bd93d09d005 --- .jenkins/caffe2/test.sh | 9 ++++++++- .jenkins/pytorch/test.sh | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 6ca0850f940968..ac29672c8e2ced 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -62,7 +62,14 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do */mkl_utils_test|*/aten/integer_divider_test) continue ;; - *) + */scalar_tensor_test|*/basic|*/native_test) + if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + continue + else + "$test" + fi + ;; + *) # Currently, we use a mixture of gtest (caffe2) and Catch2 (ATen). 
While # planning to migrate to gtest as the common PyTorch c++ test suite, we # currently do NOT use the xml test reporter, because Catch doesn't diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index bc22fa7a36070a..471fd8fac1fc6e 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -90,6 +90,8 @@ test_python_all_except_nn() { test_aten() { # Test ATen + # The following test(s) of ATen have already been skipped by caffe2 in rocm environment: + # scalar_tensor_test, basic, native_test if ([[ "$BUILD_ENVIRONMENT" != *asan* ]] && [[ "$BUILD_ENVIRONMENT" != *rocm* ]]); then echo "Running ATen tests with pytorch lib" TORCH_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/torch/lib From c5f7da3f4ae289bf110b5b0b6605d0c51e0bc86a Mon Sep 17 00:00:00 2001 From: Jiyan Yang Date: Fri, 14 Sep 2018 02:36:26 -0700 Subject: [PATCH 147/237] Support FP16 sparse lookup (#11674) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11674 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11658 Reviewed By: hyuen Differential Revision: D9676950 fbshipit-source-id: 89a115b9664b84e4e4436b7da033e5a428c2246d --- caffe2/python/layers/sparse_lookup.py | 33 ++++++++++++++++++++++++--- caffe2/python/optimizer.py | 31 +++++++++++++++++++------ 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index d96739b66c865b..4c3661b284dc3c 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -28,6 +28,12 @@ def get_sparse_lookup_predictor_version(version): return version +def get_sparse_lookup_trainer_version(version): + assert version in {'fp32', 'fp16'},\ + "Unexpected version of sparse_lookup layer {0}".format(version) + return version + + def _is_id_list(input_record): return schema.equal_schemas(input_record, IdList) @@ -72,10 +78,12 @@ def __init__(self, model, input_record, inner_shape, reducer, "{} should have categorical limit > 0, but got {}".format( get_key(input_record)(), input_dim)) - scale = math.sqrt(1.0 / input_dim) + self.input_dim = input_dim self.shape = [input_dim] + inner_shape - self.weight_init = weight_init if weight_init else ( - 'UniformFill', {'min': -scale, 'max': scale}) + + default_init_op = self._get_default_init_op() + + self.weight_init = weight_init or default_init_op if _is_id_list(self.input_record): sparse_key = self.input_record.items() @@ -141,6 +149,25 @@ def get_8bits_compatible_parameters(self, fused=True): ) return [RowwiseQuantized8BitsWeight(self.w, self.scale_bias)] + def _get_default_init_op(self): + scale = math.sqrt(1.0 / self.input_dim) + + cur_scope = get_current_scope() + trainer_version = get_sparse_lookup_trainer_version( + **cur_scope.get(get_sparse_lookup_trainer_version.__name__, + {'version': 'fp32'})) + + if trainer_version == 'fp32': + default_weight_init = ('UniformFill', {'min': -scale, 'max': scale}) + elif trainer_version == 'fp16': + default_weight_init = ("Float16UniformFill", {'min': -scale, 'max': scale}) + else: + raise NotImplementedError( + "Train version {} is not currently supported".format(trainer_version) + ) + + return default_weight_init + def _gather_wrapper(self, net, version, in_indices, out): # Gather can work on all kinds of input data types, and output # data with the same type. 
Convert the output of Gather to float, diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 2ebcf1d92a1240..a48502c68a3cd6 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -8,6 +8,8 @@ from collections import namedtuple, defaultdict from past.builtins import basestring +import logging + import numpy as np from caffe2.python import core, scope, utils, workspace @@ -20,6 +22,8 @@ AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) _optimizer_instance_count = defaultdict(int) +logger = logging.getLogger(__name__) + class Optimizer(object): def __init__(self): @@ -554,6 +558,8 @@ def _run(self, net, param_init_net, param_info): ) if self.rowWise: + assert self.engine == "SIMD", "Got {}".format(self.engine) + shapes, types = workspace.InferShapesAndTypes([param_init_net]) if str(param) not in shapes: # Type/shape inference is not available for this param, fallback @@ -577,13 +583,24 @@ def _run(self, net, param_init_net, param_info): shape=[shapes[str(param)][0]], value=0.0 ) - else: - param_squared_sum = param_init_net.ConstantFill( - [param], - str(param) + "_squared_sum", - value=0.0 - ) + if self.engine == "SIMD_Q_FP16" or self.engine == "SIMD_Q_STOC_FP16": + shapes, types = workspace.InferShapesAndTypes([param_init_net]) + assert str(param) in shapes, shapes + shape = shapes[str(param)] + + param_squared_sum = param_init_net.Float16ConstantFill( + [], + str(param) + "_squared_sum", + value=0.0, + shape=shape, + ) + else: + param_squared_sum = param_init_net.ConstantFill( + [param], + str(param) + "_squared_sum", + value=0.0 + ) self._aux_params.local.append(param_squared_sum) @@ -604,7 +621,7 @@ def _run(self, net, param_init_net, param_info): [param, param_squared_sum, grad.indices, grad.values, lr], [param, param_squared_sum], epsilon=self.epsilon, - engine=self.engine + engine=self.engine, ) else: output_args = [param, param_squared_sum] From 19065f91fc71116ab639b27fb1d25026d4d23191 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 07:34:06 -0700 Subject: [PATCH 148/237] Centralize TypeExtendedInterface casts. (#11576) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11576 Previously, they were spattered throughout the codebase. We now follow this convention: - LegacyTypeDispatch gives you Type - Context gives you TypeExtendedInterface - Tensor::type() gives you Type - at::getType() gives you TypeExtendedInterface I change some sites to use getType() over type(). 
Reviewed By: SsnL Differential Revision: D9790187 fbshipit-source-id: 5e2577cb590a5bbf5df530f3763d3b3c0b4625ca --- aten/src/ATen/Context.cpp | 9 ++++-- aten/src/ATen/Context.h | 36 +++++++++++++---------- aten/src/ATen/function_wrapper.py | 2 +- aten/src/ATen/native/LegacyBridge.cpp | 16 +++++----- aten/src/ATen/native/TensorFactories.cpp | 4 +-- aten/src/ATen/templates/Functions.h | 7 ++--- caffe2/contrib/aten/aten_op_template.h | 2 +- caffe2/contrib/aten/gen_op.py | 2 +- tools/autograd/templates/VariableType.cpp | 8 ++--- tools/autograd/templates/VariableType.h | 4 +-- 10 files changed, 48 insertions(+), 42 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 40cb364e91fd0c..cb96814927130c 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -12,6 +12,7 @@ #include "ATen/CPUGenerator.h" #include "ATen/RegisterCPU.h" +#include "ATen/Tensor.h" #include "TH/TH.h" // for USE_LAPACK @@ -107,17 +108,21 @@ bool Context::setFlushDenormal(bool on) { #endif } -Type& getType(TensorOptions options) { +TypeExtendedInterface& getType(TensorOptions options) { return globalContext().getType( options.backend(), options.dtype(), options.is_variable()); } -Type& getType(const TensorImpl* impl) { +TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( backend, impl->scalar_type(), impl->is_variable()); } +TypeExtendedInterface& getType(const Tensor& t) { + return getType(t.unsafeGetTensorImpl()); +} + Allocator* getCPUAllocator() { return getTHDefaultAllocator(); } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 7b3634dd83086f..4e147cffabbe86 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -5,6 +5,7 @@ #include "ATen/CUDAStream.h" #include "ATen/core/Generator.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Utils.h" #include "ATen/core/Error.h" #include "ATen/detail/CUDAHooksInterface.h" @@ -21,23 +22,25 @@ namespace at { +struct Tensor; + class AT_API Context { public: Context(); - Type* getNonVariableTypeRaw(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s); + TypeExtendedInterface* getNonVariableTypeRaw(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s)); } - Type * getNonVariableTypeOpt(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s); + TypeExtendedInterface * getNonVariableTypeOpt(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s)); } - Type & getNonVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableType(p, s); + TypeExtendedInterface & getNonVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableType(p, s)); } - Type & getVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getVariableType(p, s); + TypeExtendedInterface & getVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getVariableType(p, s)); } - Type & getType(Backend p, ScalarType s, bool is_variable) { - return globalLegacyTypeDispatch().getType(p, s, is_variable); + TypeExtendedInterface & getType(Backend p, ScalarType s, bool is_variable) { + return static_cast(globalLegacyTypeDispatch().getType(p, s, is_variable)); } // The passed in Type must be delete'able // TODO: Just make it take a 
unique_ptr @@ -142,24 +145,25 @@ static inline void init() { } } -static inline Type& getNonVariableType(Backend p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(Backend p, ScalarType s) { return globalContext().getNonVariableType(p, s); } -static inline Type& getNonVariableType(DeviceType p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(DeviceType p, ScalarType s) { return globalContext().getNonVariableType(deviceTypeToBackend(p), s); } -AT_API Type& getType(TensorOptions options); -AT_API Type& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(TensorOptions options); +AT_API TypeExtendedInterface& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(const Tensor&); AT_API Allocator* getCPUAllocator(); -static inline Type& CPU(ScalarType s) { +static inline TypeExtendedInterface& CPU(ScalarType s) { return getNonVariableType(Backend::CPU, s); } -static inline Type& CUDA(ScalarType s) { +static inline TypeExtendedInterface& CUDA(ScalarType s) { return getNonVariableType(Backend::CUDA, s); } diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 266f9645a18527..e88dd8867a0e96 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -1151,7 +1151,7 @@ def find_formal(formal_name, formals): option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) else: # doesn't depend on a specific type, use undefined float - option['inferred_type'] = 'detail::non_specific_type()' + option['inferred_type'] = 'at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)' declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION top_env['function_declarations'].append(declaration.substitute(env)) if is_factory_method: diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 2803112b7692bb..0aaf2149b42a05 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -136,34 +136,34 @@ Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta Tensor tensor(const Type& dtype) { if (_type_has_native(dtype)) { - return static_cast(dtype).native_tensor(); + return at::getType(dtype.options()).native_tensor(); } else { - return static_cast(dtype).th_tensor(); + return at::getType(dtype.options()).th_tensor(); } } Tensor tensor(const Type& dtype, ArrayRef size) { if (_type_has_native(dtype)) { - return static_cast(dtype).native_tensor(size); + return at::getType(dtype.options()).native_tensor(size); } else { - return static_cast(dtype).th_tensor(size); + return at::getType(dtype.options()).th_tensor(size); } } Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return static_cast(dtype.toSparse()).native_sparse_coo_tensor(size); + return at::getType(dtype.options().layout(at::kSparse)).native_sparse_coo_tensor(size); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { - return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size) { - return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values, size); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); } Tensor 
_sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { - return static_cast(values.type().toSparse())._native_sparse_coo_tensor_unsafe(indices, values, size); + return at::getType(values.options().layout(at::kSparse))._native_sparse_coo_tensor_unsafe(indices, values, size); } int64_t get_device(const Tensor& self) { diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 20211aeedfa32b..178045d9fd0de4 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -60,7 +60,7 @@ void window_function_checks( } const TypeExtendedInterface& getFactoryType(const TensorOptions& options) { - return static_cast(at::getType(options)); + return at::getType(options); } } // namespace @@ -98,7 +98,7 @@ Tensor& arange_out(Tensor& result, Scalar end) { } Tensor _dim_arange(const Tensor& like, int64_t dim) { - return static_cast(like.type().toScalarType(at::kLong))._arange(like.size(dim)); + return at::getType(like.options().dtype(at::kLong))._arange(like.size(dim)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index 7e2b6580343be4..8bbc17af5da291 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -25,14 +25,11 @@ namespace detail { static inline TypeExtendedInterface & infer_type(const Tensor & t) { AT_CHECK(t.defined(), "undefined Tensor"); - return static_cast(t.type()); + return getType(t); } static inline TypeExtendedInterface & infer_type(const TensorList & tl) { AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); - return static_cast(tl[0].type()); -} -static inline TypeExtendedInterface & non_specific_type() { - return static_cast(at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)); + return getType(tl[0]); } } // namespace detail diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 8b12cc9dad16d8..583a2fa492647b 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -223,7 +223,7 @@ class ATenOp : public Operator { CAFFE_THROW("unsupported type annotation: ", name); } at::TypeExtendedInterface & stringToType(const std::string & name) { - return static_cast(at::getNonVariableType(backend(), stringToScalarType(name))); + return at::getNonVariableType(backend(), stringToScalarType(name)); } at::TypeExtendedInterface * readTypeAttribute(const std::string & name) { CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index bc75ac9859dcc5..70843bb0d91108 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -278,7 +278,7 @@ def find_factory_methods(decls): # first tensor input is used to define the output type. 
defined_inferred_type = True env['statements'].append( - 'auto inferred_type = &(static_cast({}.type()));'.format( + 'auto inferred_type = &at::getType({});'.format( arg['name'])) else: init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index c6a84afe710a89..6bd409ad0a4040 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -42,9 +42,9 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { -VariableType::VariableType(Context* context, Type* baseType) +VariableType::VariableType(Context* context, TypeExtendedInterface* baseType) : TypeDefault(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) - , baseType(static_cast(baseType)) + , baseType(baseType) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; } @@ -105,7 +105,7 @@ TypeID VariableType::ID() const { std::vector> type_to_variable_type; // XXX - this is not threadsafe with uses of Variables -void register_variable_type_for(Type* baseType) { +void register_variable_type_for(TypeExtendedInterface* baseType) { AT_ASSERT(baseType); size_t base_id = static_cast(baseType->ID()); if(type_to_variable_type.size() <= base_id) { @@ -166,7 +166,7 @@ REGISTER_VARIABLE_HOOKS(VariableHooks) // Pre-condition: backend/scalar_type is a valid type in the type_registry void VariableHooks::registerVariableTypeFor(at::LegacyTypeDispatch* context, at::Backend backend, at::ScalarType scalar_type) const { auto* baseType = context->getNonVariableTypeRaw(backend, scalar_type); - register_variable_type_for(baseType); + register_variable_type_for(static_cast(baseType)); } at::Type& VariableHooks::getVariableTypeFromBaseType(const at::Type& baseType) const { diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 19de1b910a0b69..446fb5b889f47a 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -31,10 +31,10 @@ using at::ScalarType; using at::optional; using at::Device; -void register_variable_type_for(at::Type* baseType); +void register_variable_type_for(at::TypeExtendedInterface* baseType); struct TORCH_API VariableType final : public at::TypeDefault { - VariableType(Context* context, at::Type* baseType); + VariableType(Context* context, at::TypeExtendedInterface* baseType); at::ScalarType scalarType() const override; virtual caffe2::TypeMeta typeMeta() const override; at::Backend backend() const override; From 74197c7115af427d041fd6783c9a95207b6603c5 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 07:36:16 -0700 Subject: [PATCH 149/237] Restore support for dim=None on WeightNorm. (#11661) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11661 Reviewed By: veenix Differential Revision: D9826799 Pulled By: ezyang fbshipit-source-id: 9eec57bb27a365406669e412f6eb88741b22ed3d --- test/test_nn.py | 6 ++++++ torch/nn/utils/weight_norm.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/test/test_nn.py b/test/test_nn.py index 03a35bf10d88f8..809b820c42202e 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1683,6 +1683,12 @@ def test_weight_norm(self): self.assertEqual(m.weight_g.size(), (1, 5)) self.assertEqual(m(input), expected_output) + # test with dim=None + m = nn.Linear(5, 7) + expected_output = m(input) + m = torch.nn.utils.weight_norm(m, dim=None) + self.assertEqual(m(input), expected_output) + def test_weight_norm_pickle(self): m = torch.nn.utils.weight_norm(nn.Linear(5, 7)) m = pickle.loads(pickle.dumps(m)) diff --git a/torch/nn/utils/weight_norm.py b/torch/nn/utils/weight_norm.py index ddefa3d6f5f544..a80ef02b411ea0 100644 --- a/torch/nn/utils/weight_norm.py +++ b/torch/nn/utils/weight_norm.py @@ -7,6 +7,8 @@ class WeightNorm(object): def __init__(self, name, dim): + if dim is None: + dim = -1 self.name = name self.dim = dim @@ -17,6 +19,9 @@ def compute_weight(self, module): @staticmethod def apply(module, name, dim): + if dim is None: + dim = -1 + fn = WeightNorm(name, dim) weight = getattr(module, name) From c391c2006304233eb5745f99bba850785c0297d6 Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Fri, 14 Sep 2018 07:49:47 -0700 Subject: [PATCH 150/237] Adding .expand method for TransformedDistribution (#11607) Summary: This PR: - adds a `.expand` method for `TransformedDistribution` along the lines of #11341. - uses this method to simplify `.expand` in distribution classes that subclass off of `TransformedDistribution`. - restores testing of `TransformedDistribution` fixtures. - fixes some bugs wherein we were not setting certain attributes in the expanded instances, and adds tests for `.mean` and `.variance` which use these attributes. There are many cases where users directly use `TransformedDistribution` rather than subclassing off it. In such cases, it seems rather inconvenient to have to write a separate class just to define a `.expand` method. The default implementation should suffice in these cases. cc. 
fritzo, vishwakftw, alicanb Pull Request resolved: https://github.com/pytorch/pytorch/pull/11607 Differential Revision: D9818225 Pulled By: soumith fbshipit-source-id: 2c4b3812b9a03e6985278cfce0f9a127ce536f23 --- test/common.py | 7 +++++++ test/test_distributions.py | 18 +++++++++++------- torch/distributions/chi2.py | 1 - torch/distributions/gumbel.py | 9 +++------ torch/distributions/half_cauchy.py | 6 +----- torch/distributions/half_normal.py | 6 +----- torch/distributions/log_normal.py | 6 +----- torch/distributions/logistic_normal.py | 9 +-------- torch/distributions/pareto.py | 7 +++---- torch/distributions/relaxed_bernoulli.py | 5 +---- torch/distributions/relaxed_categorical.py | 7 +------ .../distributions/transformed_distribution.py | 10 ++++++++++ torch/distributions/weibull.py | 6 ++++-- 13 files changed, 44 insertions(+), 53 deletions(-) diff --git a/test/common.py b/test/common.py index 04986fcf5af57b..c1cd7999d82901 100644 --- a/test/common.py +++ b/test/common.py @@ -350,6 +350,13 @@ def assertTensorsEqual(a, b): self.assertTrue(torch.equal(nan_mask, b != b), message) diff = a - b diff[nan_mask] = 0 + # inf check if allow_inf=True + if allow_inf: + inf_mask = (a == float("inf")) | (a == float("-inf")) + self.assertTrue(torch.equal(inf_mask, + (b == float("inf")) | (b == float("-inf"))), + message) + diff[inf_mask] = 0 # TODO: implement abs on CharTensor if diff.is_signed() and 'CharTensor' not in diff.type(): diff = diff.abs() diff --git a/test/test_distributions.py b/test/test_distributions.py index 61d5e57d910b54..8264337a681782 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -796,27 +796,33 @@ def test_has_examples(self): def test_distribution_expand(self): shapes = [torch.Size(), torch.Size((2,)), torch.Size((2, 1))] for Dist, params in EXAMPLES: - if Dist.__name__ == "TransformedDistribution": - continue for param in params: for shape in shapes: d = Dist(**param) expanded_shape = shape + d.batch_shape original_shape = d.batch_shape + d.event_shape expected_shape = shape + original_shape - expanded = d.expand(batch_shape=expanded_shape) + expanded = d.expand(batch_shape=list(expanded_shape)) sample = expanded.sample() actual_shape = expanded.sample().shape self.assertEqual(expanded.__class__, d.__class__) self.assertEqual(d.sample().shape, original_shape) self.assertEqual(expanded.log_prob(sample), d.log_prob(sample)) self.assertEqual(actual_shape, expected_shape) + self.assertEqual(expanded.batch_shape, expanded_shape) + try: + self.assertEqual(expanded.mean, + d.mean.expand(expanded_shape + d.event_shape), + allow_inf=True) + self.assertEqual(expanded.variance, + d.variance.expand(expanded_shape + d.event_shape), + allow_inf=True) + except NotImplementedError: + pass def test_distribution_subclass_expand(self): expand_by = torch.Size((2,)) for Dist, params in EXAMPLES: - if Dist.__name__ == "TransformedDistribution": - continue class SubClass(Dist): pass @@ -2221,8 +2227,6 @@ def test_independent_shape(self): def test_independent_expand(self): for Dist, params in EXAMPLES: - if Dist.__name__ == "TransformedDistribution": - continue for param in params: base_dist = Dist(**param) for reinterpreted_batch_ndims in range(len(base_dist.batch_shape) + 1): diff --git a/torch/distributions/chi2.py b/torch/distributions/chi2.py index fe8c5956fb5b68..7fdc5e8d0fd114 100644 --- a/torch/distributions/chi2.py +++ b/torch/distributions/chi2.py @@ -24,7 +24,6 @@ def __init__(self, df, validate_args=None): def expand(self, batch_shape, _instance=None): new = 
self._get_checked_instance(Chi2, _instance) - batch_shape = torch.Size(batch_shape) return super(Chi2, self).expand(batch_shape, new) @property diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index f6d10aa4b0efb9..b489c8754aa441 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -40,12 +40,9 @@ def __init__(self, loc, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(Gumbel, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - transforms = self.transforms - super(Gumbel, new).__init__(base_dist, transforms, validate_args=False) - new._validate_args = self._validate_args - return new + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + return super(Gumbel, self).expand(batch_shape, _instance=new) @property def mean(self): diff --git a/torch/distributions/half_cauchy.py b/torch/distributions/half_cauchy.py index 8b81ea8b8c862d..916cb47019a28f 100644 --- a/torch/distributions/half_cauchy.py +++ b/torch/distributions/half_cauchy.py @@ -35,11 +35,7 @@ def __init__(self, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(HalfCauchy, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - super(HalfCauchy, new).__init__(base_dist, AbsTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(HalfCauchy, self).expand(batch_shape, _instance=new) @property def scale(self): diff --git a/torch/distributions/half_normal.py b/torch/distributions/half_normal.py index d18cb3b971cd2c..00d0015231c2e6 100644 --- a/torch/distributions/half_normal.py +++ b/torch/distributions/half_normal.py @@ -35,11 +35,7 @@ def __init__(self, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(HalfNormal, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - super(HalfNormal, new).__init__(base_dist, AbsTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(HalfNormal, self).expand(batch_shape, _instance=new) @property def scale(self): diff --git a/torch/distributions/log_normal.py b/torch/distributions/log_normal.py index 274bcd83c47a83..38fe532bde31f7 100644 --- a/torch/distributions/log_normal.py +++ b/torch/distributions/log_normal.py @@ -33,11 +33,7 @@ def __init__(self, loc, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(LogNormal, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - super(LogNormal, new).__init__(base_dist, ExpTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(LogNormal, self).expand(batch_shape, _instance=new) @property def loc(self): diff --git a/torch/distributions/logistic_normal.py b/torch/distributions/logistic_normal.py index 909a30d3db8a7c..c1be0d2b086b46 100644 --- a/torch/distributions/logistic_normal.py +++ b/torch/distributions/logistic_normal.py @@ -41,14 +41,7 @@ def __init__(self, loc, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(LogisticNormal, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape + 
self.base_dist.batch_shape[-1:]) - super(LogisticNormal, new).__init__(base_dist, - StickBreakingTransform(), - validate_args=False) - new._event_shape = self._event_shape - new._validate_args = self._validate_args - return new + return super(LogisticNormal, self).expand(batch_shape, _instance=new) @property def loc(self): diff --git a/torch/distributions/pareto.py b/torch/distributions/pareto.py index 6fa499642065de..c860f07b13835d 100644 --- a/torch/distributions/pareto.py +++ b/torch/distributions/pareto.py @@ -30,10 +30,9 @@ def __init__(self, scale, alpha, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(Pareto, _instance) - base_dist = self.base_dist.expand(batch_shape) - super(Pareto, new).__init__(base_dist, self.transforms, validate_args=False) - new._validate_args = self._validate_args - return new + new.scale = self.scale.expand(batch_shape) + new.alpha = self.alpha.expand(batch_shape) + return super(Pareto, self).expand(batch_shape, _instance=new) @property def mean(self): diff --git a/torch/distributions/relaxed_bernoulli.py b/torch/distributions/relaxed_bernoulli.py index f885bfea2bc422..09897e1d34190f 100644 --- a/torch/distributions/relaxed_bernoulli.py +++ b/torch/distributions/relaxed_bernoulli.py @@ -120,10 +120,7 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(RelaxedBernoulli, _instance) - base_dist = self.base_dist.expand(batch_shape) - super(RelaxedBernoulli, new).__init__(base_dist, SigmoidTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(RelaxedBernoulli, self).expand(batch_shape, _instance=new) @property def temperature(self): diff --git a/torch/distributions/relaxed_categorical.py b/torch/distributions/relaxed_categorical.py index 59f5b5998a12d3..86406cd207ab52 100644 --- a/torch/distributions/relaxed_categorical.py +++ b/torch/distributions/relaxed_categorical.py @@ -116,12 +116,7 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(RelaxedOneHotCategorical, _instance) - base_dist = self.base_dist.expand(batch_shape) - super(RelaxedOneHotCategorical, new).__init__(base_dist, - ExpTransform(), - validate_args=False) - new._validate_args = self._validate_args - return new + return super(RelaxedOneHotCategorical, self).expand(batch_shape, _instance=new) @property def temperature(self): diff --git a/torch/distributions/transformed_distribution.py b/torch/distributions/transformed_distribution.py index 79e2c691638076..a7c49b4f7b1810 100644 --- a/torch/distributions/transformed_distribution.py +++ b/torch/distributions/transformed_distribution.py @@ -56,6 +56,16 @@ def __init__(self, base_distribution, transforms, validate_args=None): event_shape = shape[len(shape) - event_dim:] super(TransformedDistribution, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(TransformedDistribution, _instance) + batch_shape = torch.Size(batch_shape) + base_dist_batch_shape = batch_shape + self.base_dist.batch_shape[len(self.batch_shape):] + new.base_dist = self.base_dist.expand(base_dist_batch_shape) + new.transforms = self.transforms + super(TransformedDistribution, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = 
self._validate_args + return new + @constraints.dependent_property def support(self): return self.transforms[-1].codomain if self.transforms else self.base_dist.support diff --git a/torch/distributions/weibull.py b/torch/distributions/weibull.py index 0636a74d113f82..8b5afee400b780 100644 --- a/torch/distributions/weibull.py +++ b/torch/distributions/weibull.py @@ -36,12 +36,14 @@ def __init__(self, scale, concentration, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(Weibull, _instance) - batch_shape = torch.Size(batch_shape) new.scale = self.scale.expand(batch_shape) new.concentration = self.concentration.expand(batch_shape) + new.concentration_reciprocal = new.concentration.reciprocal() base_dist = self.base_dist.expand(batch_shape) + transforms = [PowerTransform(exponent=new.concentration_reciprocal), + AffineTransform(loc=0, scale=new.scale)] super(Weibull, new).__init__(base_dist, - self.transforms, + transforms, validate_args=False) new._validate_args = self._validate_args return new From cda71e260026b84990650809b5b5eae3a201742f Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Fri, 14 Sep 2018 07:52:21 -0700 Subject: [PATCH 151/237] Disallow scalar parameters in Dirichlet and Categorical (#11589) Summary: This adds a small check in `Dirichlet` and `Categorical` `__init__` methods to ensure that scalar parameters are not admissible. **Motivation** Currently, `Dirichlet` throws no error when provided with a scalar parameter, but if we `expand` a scalar instance, it inherits the empty event shape from the original instance and gives unexpected results. The alternative to this check is to promote `event_shape` to be `torch.Size((1,))` if the original instance was a scalar, but that seems to add a bit more complexity (and changes the behavior of `expand` in that it would affect the `event_shape` as well as the `batch_shape` now). Does this seem reasonable? cc. alicanb, fritzo. ```python In [4]: d = dist.Dirichlet(torch.tensor(1.)) In [5]: d.sample() Out[5]: tensor(1.0000) In [6]: d.log_prob(d.sample()) Out[6]: tensor(0.) In [7]: e = d.expand([3]) In [8]: e.sample() Out[8]: tensor([0.3953, 0.1797, 0.4250]) # interpreted as events In [9]: e.log_prob(e.sample()) Out[9]: tensor(0.6931) # wrongly summed out In [10]: e.batch_shape Out[10]: torch.Size([3]) In [11]: e.event_shape Out[11]: torch.Size([]) # cannot be empty ``` Additionally, based on review comments, this removes `real_vector` constraint. This was only being used in `MultivariateNormal`, but I am happy to revert this if we want to keep it around for backwards compatibility. 
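For concreteness, a minimal sketch of the behavior with the new checks in place (the error string below is the one introduced in the diff; shapes follow from the existing `batch_shape`/`event_shape` split, and `dist` is `torch.distributions` as above):

```python
import torch
import torch.distributions as dist

# Scalar parameters are now rejected up front instead of producing
# the ambiguous shapes shown above.
try:
    dist.Dirichlet(torch.tensor(1.))
except ValueError as e:
    print(e)  # `concentration` parameter must be at least one-dimensional.

# The supported spelling keeps the event dimension explicit.
d = dist.Dirichlet(torch.ones(3))
print(d.batch_shape, d.event_shape)  # torch.Size([]) torch.Size([3])
print(d.expand([2]).batch_shape)     # torch.Size([2]) -- expand is now unambiguous
```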
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11589 Differential Revision: D9818271 Pulled By: soumith fbshipit-source-id: f9bbba90ed6f04e0b5bdfa169e70ca20b280fc74 --- torch/distributions/categorical.py | 4 ++++ torch/distributions/dirichlet.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 2ad69cb1bacec1..fcfea47c2eadd5 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -45,8 +45,12 @@ def __init__(self, probs=None, logits=None, validate_args=None): if (probs is None) == (logits is None): raise ValueError("Either `probs` or `logits` must be specified, but not both.") if probs is not None: + if probs.dim() < 1: + raise ValueError("`probs` parameter must be at least one-dimensional.") self.probs = probs / probs.sum(-1, keepdim=True) else: + if logits.dim() < 1: + raise ValueError("`logits` parameter must be at least one-dimensional.") self.logits = logits - logits.logsumexp(dim=-1, keepdim=True) self._param = self.probs if probs is not None else self.logits self._num_events = self._param.size()[-1] diff --git a/torch/distributions/dirichlet.py b/torch/distributions/dirichlet.py index 94d66d60c0848c..f618628c9f68e7 100644 --- a/torch/distributions/dirichlet.py +++ b/torch/distributions/dirichlet.py @@ -54,7 +54,9 @@ class Dirichlet(ExponentialFamily): has_rsample = True def __init__(self, concentration, validate_args=None): - self.concentration, = broadcast_all(concentration) + if concentration.dim() < 1: + raise ValueError("`concentration` parameter must be at least one-dimensional.") + self.concentration = concentration batch_shape, event_shape = concentration.shape[:-1], concentration.shape[-1:] super(Dirichlet, self).__init__(batch_shape, event_shape, validate_args=validate_args) From 6c3792b9ecdcd94c799c2a18dbfd056093e109ac Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 14 Sep 2018 08:28:49 -0700 Subject: [PATCH 152/237] Implement UndefinedType::typeMeta. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11666 Differential Revision: D9816212 Pulled By: gchanan fbshipit-source-id: 079899590150009bc2e2a3bbdc78a98de9380e37 --- aten/src/ATen/UndefinedType.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 9c3457bdd4803f..8e9722eae3be09 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -9,7 +9,7 @@ ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } caffe2::TypeMeta UndefinedType::typeMeta() const { - AT_ERROR("typeMeta not defined for UndefinedType"); + return scalarTypeToTypeMeta(scalarType()); } Backend UndefinedType::backend() const { return Backend::Undefined; From 2631da0822149c19214dba94510e29abb5b25f75 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 14 Sep 2018 08:55:24 -0700 Subject: [PATCH 153/237] Move some Tensor method definitions from Type.h to TensorMethods.h. (#11650) Summary: There's no reason they need to be in Type.h and this moves us along the path of not having circular dependencies (so we can get rid of TensorMethods.h). 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11650 Reviewed By: ezyang Differential Revision: D9812271 Pulled By: gchanan fbshipit-source-id: 8b70db9a5eb0a332398ab2e8998eeaf7d2eea6d7 --- aten/src/ATen/core/TensorMethods.h | 16 ++++++++++++++++ aten/src/ATen/core/Type.h | 16 ---------------- aten/src/ATen/templates/TensorMethods.h | 16 ++++++++++++++++ aten/src/ATen/templates/Type.h | 16 ---------------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index aba169279e6d2b..ff85267e78fb81 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1215,6 +1215,22 @@ inline Scalar Tensor::_local_scalar() const { return type()._local_scalar(*this); } +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); +} + #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index e8b2e2d40980c2..dabd19c7837223 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -629,20 +629,4 @@ struct AT_API Type { }; -inline bool Tensor::is_variable() const noexcept { - return type().is_variable(); -} - -inline ScalarType Tensor::dtype() const noexcept { - return type().scalarType(); -} - -inline Layout Tensor::layout() const noexcept { - return type().layout(); -} - -inline Device Tensor::device() const { - return Device(type().device_type(), type().is_cuda() ? get_device() : -1); -} - } // namespace at diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 54fd80c5f13463..8283bea01f6bed 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -55,6 +55,22 @@ inline void Tensor::set_data(Tensor new_data) { // all static inline to allow for inlining of the non-dynamic part of dispatch ${tensor_method_definitions} +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); +} + #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index ef49b2880dcec7..8dda48749b0c97 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -143,20 +143,4 @@ struct AT_API Type { }; -inline bool Tensor::is_variable() const noexcept { - return type().is_variable(); -} - -inline ScalarType Tensor::dtype() const noexcept { - return type().scalarType(); -} - -inline Layout Tensor::layout() const noexcept { - return type().layout(); -} - -inline Device Tensor::device() const { - return Device(type().device_type(), type().is_cuda() ? 
get_device() : -1); -} - } // namespace at From 72822ee6b251118aefbe9af75ac0fcb4ee56a0ee Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 08:55:39 -0700 Subject: [PATCH 154/237] =?UTF-8?q?Fix=20#11430=20(CPU=20only=20builds=20r?= =?UTF-8?q?aise=20opaque=20error=20message=20when=20calling=20.=E2=80=A6?= =?UTF-8?q?=20(#11533)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: …cuda()) While I was at it, I audited all other ways I know how we might get a CUDA type from PyTorch and fixed more constructors which don't work. Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11533 Differential Revision: D9775786 Pulled By: ezyang fbshipit-source-id: cd07cdd375fdf74945539ec475a48bf08cbc0c17 --- test/test_torch.py | 9 +++++++++ tools/autograd/gen_python_functions.py | 4 ++-- .../templates/python_torch_functions.cpp | 14 +++++++------- .../templates/python_torch_functions_dispatch.h | 4 ++-- torch/csrc/DynamicTypes.cpp | 4 ++++ torch/csrc/tensor/python_tensor.cpp | 6 +++--- torch/csrc/utils/cuda_lazy_init.cpp | 12 +++++++++--- torch/csrc/utils/cuda_lazy_init.h | 16 ++++++++++++++-- 8 files changed, 50 insertions(+), 19 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index 837d7715f3d9e1..260bfe3d42e8e7 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8852,6 +8852,15 @@ def test_meshgrid(self): self.assertTrue(grid_b.equal(expected_grid_b)) self.assertTrue(grid_c.equal(expected_grid_c)) + @unittest.skipIf(torch.cuda.is_available(), "CUDA is available, can't test CUDA not built error") + def test_cuda_not_built(self): + msg = "Torch not compiled with CUDA enabled" + self.assertRaisesRegex(AssertionError, msg, lambda: torch.cuda.current_device()) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1], device="cuda")) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1]).cuda()) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.cuda.FloatTensor()) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1]).to(device="cuda")) + # Functions to test negative dimension wrapping METHOD = 1 diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 8712c7c6ca2d43..249ba042bb7056 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -427,9 +427,9 @@ def append_actuals_formals(actual, formal): env['actuals'] = actuals if has_tensor_options: - env['initialize_cuda'] = 'maybe_initialize_cuda(at::getType(options));' + env['initialize_cuda'] = 'maybe_initialize_cuda(options);' else: - env['initialize_cuda'] = 'maybe_initialize_cuda({});'.format(type_args[0]['name']) if type_args else '' + env['initialize_cuda'] = '' if 'call_args' in declaration: env['dispatch_args'] = declaration['call_args'] diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 51ded6eb792f34..b2b6739710c8da 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -65,7 +65,7 @@ inline Tensor dispatch_arange(Scalar end, Tensor result) { } inline Tensor dispatch_arange(Scalar end, const TensorOptions& options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::arange(end, options); } @@ -76,7 +76,7 @@ inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, 
Tensor resu } inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, const TensorOptions& options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::arange(start, end, step, options); } @@ -147,7 +147,7 @@ inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, Tensor resul } inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, const TensorOptions& options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; DeviceGuard device_guard(options.device()); return torch::range(start, end, step, options); @@ -189,7 +189,7 @@ inline Tensor dispatch_randint(int64_t high, IntList size, Generator * generator return at::randint_out(result, high, size, generator); } inline Tensor dispatch_randint(int64_t high, IntList size, Generator * generator, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(high, size, generator, options); } @@ -198,7 +198,7 @@ inline Tensor dispatch_randint(int64_t high, IntList size, Tensor result) { return at::randint_out(result, high, size); } inline Tensor dispatch_randint(int64_t high, IntList size, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(high, size, options); } @@ -207,7 +207,7 @@ inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Generato return at::randint_out(result, low, high, size, generator); } inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Generator * generator, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(low, high, size, generator, options); } @@ -216,7 +216,7 @@ inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Tensor r return at::randint_out(result, low, high, size); } inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(low, high, size, options); } diff --git a/tools/autograd/templates/python_torch_functions_dispatch.h b/tools/autograd/templates/python_torch_functions_dispatch.h index 78dbe4c225f639..fdc33538bcada2 100644 --- a/tools/autograd/templates/python_torch_functions_dispatch.h +++ b/tools/autograd/templates/python_torch_functions_dispatch.h @@ -28,8 +28,8 @@ static at::Type& default_type() { return torch::tensors::get_default_tensor_type(); } -static void maybe_initialize_cuda(const at::Type &type) { - if (type.is_cuda()) { +static void maybe_initialize_cuda(const at::TensorOptions& options) { + if (options.device().is_cuda()) { torch::utils::cuda_lazy_init(); } } diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index bed40cb67a2d0e..e5a95a94e1ae85 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -7,6 +7,7 @@ #include "torch/csrc/Exceptions.h" #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/utils/cuda_enabled.h" +#include "torch/csrc/utils/cuda_lazy_init.h" #include @@ -99,6 +100,9 @@ void registerLayoutObject(THPLayout *layout, at::Backend backend) { at::Type& getVariableType(at::ScalarType scalarType, const THPLayout& layout, const at::Device& device) { const 
at::Backend backend = get_backend(device.type() == at::Device::Type::CUDA, layout.layout == at::Layout::Sparse); + if (device.is_cuda()) { + torch::utils::cuda_lazy_init(); + } auto baseType = at::globalContext().getNonVariableTypeOpt(backend, scalarType); if (!baseType) { std::ostringstream oss; diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index beda85e938be1b..4a40cf243f3a68 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -43,6 +43,9 @@ struct PyTensorType { // Precondition: Access to this struct is protected by the GIL at::Type* aten_type() { if (!aten_type_) { + if (is_cuda) { + torch::utils::cuda_lazy_init(); + } auto* baseType = globalContext().getNonVariableTypeOpt(static_cast(backend), static_cast(scalar_type)); aten_type_ = baseType ? torch::autograd::VariableType::getVariableTypeFromBaseType(*baseType) : nullptr; } @@ -69,9 +72,6 @@ static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs if (!aten_type) { throw unavailable_type(tensor_type); } - if (aten_type->is_cuda()) { - torch::utils::cuda_lazy_init(); - } return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(*aten_type, args, kwargs)); END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/utils/cuda_lazy_init.cpp b/torch/csrc/utils/cuda_lazy_init.cpp index ac35e04e1decbf..6fd73526bba520 100644 --- a/torch/csrc/utils/cuda_lazy_init.cpp +++ b/torch/csrc/utils/cuda_lazy_init.cpp @@ -10,13 +10,19 @@ namespace torch { namespace utils { void cuda_lazy_init() { - static std::once_flag once; - std::call_once(once, []() { + AutoGIL g; + // Protected by the GIL. We don't use call_once because under ASAN it + // has a buggy implementation that deadlocks if an instance throws an + // exception. In any case, call_once isn't necessary, because we + // have taken a lock. + static bool run_yet = false; + if (!run_yet) { auto module = THPObjectPtr(PyImport_ImportModule("torch.cuda")); if (!module) throw python_error(); auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", "")); if (!res) throw python_error(); - }); + run_yet = true; + } } } diff --git a/torch/csrc/utils/cuda_lazy_init.h b/torch/csrc/utils/cuda_lazy_init.h index 8b1d4be125beaa..f8522c1463443a 100644 --- a/torch/csrc/utils/cuda_lazy_init.h +++ b/torch/csrc/utils/cuda_lazy_init.h @@ -1,11 +1,23 @@ #pragma once -// It initially lies in torch/csrc/cuda, but to unconditionlly compile it -// we have to put it here. +// cuda_lazy_init() is always compiled, even for CPU-only builds. +// Thus, it does not live in the cuda/ folder. namespace torch { namespace utils { +// The INVARIANT is that this function MUST be called before you attempt +// to get a CUDA Type object from ATen, in any way. Here are some common +// ways that a Type object may be retrieved: +// +// - You call getNonVariableType or getNonVariableTypeOpt +// - You call toBackend() on a Type +// +// It's important to do this correctly, because if you forget to add it +// you'll get an oblique error message about "Cannot initialize CUDA without +// ATen_cuda library" if you try to use CUDA functionality from a CPU-only +// build, which is not good UX. +// void cuda_lazy_init(); } From 0d9b9100f977f677a0734af9096e9121ed85b337 Mon Sep 17 00:00:00 2001 From: Vishwak Srinivasan Date: Fri, 14 Sep 2018 09:12:45 -0700 Subject: [PATCH 155/237] Fix gesv and gels docs (#11699) Summary: Closes #9935 and closes #5431 . 
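For reference, a small sketch of the shapes the corrected `gels` documentation describes (values are illustrative only, assuming the `torch.gels(B, A)` calling convention of this era):

```python
import torch

A = torch.randn(5, 3)   # (m x n) full-rank matrix, m >= n
B = torch.randn(5, 2)   # (m x k): B shares A's row count m, not n
X, _ = torch.gels(B, A)
print(X.shape)          # torch.Size([5, 2]); the first n = 3 rows hold the solution
```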
Differential Revision: D9830448 Pulled By: soumith fbshipit-source-id: 4e5320a1d0c1d4c8253a5b26f4842cea76530514 --- torch/_torch_docs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 19a1fcbc625003..64d93a5916ab16 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1694,7 +1694,7 @@ def parse_kwargs(desc): Computes the solution to the least squares and least norm problems for a full rank matrix :math:`A` of size :math:`(m \times n)` and a matrix :math:`B` of -size :math:`(n \times k)`. +size :math:`(m \times k)`. If :math:`m \geq n`, :func:`gels` solves the least-squares problem: @@ -1713,7 +1713,7 @@ def parse_kwargs(desc): \end{array} Returned tensor :math:`X` has shape :math:`(\max(m, n) \times k)`. The first :math:`n` -rows of :math:`X` contains the solution. If :math`m \geq n`, the residual sum of squares +rows of :math:`X` contains the solution. If :math:`m \geq n`, the residual sum of squares for the solution in each column is given by the sum of squares of elements in the remaining :math:`m - n` rows of that column. @@ -1834,9 +1834,9 @@ def parse_kwargs(desc): Args: B (Tensor): input matrix of size :math:`(*, m, k)` , where :math:`*` - is zero or more batch dimensions. + is zero or more batch dimensions. A (Tensor): input square matrix of size :math:`(*, m, m)`, where - :math:`*` is zero or more batch dimensions. + :math:`*` is zero or more batch dimensions. out ((Tensor, Tensor), optional): optional output tuple. Example:: From eb039dc92c2801f40415d4e366251d506accf726 Mon Sep 17 00:00:00 2001 From: Alexander Sidorov Date: Fri, 14 Sep 2018 09:33:55 -0700 Subject: [PATCH 156/237] Add CHECKs into GetTensorInfo and ExtractDeviceOption (#11597) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11597 We should always CHECK pointers which we plan to dereference if they are inputs to the function. Nobody knows how the function will be called in the future. Reviewed By: yinghai Differential Revision: D9800002 fbshipit-source-id: 7fd05f4717f2256d1b09a9e75475b12de6685b03 --- caffe2/core/context.h | 1 + caffe2/core/tensor.cc | 1 + caffe2/core/tensor_impl.h | 7 +++++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/caffe2/core/context.h b/caffe2/core/context.h index a4e87e8110dc83..aff66534d22198 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -207,6 +207,7 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) override { + CHECK(device); device->set_device_type(TypeToProto(GetDeviceType())); } diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index e88ed360e8b92b..58b4c4b75e91cb 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -81,6 +81,7 @@ vector GetTensorInfo( const void* c, size_t* capacity, DeviceOption* device) { + CHECK(capacity); const Tensor* tc = static_cast(c); CHECK(tc); CHECK(tc->unsafeGetTensorImpl()); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 4c146e0d257530..2e01c55318d88f 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -103,7 +103,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * context pointer in tensor, which indicates the type of the tensor. 
*/ at::BaseStaticContext* GetStaticContext() const { - return get_static_context(GetDeviceType()); + auto device_type = GetDeviceType(); + return get_static_context(device_type); } /* @brief @@ -732,7 +733,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } void ExtractDeviceOption(DeviceOption* device) const { - GetStaticContext()->ExtractDeviceOption(device, raw_data()); + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, raw_data()); } const at::Storage& storage() { From 115b13ffabec5d7ac3441aa1e2c7a792619bbd39 Mon Sep 17 00:00:00 2001 From: Roy Li Date: Fri, 14 Sep 2018 09:44:40 -0700 Subject: [PATCH 157/237] clean up some old Half stuff Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11687 Differential Revision: D9829027 Pulled By: li-roy fbshipit-source-id: f35dcdf93ea57ba4fa775e36e9d6378bed46a710 --- aten/src/ATen/core/Half.h | 7 ---- aten/src/ATen/cuda/ATenCUDAGeneral.h | 4 +++ aten/src/ATen/cuda/CUDAHalf.cu | 51 ---------------------------- aten/src/ATen/cuda/CUDAHalf.cuh | 31 ----------------- aten/src/ATen/gen.py | 2 +- 5 files changed, 5 insertions(+), 90 deletions(-) delete mode 100644 aten/src/ATen/cuda/CUDAHalf.cu delete mode 100644 aten/src/ATen/cuda/CUDAHalf.cuh diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index c306fcd6b92b72..e2cbd1174fc91c 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -186,13 +186,6 @@ To checked_convert(From f, const char* name) { return convert(f); } -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h index 7b41f1fe3f7235..761551f808610b 100644 --- a/aten/src/ATen/cuda/ATenCUDAGeneral.h +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #ifdef _WIN32 # if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_CUDA_BUILD_MAIN_LIB) # define AT_CUDA_API __declspec(dllexport) diff --git a/aten/src/ATen/cuda/CUDAHalf.cu b/aten/src/ATen/cuda/CUDAHalf.cu deleted file mode 100644 index bd121250ee4847..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cu +++ /dev/null @@ -1,51 +0,0 @@ -#include "ATen/core/Half.h" -#include "ATen/cuda/CUDAHalf.cuh" - -#include -#include -#include - -namespace at { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - -half Converter::operator()(Half aten_half) { - return half{aten_half.x}; -} - -half Converter::operator()(double value) { - return half{Half(value).x}; -} - -Half Converter::operator()(half cuda_half) { - return Half(cuda_half.x, Half::from_bits); -} -#else -half Converter::operator()(Half aten_half) { - __half_raw x_raw; - x_raw.x = aten_half.x; - return half(x_raw); -} - -Half Converter::operator()(half cuda_half) { - __half_raw raw(cuda_half); - return Half(raw.x, Half::from_bits); -} - -half Converter::operator()(double value) { - __half_raw raw; - raw.x = Half(value).x; - return half {raw}; -} - -template <> __half HalfFix(Half h) { - __half_raw raw; - raw.x = h.x; - return __half{raw}; -} - -template <> Half HalfFix(__half h) { - __half_raw raw(h); - return Half(raw.x, Half::from_bits); -} -#endif -} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cuh b/aten/src/ATen/cuda/CUDAHalf.cuh deleted file mode 100644 index 6558ed518ac1fd..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cuh 
+++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "ATen/cuda/ATenCUDAGeneral.h" -#include "ATen/core/Half.h" - -#include -#include -#include - -namespace at { - -template <> -struct AT_CUDA_API Converter { - half operator()(Half); -}; - -template <> -struct AT_CUDA_API Converter { - Half operator()(half); -}; - -template <> -struct AT_CUDA_API Converter { - half operator()(double); -}; - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -template <> __half HalfFix(Half h); -template <> Half HalfFix(__half h); -#endif -} // namespace at diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 76ed7c74a88e07..66444e1837c540 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -259,7 +259,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations '#undef THNN_', '#undef THCIndexTensor_', ] - env['extra_cuda_headers'] = ['#include '] + env['extra_cuda_headers'] = ['#include '] env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') From 278e304c18b0da89cbc58ac43b2d5eabdd3fd6f3 Mon Sep 17 00:00:00 2001 From: James Reed Date: Fri, 14 Sep 2018 10:04:05 -0700 Subject: [PATCH 158/237] Implement elif in string frontend (#11667) Summary: Closes #11625 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11667 Differential Revision: D9828145 Pulled By: jamesr66a fbshipit-source-id: c72dc41cb310a4211b4e4c6b33f7e2c1fb3581a0 --- test/test_jit.py | 18 ++++++++++++++++++ torch/csrc/jit/script/parser.h | 23 +++++++++++++++++------ 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 985a4f760466b7..b5e26b304c57db 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6889,6 +6889,24 @@ def forward(self, x): DynamicSliceExportMod(), (input,), f, example_outputs=example_outs) self.assertExpected(exported) + def test_string_frontend_elif(self): + code = ''' + def elif_test(niter : int): + rv = 0 + for i in range(niter): + if i % 3 == 0 and i % 5 == 0: + rv += 35 + elif i % 3 == 0: + rv += 3 + elif i % 5 == 0: + rv += 5 + else: + rv += i + return rv + ''' + + self.checkScript(code, (101,), name='elif_test', outputs=3028) + class MnistNet(nn.Module): def __init__(self): diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index 53f4dc368714f4..14e5e4f5ae1354 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -263,8 +263,10 @@ struct Parser { StringLiteral parseStringLiteral() { auto range = L.cur().range; std::stringstream ss; - while(L.cur().kind == TK_STRINGLITERAL) - ss << parseString(L.cur().range, L.next().text()); + while(L.cur().kind == TK_STRINGLITERAL) { + auto literal_range = L.cur().range; + ss << parseString(literal_range, L.next().text()); + } return StringLiteral::create(range, ss.str()); } @@ -339,7 +341,8 @@ struct Parser { auto param_types = parseList('(', ',', ')', &Parser::parseBareTypeAnnotation); TreeRef return_type; if (L.nextIf(TK_ARROW)) { - return_type = Maybe::create(L.cur().range, parseExp()); + auto return_type_range = L.cur().range; + return_type = Maybe::create(return_type_range, parseExp()); } else { return_type = Maybe::create(L.cur().range); } @@ -397,9 +400,10 @@ struct Parser { } return list; } - TreeRef parseIf() { + TreeRef parseIf(bool expect_if=true) { auto r = L.cur().range; - L.expect(TK_IF); + if (expect_if) + L.expect(TK_IF); auto cond = parseExp(); L.expect(':'); auto true_branch = parseStatements(); @@ -407,6 
+411,12 @@ struct Parser { if (L.nextIf(TK_ELSE)) { L.expect(':'); false_branch = parseStatements(); + } else if (L.nextIf(TK_ELIF)) { + // NB: this needs to be a separate statement, since the call to parseIf + // mutates the lexer state, and thus causes a heap-use-after-free in + // compilers which evaluate argument expressions LTR + auto range = L.cur().range; + false_branch = makeList(range, {parseIf(false)}); } return If::create(r, Expr(cond), List(true_branch), List(false_branch)); } @@ -448,7 +458,8 @@ struct Parser { TreeRef return_type; if (L.nextIf(TK_ARROW)) { // Exactly one expression for return type annotation - return_type = Maybe::create(L.cur().range, parseExp()); + auto return_type_range = L.cur().range; + return_type = Maybe::create(return_type_range, parseExp()); } else { // Default to returning single tensor. TODO: better sentinel value? return_type = Maybe::create(L.cur().range); From 3258fc11a73a5b6a90c2ad416c4fbaa0d667bc24 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Fri, 14 Sep 2018 10:44:28 -0700 Subject: [PATCH 159/237] Delete torch/csrc/api/README.md (#11703) Summary: We'll have separate docs for the C++ frontend, right now this file is just misleading Pull Request resolved: https://github.com/pytorch/pytorch/pull/11703 Differential Revision: D9832847 Pulled By: goldsborough fbshipit-source-id: 2e8b30ccf6b5cba9d0526e6261160f7c6211a35c --- torch/csrc/api/README.md | 50 ---------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 torch/csrc/api/README.md diff --git a/torch/csrc/api/README.md b/torch/csrc/api/README.md deleted file mode 100644 index 3f11fdae25226f..00000000000000 --- a/torch/csrc/api/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# AUTOGRADPP - -This is an experimental C++ frontend to pytorch's C++ backend. Use at your own -risk. - -How to build: -``` -git submodule update --init --recursive - -cd pytorch -# On Linux: -python setup.py build -# On macOS (may need to prefix with `MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++` when using anaconda) -LDSHARED="cc -dynamiclib -undefined dynamic_lookup" python setup.py build - -cd ..; mkdir -p build; cd build -cmake .. -DPYTHON_EXECUTABLE:FILEPATH=$(which python) # helpful if you use anaconda -make -j -``` - -# Stuff - -- Check out the [MNIST example](https://github.com/ebetica/autogradpp/blob/eee977ddd377c484af5fce09ae8676410bb6fcce/tests/integration_t.cpp#L320-L355), -which tries to replicate PyTorch's MNIST model + training loop -- The principled way to write a model is probably something like -``` -TORCH_AUTOGRAD_CONTAINER_CLASS(MyModel) { - // This does a 2D convolution, followed by global sum pooling, followed by a linear. - public: - void initialize_containers() override { - myConv_ = add(Conv2d(1, 50, 3, 3).stride(2).make(), "conv"); - myLinear_ = add(Linear(50, 1).make(), "linear"); - } - std::vector forward(std::vector x) override { - auto v = myConv_->forward(x); - v = v.mean(-1).mean(-1); - return myLinear_.forward({v}); - } - private: - Container myLinear_; - Container myConv_; -} -``` - -Some things are not implemented: -- SGD, Adagrad, RMSprop, and Adam are the only optimizers implemented -- Bidirectional, batch first, and PackedSequence are not implemented for LSTMs -- Sparse Tensors might work but are very untested - -Otherwise, lots of other things work. There may be breaking API changes. 
From 7535d98ec436cab718d12d83bd412223cbfc55c8 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Fri, 14 Sep 2018 10:48:50 -0700 Subject: [PATCH 160/237] Add message tag parameter to send/recv Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11490 Reviewed By: teng-li Differential Revision: D9828116 Pulled By: pietern fbshipit-source-id: 98be1ae84b6763ffb329e63c030c5e3ec0e748b7 --- test/test_c10d.py | 4 +-- test/test_distributed.py | 20 ++++++++++++ torch/csrc/distributed/c10d/init.cpp | 6 ++-- torch/distributed/distributed_c10d.py | 34 +++++++++++++-------- torch/lib/c10d/ProcessGroup.hpp | 9 ++++-- torch/lib/c10d/ProcessGroupGloo.cpp | 25 +++++++++++---- torch/lib/c10d/ProcessGroupGloo.hpp | 9 ++++-- torch/lib/c10d/ProcessGroupMPI.cpp | 15 +++++---- torch/lib/c10d/ProcessGroupMPI.hpp | 9 ++++-- torch/lib/c10d/ProcessGroupNCCL.cpp | 5 ++- torch/lib/c10d/ProcessGroupNCCL.hpp | 9 ++++-- torch/lib/c10d/test/ProcessGroupMPITest.cpp | 6 ++-- 12 files changed, 106 insertions(+), 45 deletions(-) diff --git a/test/test_c10d.py b/test/test_c10d.py index ff9d87be76c84f..0df1e3c749c6e6 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -379,14 +379,14 @@ def test_send_recv_all_to_all(self): for i in range(self.world_size): if i == self.rank: continue - send_work.append(pg.send([inputs[i]], i)) + send_work.append(pg.send([inputs[i]], i, 0)) # Issue recvs recv_work = [] for i in range(self.world_size): if i == self.rank: continue - recv_work.append(pg.recv([outputs[i]], i)) + recv_work.append(pg.recv([outputs[i]], i, 0)) # Wait for sends to complete for work in send_work: diff --git a/test/test_distributed.py b/test/test_distributed.py index 6dd0d984e842d3..f3250644bd621a 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -322,6 +322,26 @@ def test_send_recv_any_source(self): self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) self._barrier() + # SEND RECV WITH TAG + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") + def test_send_recv_with_tag(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + tensor = _build_tensor(10, value=rank) + + for dst in range(0, world_size): + if dst == rank: + # Recv mode + for src in range(0, world_size): + if src == rank: + continue + output_tensor = _build_tensor(10, value=-1) + dist.recv(output_tensor, src, tag=src) + self.assertTrue(output_tensor.eq(src).all()) + else: + # Send mode + dist.send(tensor, dst, tag=rank) + # ISEND @unittest.skipIf(BACKEND == "nccl", "Nccl does not support isend") def test_isend(self): diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index d4f9e0f07b1d23..72bcac27cf8754 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -258,7 +258,8 @@ PyObject* c10d_init(PyObject* _unused) { "recv_anysource", [](::c10d::ProcessGroup& pg, std::vector& input, - at::Tensor& srcRankTensor) { + at::Tensor& srcRankTensor, + int tag) { if (srcRankTensor.type().scalarType() != at::kInt) { throw std::runtime_error( "source rank tensor needs to be " @@ -270,10 +271,11 @@ PyObject* c10d_init(PyObject* _unused) { "contain only one element"); } return pg.recvAnysource( - input, static_cast(srcRankTensor.data_ptr())); + input, static_cast(srcRankTensor.data_ptr()), tag); }, py::arg("tensors"), py::arg("src_rank"), + py::arg("tag"), py::call_guard()) .def( diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index d1a697e58288bb..651f2ea6f2f870 
100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -373,7 +373,8 @@ def get_world_size(group=group.WORLD): def isend(tensor, dst, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Sends a tensor asynchronously. @@ -381,6 +382,7 @@ def isend(tensor, tensor (Tensor): Tensor to send. dst (int): Destination rank. group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match send with remote recv Returns: A distributed request object. @@ -392,15 +394,16 @@ def isend(tensor, if group == GroupMember.WORLD: _check_default_pg() - return _default_pg.send([tensor], dst) + return _default_pg.send([tensor], dst, tag) else: group_dst_rank = _get_group_rank(group, dst) - return group.send([tensor], group_dst_rank) + return group.send([tensor], group_dst_rank, tag) def irecv(tensor, src, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Receives a tensor asynchronously. @@ -408,6 +411,7 @@ def irecv(tensor, tensor (Tensor): Tensor to fill with received data. src (int): Source rank. group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match recv with remote send Returns: A distributed request object. @@ -419,15 +423,16 @@ def irecv(tensor, if group == GroupMember.WORLD: _check_default_pg() - return _default_pg.recv([tensor], src) + return _default_pg.recv([tensor], src, tag) else: group_src_rank = _get_group_rank(group, src) - return group.recv([tensor], group_src_rank) + return group.recv([tensor], group_src_rank, tag) def send(tensor, dst, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Sends a tensor synchronously. @@ -435,6 +440,7 @@ def send(tensor, tensor (Tensor): Tensor to send. dst (int): Destination rank. group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match send with remote recv """ if _rank_not_in_group(group): @@ -442,15 +448,16 @@ def send(tensor, if group == GroupMember.WORLD: _check_default_pg() - _default_pg.send([tensor], dst).wait() + _default_pg.send([tensor], dst, tag).wait() else: group_dst_rank = _get_group_rank(group, dst) - group.send([tensor], group_dst_rank).wait() + group.send([tensor], group_dst_rank, tag).wait() def recv(tensor, src=None, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Receives a tensor synchronously. @@ -459,6 +466,7 @@ def recv(tensor, src (int, optional): Source rank. Will receive from any process if unspecified. 
group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match recv with remote send Returns: Sender rank @@ -476,7 +484,7 @@ def recv(tensor, if src is None: rank_tensor = torch.IntTensor([-1]) - pg.recv_anysource([tensor], rank_tensor).wait() + pg.recv_anysource([tensor], rank_tensor, tag).wait() src_rank = rank_tensor[0].item() if group == GroupMember.WORLD: return src_rank @@ -484,10 +492,10 @@ def recv(tensor, return _get_global_rank(pg, src_rank) else: if group == GroupMember.WORLD: - pg.recv([tensor], src).wait() + pg.recv([tensor], src, tag).wait() else: group_src_rank = _get_group_rank(pg, src) - pg.recv([tensor], group_src_rank).wait() + pg.recv([tensor], group_src_rank, tag).wait() return src diff --git a/torch/lib/c10d/ProcessGroup.hpp b/torch/lib/c10d/ProcessGroup.hpp index 2ac2948457a7f6..3f80f72cf36382 100644 --- a/torch/lib/c10d/ProcessGroup.hpp +++ b/torch/lib/c10d/ProcessGroup.hpp @@ -115,15 +115,18 @@ class ProcessGroup { virtual std::shared_ptr send( std::vector& tensors, - int dstRank) = 0; + int dstRank, + int tag) = 0; virtual std::shared_ptr recv( std::vector& tensors, - int srcRank) = 0; + int srcRank, + int tag) = 0; virtual std::shared_ptr recvAnysource( std::vector& tensors, - int* srcRank) = 0; + int* srcRank, + int tag) = 0; virtual std::shared_ptr barrier() = 0; diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index e110a78001c134..4417364641efd2 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -707,17 +707,26 @@ at::Tensor& checkSingleTensor(std::vector& tensors) { return tensor; } +uint32_t checkTag(int32_t tag) { + if (tag < 0) { + throw std::runtime_error("Tag must be >= 0"); + } + return (uint32_t) tag; +} + std::shared_ptr ProcessGroupGloo::send( std::vector& tensors, - int dstRank) { + int dstRank, + int tag) { auto& tensor = checkSingleTensor(tensors); + auto utag = checkTag(tag); auto ptr = tensor.data_ptr(); auto size = tensor.numel() * tensor.type().elementSizeInBytes(); // Construct unbound buffer. auto& context = contexts_[0]; auto buf = context->createUnboundBuffer(ptr, size); - buf->send(dstRank, 0); + buf->send(dstRank, utag); // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the send. @@ -726,15 +735,17 @@ std::shared_ptr ProcessGroupGloo::send( std::shared_ptr ProcessGroupGloo::recv( std::vector& tensors, - int srcRank) { + int srcRank, + int tag) { auto& tensor = checkSingleTensor(tensors); + auto utag = checkTag(tag); auto ptr = tensor.data_ptr(); auto size = tensor.numel() * tensor.type().elementSizeInBytes(); // Construct unbound buffer. auto& context = contexts_[0]; auto buf = context->createUnboundBuffer(ptr, size); - buf->recv(srcRank, 0); + buf->recv(srcRank, utag); // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. 
@@ -743,8 +754,10 @@ std::shared_ptr ProcessGroupGloo::recv( std::shared_ptr ProcessGroupGloo::recvAnysource( std::vector& tensors, - int* srcRank) { + int* srcRank, + int tag) { auto& tensor = checkSingleTensor(tensors); + auto utag = checkTag(tag); auto ptr = tensor.data_ptr(); auto size = tensor.numel() * tensor.type().elementSizeInBytes(); @@ -761,7 +774,7 @@ std::shared_ptr ProcessGroupGloo::recvAnysource( srcRanks.push_back(i); } - buf->recv(srcRanks, 0); + buf->recv(srcRanks, utag); // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index e78b1e36f8b94e..773ad600d4402f 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -327,15 +327,18 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr send( std::vector& tensors, - int dstRank) override; + int dstRank, + int tag) override; std::shared_ptr recv( std::vector& tensors, - int srcRank) override; + int srcRank, + int tag) override; std::shared_ptr recvAnysource( std::vector& tensors, - int* srcRank) override; + int* srcRank, + int tag) override; std::shared_ptr barrier() override; diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 39e5e5b609c1e8..03699962c15b59 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -649,7 +649,8 @@ std::shared_ptr ProcessGroupMPI::scatter( std::shared_ptr ProcessGroupMPI::send( std::vector& tensors, - int dstRank) { + int dstRank, + int tag) { if (pgComm_ == MPI_COMM_NULL) { return nullptr; } @@ -666,7 +667,7 @@ std::shared_ptr ProcessGroupMPI::send( tensor.numel(), mpiDatatype.at(tensor.type().scalarType()), dstRank, - 0, + tag, pgComm_, &request)); } @@ -676,7 +677,8 @@ std::shared_ptr ProcessGroupMPI::send( std::shared_ptr ProcessGroupMPI::recv( std::vector& tensors, - int srcRank) { + int srcRank, + int tag) { if (pgComm_ == MPI_COMM_NULL) { return nullptr; } @@ -693,7 +695,7 @@ std::shared_ptr ProcessGroupMPI::recv( tensor.numel(), mpiDatatype.at(tensor.type().scalarType()), srcRank, - 0, + tag, pgComm_, &request)); } @@ -703,7 +705,8 @@ std::shared_ptr ProcessGroupMPI::recv( std::shared_ptr ProcessGroupMPI::recvAnysource( std::vector& tensors, - int* srcRank) { + int* srcRank, + int tag) { if (pgComm_ == MPI_COMM_NULL) { return nullptr; } @@ -720,7 +723,7 @@ std::shared_ptr ProcessGroupMPI::recvAnysource( tensor.numel(), mpiDatatype.at(tensor.type().scalarType()), MPI_ANY_SOURCE, - 0, + tag, pgComm_, &request)); } diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index 4a72570759ce91..5bd2b303c1a4e1 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -165,15 +165,18 @@ class ProcessGroupMPI : public ProcessGroup { std::shared_ptr send( std::vector& tensors, - int dstRank); + int dstRank, + int tag); std::shared_ptr recv( std::vector& tensors, - int srcRank); + int srcRank, + int tag); std::shared_ptr recvAnysource( std::vector& tensor, - int* srcRank); + int* srcRank, + int tag); std::shared_ptr barrier(); diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 157a9917f6803e..db3fcf5e211789 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -599,19 +599,22 @@ std::shared_ptr ProcessGroupNCCL::scatter( std::shared_ptr ProcessGroupNCCL::send( 
std::vector& /* unused */, + int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support send"); } std::shared_ptr ProcessGroupNCCL::recv( std::vector& /* unused */, + int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support recv"); } std::shared_ptr ProcessGroupNCCL::recvAnysource( std::vector& /* unused */, - int* /* unused */) { + int* /* unused */, + int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support recv"); } diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index d3862fc044321d..3eca7c4d95c535 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -130,15 +130,18 @@ class ProcessGroupNCCL : public ProcessGroup { std::shared_ptr send( std::vector& tensors, - int dstRank) override; + int dstRank, + int tag) override; std::shared_ptr recv( std::vector& tensors, - int srcRank) override; + int srcRank, + int tag) override; std::shared_ptr recvAnysource( std::vector& tensors, - int* srcRank) override; + int* srcRank, + int tag) override; std::shared_ptr barrier() override; diff --git a/torch/lib/c10d/test/ProcessGroupMPITest.cpp b/torch/lib/c10d/test/ProcessGroupMPITest.cpp index 3a470fd1b88904..398bf31b9e06ae 100644 --- a/torch/lib/c10d/test/ProcessGroupMPITest.cpp +++ b/torch/lib/c10d/test/ProcessGroupMPITest.cpp @@ -318,7 +318,7 @@ void testSendRecv(bool recvAnysource, int iter = 10000) { std::vector> works; for (auto& tensors : allTensors) { // Kick off work - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->send(tensors, 1); + std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->send(tensors, 1, 0); works.push_back(std::move(work)); } for (auto& work : works) { @@ -337,11 +337,11 @@ void testSendRecv(bool recvAnysource, int iter = 10000) { for (auto& tensors : allTensors) { // Kick off work if (!recvAnysource) { - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->recv(tensors, 0); + std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->recv(tensors, 0, 0); works.push_back(std::move(work)); } else { std::shared_ptr<::c10d::ProcessGroup::Work> work = - pg->recvAnysource(tensors, &srcRanks[i]); + pg->recvAnysource(tensors, &srcRanks[i], 0); works.push_back(std::move(work)); } ++i; From b90872c00ee159bc7e0fc936de8d58dcf1dc803b Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 14 Sep 2018 10:52:38 -0700 Subject: [PATCH 161/237] Get rid of default arguments for TH/THC factory functions. (#11673) Summary: This is causing codegen problems in caffe2, when we try to remove the circular Tensor/Type declarations. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11673 Differential Revision: D9819341 Pulled By: gchanan fbshipit-source-id: f2c2cd96e8a16f6de6aa4889e71b8a78e12e9256 --- aten/src/ATen/Declarations.cwrap | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 5a6687aaa53762..d45815c5b600c9 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -302,8 +302,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step ]] [[ name: _arange @@ -320,8 +319,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step - cname: arange arguments: - arg: THTensor* result @@ -1956,8 +1954,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - long steps ]] [[ name: _logspace @@ -1976,8 +1973,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - long steps ]] [[ name: histc From 4050770909166230baf33aa327ef6d4b6395b332 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Fri, 14 Sep 2018 12:59:52 -0500 Subject: [PATCH 162/237] Skip tests that depend on double datatype for MIOpen and in absence of the fallback fail. --- test/test_jit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_jit.py b/test/test_jit.py index 985a4f760466b7..76d4ee79638d0f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6981,6 +6981,7 @@ def test_dcgan_models(self): self._test_dcgan_models(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_dcgan_models_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_dcgan_models(self, device='cuda', check_export_import=False) @@ -7103,11 +7104,13 @@ def test_mnist(self): self._test_mnist(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_mnist(self, device='cuda', check_export_import=False) @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_training_leaks_no_memory_cuda(self): net = MnistNet().cuda() # MnistNet uses dropout, don't check its trace From 0c2648830fbb48f1d600b18e1e4d1a6be071336d Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 14 Sep 2018 11:53:48 -0700 Subject: [PATCH 163/237] Augment emit_nvtx to help connect backward-pass Function apply calls with their corresponding forward pass ops (#10881) Summary: Often, we find ourselves looking at some long-running kernel or emit_nvtx range on an nvvp profile and trying to connect it to the offending line in a training script. If the op is in the forward pass that's easy: ops are enqueued explicitly from the Python side, so tracking it down with manual nvtx ranges supplemented by the built-in emit_nvtx ranges is straightforward. If the op is in the backward pass, it's much more difficult. From the Python side, all you can do is wrap loss.backward() in an nvtx range, and if you also use emit_nvtx, the automatic ranges provide only local information. Right now, the only consistent way to connect backward-pass kernels to their associated forward-pass lines of Python is to understand your script line by line, and know exactly where in the backward pass you are. 
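To make that workflow concrete, a minimal sketch of the pattern described above (run under nvprof/nvvp; `model`, `x`, `target`, and `loss_fn` are placeholders for whatever the training script defines):

```python
import torch
from torch.cuda import nvtx

with torch.autograd.profiler.emit_nvtx():
    out = model(x)                # forward-pass ops get automatic nvtx ranges
    loss = loss_fn(out, target)
    nvtx.range_push("backward")   # manual range bracketing the backward pass
    loss.backward()
    nvtx.range_pop()
```

The sequence-number tags this PR adds (described next) then make it possible to match each backward Function's `apply()` range in that profile to the forward op that created it.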
This PR augments the existing nvtx machinery to bridge the gap between forward and backward, allowing connection of backward-pass Function apply calls to the forward-pass operations that required/created those Functions. The method is simple and surgical. During the forward pass, when running with emit_nvtx, the nvtx range for each function in VariableType is tagged with the current sequence number. During the backward pass, the nvtx range associated with each Function's operator() is tagged with that Function's stashed sequence number, which can be compared to "current sequence numbers" from the forward pass to locate the associated op. Double-backward is not a problem. If a backward pass with create_graph = True is underway, the relationship between backward and double-backward is conceptually the same as the relationship between forward and backward: The functions in VariableType still spit out current-sequence-number-tagged ranges, the Function objects they create still stash those sequence numbers, and in the eventual double-backward execution, their operator() ranges are still tagged with the stashed numbers, which can be compared to "current sequence numbers" from the backward pass. Minor caveats: - The sequence number is thread-local, and many VariableType functions (specifically, those without a derivative explicitly defined in derivatives.yaml) don't create an associated function object (instead delegating that to sub-functions further down the call chain, perhaps called from within at::native functions that route back through VariableType by calling at::function_name). So the correspondence of stashed sequence numbers in Function operator() ranges with numbers in forward-pass ranges is not guaranteed to be 1 to 1. However, it's still a vast improvement over the current situation, and I don't think this issue should be a blocker. - Feel free to litigate my use of stringstream in profiler.cpp. I did it because it was easy and clean. If that's too big a hammer, let's figure out something more lightweight. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10881 Differential Revision: D9833371 Pulled By: apaszke fbshipit-source-id: 1844f2e697117880ef5e31394e36e801d1de6088 --- tools/autograd/gen_variable_type.py | 2 +- torch/autograd/profiler.py | 45 +++++++++++++++++++++++++++++ torch/csrc/autograd/function.cpp | 4 +++ torch/csrc/autograd/function.h | 2 ++ torch/csrc/autograd/profiler.cpp | 21 ++++++++++++-- torch/csrc/autograd/profiler.h | 4 ++- 6 files changed, 73 insertions(+), 5 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 86dd5fcb98bda0..d09a07a7b550c4 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -124,7 +124,7 @@ """) RECORD_FUNCTION = CodeTemplate("""\ -profiler::RecordFunction profiler("${name}");""") +profiler::RecordFunction profiler("${name}", Function::peek_at_next_sequence_nr());""") PRE_RECORD_TRACE = CodeTemplate("""\ torch::jit::Node* node = nullptr; diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index c1be47ad494397..97a0fa8e97d85f 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -249,6 +249,51 @@ class emit_nvtx(object): ... model(x) # Warmup CUDA memory allocator and profiler ... with torch.autograd.profiler.emit_nvtx(): ... 
model(x)
+
+    **Forward-backward correlation**
+
+    When viewing a profile created using :class:`emit_nvtx` in the Nvidia Visual Profiler,
+    correlating each backward-pass op with the corresponding forward-pass op can be difficult.
+    To ease this task, :class:`emit_nvtx` appends sequence number information to the ranges it
+    generates.
+
+    During the forward pass, each function range is decorated with ``seq=<N>``. ``seq`` is a running
+    counter, incremented each time a new backward Function object is created and stashed for backward.
+    Thus, the `seq=<N>` annotation associated with each forward function range tells you that
+    if a backward Function object is created by this forward function,
+    the backward object will receive sequence number N.
+    During the backward pass, the top-level range wrapping each C++ backward Function's
+    ``apply()`` call is decorated with ``stashed seq=<M>``. ``M`` is the sequence number that
+    the backward object was created with. By comparing ``stashed seq`` numbers in backward with ``seq``
+    numbers in forward, you can track down which forward op created each backward Function.
+
+    Any functions executed during the backward pass are also decorated with ``seq=<N>``. During
+    default backward (with ``create_graph=False``) this information is irrelevant, and in fact,
+    ``N`` may simply be 0 for all such functions. Only the top-level ranges associated with
+    backward Function objects' ``apply()`` methods are useful, as a way to correlate these Function
+    objects with the earlier forward pass.
+
+    **Double-backward**
+
+    If, on the other hand, a backward pass with ``create_graph=True`` is underway (in other words,
+    if you are setting up for a double-backward), each function's execution during backward
+    is given a nonzero, useful ``seq=<N>``. Those functions may themselves create Function objects
+    to be executed later during double-backward, just as the original functions in the forward pass did.
+    The relationship between backward and double-backward is conceptually the same as the relationship
+    between forward and backward: The functions still emit current-sequence-number-tagged ranges,
+    the Function objects they create still stash those sequence numbers, and during the eventual
+    double-backward, the Function objects' ``apply()`` ranges are still tagged with ``stashed seq``
+    numbers, which can be compared to `seq` numbers from the backward pass.
+
+    .. warning:
+        The sequence number is thread-local, and some forward functions don't create an associated
+        backward Function object (instead delegating that to sub-functions further down the call chain).
+        For these reasons, the correspondence of stashed sequence numbers in
+        backward Function ``apply()`` ranges with `seq` numbers in forward-pass ranges is
+        not guaranteed to be 1 to 1. The sequence numbers alone may not be enough to fully
+        disambiguate which forward function created which
+        backward Function object. You may need to make a judgment based on analytic knowledge of what
+        the expected correspondence should be.
     """
     def __init__(self, enabled=True):
         self.enabled = enabled
diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp
index ba33a6f116bac2..57fe29810c4c8f 100644
--- a/torch/csrc/autograd/function.cpp
+++ b/torch/csrc/autograd/function.cpp
@@ -21,6 +21,10 @@ namespace torch { namespace autograd {
 /// numbers.
 thread_local uint64_t Function_next_sequence_nr_ = 0;
 
+uint64_t Function::peek_at_next_sequence_nr() {
+  return Function_next_sequence_nr_;
+}
+
 uint64_t& Function::get_next_sequence_nr() {
   return Function_next_sequence_nr_;
 }
diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h
index dfe8683197f8ee..7b01b4e7e112f9 100644
--- a/torch/csrc/autograd/function.h
+++ b/torch/csrc/autograd/function.h
@@ -315,6 +315,8 @@ struct TORCH_API Function : std::enable_shared_from_this<Function> {
     return nullptr;
   }
 
+  static uint64_t peek_at_next_sequence_nr();
+
  protected:
   static uint64_t& get_next_sequence_nr();
 
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 66ff6d1a09c7c0..77c884e9b71b64 100644
--- a/torch/csrc/autograd/profiler.cpp
+++ b/torch/csrc/autograd/profiler.cpp
@@ -1,6 +1,8 @@
 #include "torch/csrc/autograd/profiler.h"
 #include "torch/csrc/autograd/function.h"
 
+#include <sstream>
+
 namespace torch { namespace autograd { namespace profiler {
 
 ProfilerState state = ProfilerState::Disabled;
@@ -37,13 +39,19 @@ void mark(std::string name, bool include_cuda /* = true */) {
   }
 }
 
-void pushRange(std::string name) {
+void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*= -1*/) {
   if (state == ProfilerState::Disabled) {
     return;
   }
   if (state == ProfilerState::NVTX) {
 #ifdef USE_CUDA
-    nvtxRangePushA(name.c_str());
+    if(sequence_nr >= 0) {
+      std::stringstream s;
+      s << name << msg << sequence_nr;
+      nvtxRangePushA(s.str().c_str());
+    }
+    else
+      nvtxRangePushA(name.c_str());
 #else
     throw std::logic_error(
         "pushRange called with NVTX tracing, but compiled without CUDA");
@@ -95,6 +103,13 @@ RecordFunction::RecordFunction(const char* name) {
   pushRange(name);
 }
 
+RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
+{
+  if (state == ProfilerState::Disabled)
+    return;
+  pushRange(name, ", seq=", current_sequence_nr);
+}
+
 RecordFunction::~RecordFunction() {
   if (state == ProfilerState::Disabled)
     return;
@@ -102,7 +117,7 @@ RecordFunction::~RecordFunction() {
 }
 
 void RecordFunction::pushFunctionRange(Function* fn) {
-  pushRange(fn->name());
+  pushRange(fn->name(), ", stashed seq=", fn->sequence_nr());
 }
 
 #ifdef USE_CUDA
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index ba0fee1510baa2..29dc1044fb2f10 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -166,7 +166,7 @@ enum class ProfilerState {
 TORCH_API RangeEventList& getEventList();
 
 TORCH_API void mark(std::string name, bool include_cuda = true);
-TORCH_API void pushRange(std::string name);
+TORCH_API void pushRange(std::string name, const char* msg = "", int64_t sequence_nr = -1);
 TORCH_API void popRange();
 
 struct TORCH_API RecordFunction {
@@ -176,6 +176,8 @@ struct TORCH_API RecordFunction {
 
   explicit RecordFunction(const char* name);
 
+  explicit RecordFunction(const char* name, int64_t current_sequence_nr);
+
   ~RecordFunction();
 
   // Needed only because we don't have Function defined yet.
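An illustrative usage sketch for the workflow described in the patch above, assuming a CUDA build; the model, input, and range names are hypothetical placeholders, and the pattern of wrapping warmup plus emit_nvtx inside torch.cuda.profiler.profile() follows the docstring example:

    import torch
    from torch.autograd import profiler
    from torch.cuda import nvtx

    model = torch.nn.Linear(128, 64).cuda()   # hypothetical model
    x = torch.randn(32, 128, device='cuda')   # hypothetical input

    with torch.cuda.profiler.profile():
        model(x)  # warm up the CUDA memory allocator and profiler
        with profiler.emit_nvtx():
            nvtx.range_push('forward')
            loss = model(x).sum()
            nvtx.range_pop()

            nvtx.range_push('backward')
            loss.backward()
            nvtx.range_pop()

In the resulting nvvp timeline, ops recorded inside the manual 'forward' range carry seq=<N> annotations, and the corresponding backward Function apply() ranges inside the 'backward' range carry stashed seq=<N>, which is the forward/backward pairing this patch enables.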
From 224e62bbec2b00719f38a2a2fb2191e40120bc4a Mon Sep 17 00:00:00 2001 From: Anders Papitto Date: Fri, 14 Sep 2018 12:16:29 -0700 Subject: [PATCH 164/237] respect USE_CUDA_STATIC_LINK in build_libtorch.py Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11713 Differential Revision: D9835972 Pulled By: anderspapitto fbshipit-source-id: 046363b132e5487c05ef7e6e6d88b508196386a1 --- tools/build_libtorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index df29875666063e..db698a24128ea9 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -23,6 +23,8 @@ command = [build_pytorch_libs, '--use-nnpack'] if USE_CUDA: command.append('--use-cuda') + if os.environ.get('USE_CUDA_STATIC_LINK', False): + command.append('--cuda-static-link') if options.use_cereal: command.append('--use-cereal') command.append('caffe2') From 70e68e755a8fdd94218e9b8e0abfb6b805354a08 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Fri, 14 Sep 2018 13:29:06 -0700 Subject: [PATCH 165/237] Casting for binary ops (#11708) Summary: Fixes #11663 `TensorIterator` was replacing the op tensors with type casted tensors which ended up producing side effects in binary ops like `a.float() * b` where `a` and `b` are `LongTensor`s. colesbury ezyang apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11708 Differential Revision: D9834016 Pulled By: driazati fbshipit-source-id: 4082eb9710b31dfc741161a0fbdb9a8eba8fe39d --- aten/src/ATen/native/TensorIterator.cpp | 3 ++- aten/src/ATen/native/TensorIterator.h | 1 + test/test_torch.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index bae2a94b86273b..15d86fb5162a3e 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -98,7 +98,8 @@ void TensorIterator::compute_common_type() { if (op.tensor->defined() && type != op.tensor->type()) { if (op.tensor->dim() == 0) { if (type.backend() != at::Backend::CUDA) { - *op.tensor = op.tensor->toType(type); + cast_tensors_.emplace_back(op.tensor->toType(type)); + op.tensor = &(cast_tensors_.back()); } } else { op.needs_cast = true; diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 245866373d4763..3faedbec6bb320 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -184,6 +184,7 @@ struct AT_API TensorIterator { DimVector shape_; DimVector perm_; SmallVector operands_; + SmallVector cast_tensors_; int num_outputs_ = 0; bool has_coalesced_dimensions_ = false; }; diff --git a/test/test_torch.py b/test/test_torch.py index 260bfe3d42e8e7..b90e0631d4f470 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8861,6 +8861,20 @@ def test_cuda_not_built(self): self.assertRaisesRegex(AssertionError, msg, lambda: torch.cuda.FloatTensor()) self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1]).to(device="cuda")) + def test_cast_binary_op(self): + # Scalar + a = torch.tensor(2) + b = torch.tensor(3) + a_copy = a.clone() + b_copy = b.clone() + + self.assertEqual(torch.tensor(6), a.float() * b) + + self.assertEqual(a.type(), a_copy.type()) + self.assertEqual(a.data.type(), a_copy.data.type()) + self.assertEqual(b.type(), b_copy.type()) + self.assertEqual(b.data.type(), b_copy.type()) + # Functions to test negative dimension wrapping METHOD = 1 From 
96d3f968ebcbbcff5a7d3adf1ceb357e78ca672a Mon Sep 17 00:00:00 2001 From: Mike Ruberry <38511765+mruberry@users.noreply.github.com> Date: Fri, 14 Sep 2018 13:53:09 -0700 Subject: [PATCH 166/237] Splits CPU and CUDA fusion compilers (#10981) Summary: This PR splits the CPU and CUDA fusion compilers, putting them into a new jit/fusers/ directory with jit/fusers/common for common components. In particular: - A fusion interface is created that allows "fusion handles" to be requested - The CPU and CUDA fusers implement this interface, with dispatch determined by device - The fusion compilers, fusion function specializations and resource strings are split - CPU-specific classes like TempFile and DynamicLibrary are in the CPU fuser - Common classes likes TensorDesc and the base fusion function class are in jit/fusers/common - There is still some specialization in jit/fusers/common, but these specializations are small(-ish) - Updates the build system to remove the dummy interface on Windows and minimize the use of macros This structure should allow in-flight PRs to easily rebase while providing a clear interface to the fusers. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10981 Reviewed By: soumith Differential Revision: D9701999 Pulled By: apaszke fbshipit-source-id: 3b6bec7b97e0444b2a93caa38d9b897f2e68c1b3 --- .gitignore | 1 + test/test_jit.py | 15 +- tools/amd_build/build_pytorch_amd.py | 2 +- torch/CMakeLists.txt | 30 +- torch/__init__.py | 2 +- torch/csrc/jit/fusers/Config.h.in | 4 + .../csrc/jit/fusers/common/annotated_graph.h | 26 + torch/csrc/jit/fusers/common/fused_kernel.cpp | 556 ++++++ torch/csrc/jit/fusers/common/fused_kernel.h | 88 + .../csrc/jit/fusers/common/fusion_arg_spec.h | 46 + .../jit/fusers/common/fusion_handle_impl.cpp | 400 ++++ .../jit/fusers/common/fusion_handle_impl.h | 72 + torch/csrc/jit/fusers/common/partition_desc.h | 48 + torch/csrc/jit/fusers/common/tensor_desc.cpp | 20 + torch/csrc/jit/fusers/common/tensor_desc.h | 83 + torch/csrc/jit/fusers/common/tensor_info.h | 25 + torch/csrc/jit/fusers/cpu/dynamic_library.h | 42 + torch/csrc/jit/fusers/cpu/fused_kernel.cpp | 95 + torch/csrc/jit/fusers/cpu/fused_kernel.h | 45 + torch/csrc/jit/fusers/cpu/fusion_compiler.cpp | 85 + torch/csrc/jit/fusers/cpu/fusion_compiler.h | 55 + torch/csrc/jit/fusers/cpu/interface.h | 32 + torch/csrc/jit/fusers/cpu/resource_strings.h | 56 + torch/csrc/jit/fusers/cpu/temp_file.h | 70 + torch/csrc/jit/fusers/cuda/fused_kernel.cpp | 105 ++ torch/csrc/jit/fusers/cuda/fused_kernel.h | 59 + .../csrc/jit/fusers/cuda/fusion_compiler.cpp | 71 + torch/csrc/jit/fusers/cuda/fusion_compiler.h | 44 + torch/csrc/jit/fusers/cuda/interface.h | 32 + torch/csrc/jit/fusers/cuda/resource_strings.h | 197 ++ torch/csrc/jit/fusers/interface.cpp | 81 + torch/csrc/jit/fusers/interface.h | 37 + torch/csrc/jit/fusion_compiler.cpp | 1615 ----------------- torch/csrc/jit/fusion_compiler.h | 188 -- torch/csrc/jit/init.cpp | 4 +- torch/csrc/jit/interpreter.cpp | 1 - torch/csrc/jit/passes/graph_fuser.cpp | 4 +- torch/csrc/jit/python_interpreter.cpp | 1 - torch/csrc/jit/register_prim_ops.cpp | 8 +- torch/csrc/jit/test_jit.cpp | 10 +- torch/jit/__init__.py | 1 + 41 files changed, 2534 insertions(+), 1822 deletions(-) create mode 100644 torch/csrc/jit/fusers/Config.h.in create mode 100644 torch/csrc/jit/fusers/common/annotated_graph.h create mode 100644 torch/csrc/jit/fusers/common/fused_kernel.cpp create mode 100644 torch/csrc/jit/fusers/common/fused_kernel.h create mode 100644 
torch/csrc/jit/fusers/common/fusion_arg_spec.h create mode 100644 torch/csrc/jit/fusers/common/fusion_handle_impl.cpp create mode 100644 torch/csrc/jit/fusers/common/fusion_handle_impl.h create mode 100644 torch/csrc/jit/fusers/common/partition_desc.h create mode 100644 torch/csrc/jit/fusers/common/tensor_desc.cpp create mode 100644 torch/csrc/jit/fusers/common/tensor_desc.h create mode 100644 torch/csrc/jit/fusers/common/tensor_info.h create mode 100644 torch/csrc/jit/fusers/cpu/dynamic_library.h create mode 100644 torch/csrc/jit/fusers/cpu/fused_kernel.cpp create mode 100644 torch/csrc/jit/fusers/cpu/fused_kernel.h create mode 100644 torch/csrc/jit/fusers/cpu/fusion_compiler.cpp create mode 100644 torch/csrc/jit/fusers/cpu/fusion_compiler.h create mode 100644 torch/csrc/jit/fusers/cpu/interface.h create mode 100644 torch/csrc/jit/fusers/cpu/resource_strings.h create mode 100644 torch/csrc/jit/fusers/cpu/temp_file.h create mode 100644 torch/csrc/jit/fusers/cuda/fused_kernel.cpp create mode 100644 torch/csrc/jit/fusers/cuda/fused_kernel.h create mode 100644 torch/csrc/jit/fusers/cuda/fusion_compiler.cpp create mode 100644 torch/csrc/jit/fusers/cuda/fusion_compiler.h create mode 100644 torch/csrc/jit/fusers/cuda/interface.h create mode 100644 torch/csrc/jit/fusers/cuda/resource_strings.h create mode 100644 torch/csrc/jit/fusers/interface.cpp create mode 100644 torch/csrc/jit/fusers/interface.h delete mode 100644 torch/csrc/jit/fusion_compiler.cpp delete mode 100644 torch/csrc/jit/fusion_compiler.h diff --git a/.gitignore b/.gitignore index e3e7acb8e310cb..b600e7515b9134 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* +torch/csrc/jit/fusers/Config.h torch/csrc/nn/THCUNN.cpp torch/csrc/nn/THCUNN.cwrap torch/csrc/nn/THNN_generic.cpp diff --git a/test/test_jit.py b/test/test_jit.py index b5e26b304c57db..b715045b30a27c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -205,6 +205,17 @@ def _construct_empty_tensor_list(): return [] +def enable_cpu_fuser(fn): + def wrapper(*args, **kwargs): + torch._C._jit_override_can_fuse_on_cpu(True) + try: + fn(*args, **kwargs) + except Exception: + torch._C._jit_override_can_fuse_on_cpu(False) + raise + return wrapper + + class JitTestCase(TestCase): _do_cuda_memory_leak_check = True _restored_warnings = False @@ -630,6 +641,7 @@ def test_lstm_fusion_cuda(self): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skip("Test is flaky, see https://github.com/pytorch/pytorch/issues/8746") + @enable_cpu_fuser def test_lstm_fusion_cpu(self): inputs = get_lstm_inputs('cpu') try: @@ -1370,6 +1382,7 @@ def test_ge_unoptimized(self): self.run_ge_tests(False, False) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @enable_cpu_fuser def test_ge_optimized(self): self.run_ge_tests(True, False) @@ -3053,6 +3066,7 @@ def chunk_4_last(x): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @skipIfRocm + @enable_cpu_fuser def test_chunk_fusion_correctness(self): return self._test_chunk_fusion_correctness(self, 'cpu') @@ -6928,7 +6942,6 @@ def forward(self, x): class TestEndToEndHybridFrontendModels(JitTestCase): - @staticmethod def _test_dcgan_models(self, device, check_export_import=True): class DCGANGenerator(nn.Module): diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index e9daea644e9cb2..d433ad50dbfae7 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ 
b/tools/amd_build/build_pytorch_amd.py @@ -41,7 +41,7 @@ # Make various replacements inside AMD_BUILD/torch directory ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp", - "csrc/cuda/cuda_check.h", "csrc/jit/fusion_compiler.cpp"] + "csrc/cuda/cuda_check.h"] for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")): for filename in files: if filename.endswith(".cpp") or filename.endswith(".h"): diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 04a72819717bad..7679f3d6467ba0 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -146,7 +146,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp ${TORCH_SRC_DIR}/csrc/jit/export.cpp - ${TORCH_SRC_DIR}/csrc/jit/fusion_compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp ${TORCH_SRC_DIR}/csrc/jit/import.cpp @@ -176,6 +175,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/interface.cpp ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp @@ -190,6 +190,34 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp ) +SET(USE_CPU_FUSER 0) +if (NOT WIN32) + SET(USE_CPU_FUSER 1) + + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fusers/common/tensor_desc.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/common/fusion_handle_impl.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/common/fused_kernel.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/cpu/fusion_compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/cpu/fused_kernel.cpp + ) +endif() + +SET(USE_CUDA_FUSER 0) +if (USE_CUDA AND NOT USE_ROCM AND NOT WIN32) + SET(USE_CUDA_FUSER 1) + + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fusers/cuda/fusion_compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/cuda/fused_kernel.cpp + ) + +endif() + +CONFIGURE_FILE( + ${TORCH_SRC_DIR}/csrc/jit/fusers/Config.h.in + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/jit/fusers/Config.h) + if (NOT NO_API AND NOT USE_ROCM) list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/api/src/utils.cpp diff --git a/torch/__init__.py b/torch/__init__.py index e494cdec6cbec1..c6298353cd8ab8 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -281,6 +281,7 @@ def manager_path(): import torch.cuda import torch.autograd +from torch.autograd import no_grad, enable_grad, set_grad_enabled import torch.nn import torch.optim import torch.multiprocessing @@ -293,7 +294,6 @@ def manager_path(): import torch.testing import torch.backends.cuda import torch.backends.mkl -from torch.autograd import no_grad, enable_grad, set_grad_enabled _C._init_names(list(torch._storage_classes)) diff --git a/torch/csrc/jit/fusers/Config.h.in b/torch/csrc/jit/fusers/Config.h.in new file mode 100644 index 00000000000000..0809591ca702e9 --- /dev/null +++ b/torch/csrc/jit/fusers/Config.h.in @@ -0,0 +1,4 @@ +#pragma once + +#define USE_CPU_FUSER @USE_CPU_FUSER@ +#define USE_CUDA_FUSER @USE_CUDA_FUSER@ diff --git a/torch/csrc/jit/fusers/common/annotated_graph.h b/torch/csrc/jit/fusers/common/annotated_graph.h new file mode 100644 index 00000000000000..bf8a6e862c4dac --- /dev/null +++ b/torch/csrc/jit/fusers/common/annotated_graph.h @@ -0,0 +1,26 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" 
+#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +struct AnnotatedGraph { + // short-term storage only, so it borrows Graph. + AnnotatedGraph(Graph& graph, int device) + : graph(&graph), device(device) {} + + Graph* graph = nullptr; // TODO: this should really be const + int device = kCPUDevice; + std::vector input_desc; + std::vector output_desc; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp new file mode 100644 index 00000000000000..2021b9fa1b832d --- /dev/null +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -0,0 +1,556 @@ +#include "torch/csrc/jit/fusers/common/fused_kernel.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/cpu/resource_strings.h" +#include "torch/csrc/jit/fusers/cuda/resource_strings.h" +#include "torch/csrc/jit/fusers/common/partition_desc.h" +#include "torch/csrc/jit/fusers/common/tensor_desc.h" +#include "torch/csrc/jit/fusers/common/tensor_info.h" + +#include "torch/csrc/jit/code_template.h" +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/assertions.h" + +#include "ATen/ATen.h" + +#if USE_CUDA_FUSER + #include "THC/THCTensorRandom.h" + #include "THC/THCGenerator.hpp" + THCGenerator* THCRandom_getGenerator(THCState* state); +#endif // USE_CUDA_FUSER + +#include +#include +#include +#include +#include + +namespace torch { namespace jit { + +// curDimIndex = linearId % sizes[i]; // % sizes[i] is not needed for d == 0, because we already guard for numel outside the index calculation +// offset += curDimIndex*strides[i]; // *strides[i] is optional if list_is_cont becaause strides.back() == 1 +// linearId /= sizes[i]; +auto dim_calc = CodeTemplate(R"( +//printf("tensor ${tensor} sizes[${d}] = %d, strides[${d}] = %d\n", ${tensor}.sizes[${d}],${tensor}.strides[${d}]); +size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes}; +${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride}; +)"); + +// XXX: this code assumes that inputs are 32-bit addressable +static uint32_t computeNumel(at::ArrayRef sizes) { + uint32_t result = 1; + if (sizes.size() == 0) { + return 1; // scalar tensor + } + for (int64_t size : sizes) { + result *= size; + } + return result; +} + +// XXX: Assumes that after at::chunk, all inputs are the same size +static std::vector computeMapSize( + const at::Tensor& tensor, + const PartitionDesc& chunkDesc) { + std::vector sizes(tensor.sizes().begin(), tensor.sizes().end()); + // Should have been checked in graph fuser + JIT_ASSERT(sizes[chunkDesc.dim] % chunkDesc.nSubtensors == 0); + sizes[chunkDesc.dim] /= chunkDesc.nSubtensors; + return sizes; +} + +// Tries to compress sizes and strides according to cont. Emits the result t +// c_sizes, c_strides and throws an error on failure (if can't compress) +static void compressContiguous( + at::IntList sizes +, at::IntList strides +, const std::vector & cont +, uint32_t* c_sizes +, uint32_t* c_strides) { + size_t compressed_dims = 0; + size_t cur = 0; + size_t ndim = sizes.size(); + while (cur < ndim) { + size_t total_size = sizes[cur]; + cur++; + while (cont[cur-1] && cur < ndim) { + JIT_ASSERT(strides[cur-1] == sizes[cur]*strides[cur]); + total_size *= sizes[cur]; + cur++; + } + // cur starts pointing at the beginning of run to compress + // cur ends one _after_ the terminating false or end of list. 
+ // total_size is the size of all dimensions [begin,end) + // examples: + // f = not cont. + // t = cont. + // x = don't care, including past end of list + // s = start of cur + // e = end of cur + + + // f x x x + // s e + + // t f x x + // s e + + // t t f x + // s e + + c_sizes[compressed_dims] = total_size; + c_strides[compressed_dims] = strides[cur-1]; + compressed_dims++; + } + if (ndim > 0) { + JIT_ASSERT(!cont.back() || strides.back() == 1); + } +} + +void FusedKernel::launch_with_tensors( + at::ArrayRef inputs +, at::ArrayRef outputs) { + at::DeviceGuard device_guard(inputs); + JIT_ASSERT(inputs.size() == input_desc.size()); + JIT_ASSERT(outputs.size() == output_desc.size()); + size_t flat_inputs_size = 0; + size_t flat_outputs_size = 0; + for (auto& c : chunk_desc) + flat_inputs_size += c.nSubtensors; + for (auto& c : concat_desc) + flat_outputs_size += c.nSubtensors; + // XXX: this code assumes that inputs are 32-bit addressable + // XXX: this code assumes that all inputs are of the same size + JIT_ASSERT(inputs[0].numel() <= std::numeric_limits::max()); + + // Compute map_size, numel from the first input + at::IntList map_size; + uint32_t numel; + std::vector keep_alive_size; + if (chunk_desc[0].isNoop()) { + map_size = inputs[0].sizes(); + numel = inputs[0].numel(); + } else { + keep_alive_size = computeMapSize(inputs[0], chunk_desc[0]); + map_size = keep_alive_size; + numel = computeNumel(map_size); + } + + // Compute the storage needed to store TensorInfo structs for inputs and outputs. + size_t uncompressedDim = input_desc.at(0).contiguity.size(); + size_t maxPossibleTensorInfoSize = sizeof(TensorInfo) + 2 * sizeof(uint32_t) * uncompressedDim; + size_t maxPossibleBufferSize = maxPossibleTensorInfoSize * (flat_inputs_size + flat_outputs_size); + std::vector buffer(maxPossibleBufferSize); + char* buffer_next = buffer.data(); + // A vector of arguments to the kernel. It's (numel, *input_descs, *output_descs) + std::vector arguments; + arguments.reserve(3 + flat_inputs_size + flat_outputs_size); + auto addTensorInfoRaw = [&](TensorDesc & desc, void* data_ptr, at::IntList sizes, at::IntList strides) { + size_t nDim = desc.nDim(); // NOTE: this is the compressed dim + JIT_ASSERT(nDim <= uncompressedDim); // We'd overflow the space otherwise + auto ti = reinterpret_cast(buffer_next); + ti->data = data_ptr; + compressContiguous(sizes, strides, desc.contiguity, ti->sizes(nDim), ti->strides(nDim)); + buffer_next += maxPossibleTensorInfoSize; + arguments.push_back(ti); + }; + // Asserts that t's dims can be compressed in the same way as in desc + // (that's what the kernel assumes), and appends it to the arguments vector. 
+ auto addTensorInfo = [&](TensorDesc & desc, const at::Tensor & t) { + addTensorInfoRaw(desc, t.data_ptr(), t.sizes(), t.strides()); + }; + arguments.push_back(&numel); + for (size_t i = 0; i < input_desc.size(); ++i) { + auto & chunk = chunk_desc[i]; + const at::Tensor& tensor = inputs[i]; + if (chunk.isNoop()) { + addTensorInfo(input_desc[i], tensor); + } else { + size_t chunk_offset = map_size[chunk.dim] * tensor.stride(chunk.dim) * elementSize(tensor.type().scalarType()); + char * data_ptr = reinterpret_cast(tensor.data_ptr()); + for (size_t chunks = 0; chunks < chunk.nSubtensors; ++chunks) { + addTensorInfoRaw(*chunk.subtensorDesc, data_ptr, map_size, tensor.strides()); + data_ptr += chunk_offset; + } + } + } + for (size_t i = 0; i < output_desc.size(); ++i) { + auto & c = concat_desc[i]; + at::Tensor o = outputs[i]; + if (c.isNoop()) { + o.resize_(map_size); + addTensorInfo(output_desc[i], outputs[i]); + } else { + size_t small_size = map_size[c.dim]; + std::vector concat_size(map_size.begin(), map_size.end()); + concat_size[c.dim] = small_size * c.nSubtensors; + o.resize_(concat_size); + size_t offset = 0; + for(size_t j = 0; j < c.nSubtensors; ++j) { + // because the concatenated_output stays live, the underlying data + // in this view remains live through the end of this function + // so there is not need to hold onto this tensor + auto view = o.narrow(c.dim, offset, small_size); + addTensorInfo(*c.subtensorDesc, view); + offset += small_size; + } + } + } + + // If the kernel call contains a random op, we need to pass in random seeds as + // well. + #if USE_CUDA_FUSER + if (has_random && this->backend() == at::Backend::CUDA) { + auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); + uint64_t offset = + gen_->state.philox_seed_offset.fetch_add(this->get_rand_offset(numel)); + arguments.push_back(&gen_->state.initial_seed); + arguments.push_back(&offset); + } + #endif // USE_CUDA_FUSER + + launch_raw(numel, arguments.data()); +} + +void FusedKernel::launch( + at::ArrayRef inputs +, std::vector & outputs) { + at::DeviceGuard guard(inputs.back()); + JIT_ASSERT(inputs.size() > 0); + auto & ref_type = inputs[0].type(); + outputs.clear(); + outputs.reserve(outputDescriptors().size()); + for(auto & od : outputDescriptors()) { + outputs.push_back(ref_type.toScalarType(od.scalar_type).tensor()); + } + + launch_with_tensors(inputs, outputs); +} + +static std::string valueName(Value * n) { + return "n" + std::to_string(n->unique()); +} + +static std::string scalarValue(int64_t v) { + return std::to_string(v); +} + +static std::string scalarValue(double v) { + std::ostringstream out; + out << std::scientific << v << "f"; + return out.str(); +} + +static const char * scalarTypeName(at::ScalarType type) { + if (type == at::ScalarType::Half) { + return "half"; + } + + switch(type) { + #define DEFINE_CASE(ctype,name,_) \ + case at::ScalarType::name: return #ctype; + AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(DEFINE_CASE) + #undef DEFINE_CASE + default: + throw std::runtime_error("unknown scalar type"); + } +} + +static std::string encodeRHS(Node* n) { + static std::unordered_map simple_map_ops = { + // unary + {aten::abs, "absf(${0})"}, + {aten::sigmoid, "1.f / (1.f + expf(-${0}))"}, + {aten::relu, "${0} < 0 ? 
0.f : ${0} "}, + {aten::log, "logf(${0})"}, + {aten::log10, "log10f(${0})"}, + {aten::log1p, "log1pf(${0})"}, + {aten::log2, "log2f(${0})"}, + {aten::lgamma, "lgammaf(${0})"}, + {aten::exp, "expf(${0})"}, + {aten::expm1, "expm1f(${0})"}, + {aten::cos, "cosf(${0})"}, + {aten::acos, "acosf(${0})"}, + {aten::cosh, "coshf(${0})"}, + {aten::sin, "sinf(${0})"}, + {aten::asin, "asinf(${0})"}, + {aten::sinh, "sinhf(${0})"}, + {aten::tan, "tanf(${0})"}, + {aten::atan, "atanf(${0})"}, + {aten::tanh, "tanhf(${0})"}, + {aten::sqrt, "sqrtf(${0})"}, + {aten::rsqrt, "rsqrtf(${0})"}, + {aten::ceil, "ceilf(${0})"}, + {aten::floor, "floorf(${0})"}, + {aten::round, "roundf(${0})"}, + {aten::trunc, "truncf(${0})"}, + {aten::frac, "fracf(${0})"}, + {aten::reciprocal, "reciprocalf(${0})"}, + {aten::neg, "-${0}"}, + //simple binary + {aten::atan2, "atan2(${0}, ${1})"}, + {aten::min, "fminf(${0}, ${1})"}, + {aten::max, "fmaxf(${0}, ${1})"}, + + //binary with other + // TODO: some of these ops will not get generated because + // we only work on float inputs/outputs, but they are here to record + // that they are valid mappable ops once we handle more type + {aten::__and__, "${0} && ${1}"}, + {aten::__lshift__, "${0} << ${1}"}, + {aten::__or__, "${0} || ${1}"}, + {aten::__rshift__, "${0} >> ${1}"}, + {aten::__xor__, "${0} ^ ${1}"}, + {aten::div, "${0} / ${1}"}, + {aten::eq, "${0} == ${1}"}, + {aten::fmod, "fmodf(${0}, ${1})"}, + {aten::ge, "(${0} >= ${1})"}, + {aten::gt, "${0} > ${1}"}, + {aten::le, "(${0} <= ${1})"}, + {aten::lt, "${0} < ${1}"}, + {aten::type_as, "(${0})"}, //everything is implicitly convertible to float + {aten::mul, "${0} * ${1}"}, + {aten::ne, "${0} != ${1}"}, + {aten::remainder, "remainderf(${0}, ${1})"}, + {aten::pow, "powf(${0}, ${1})"}, + + //alpha + {aten::add, "${0} + ${2}*${1}"}, + {aten::sub, "(${0} - ${2}*${1})"}, + {aten::rand_like, "uniform(rnd())"}, + + // simple derivatives + {aten::_sigmoid_backward, "${0} * ${1} * (1.f - ${1})"}, + {aten::_tanh_backward, "${0} * (1.f - ${1} * ${1})"}, + }; + + if (n->kind() == prim::Constant) { + auto val = toIValue(n->output()).value(); + if (val.isDouble()) { + return scalarValue(val.toDouble()); + } else { + JIT_ASSERT(val.isInt()); + return scalarValue(val.toInt()); + } + } + + TemplateEnv env; + size_t i = 0; + for(auto in : n->inputs()) { + env.s(std::to_string(i++), valueName(in)); + } + + const auto & str = simple_map_ops.at(n->kind()); + return format(str, env); +} + +static Node* usedInFusedChunk(Value* input) { + auto uses = input->uses(); + if (uses.size() == 1) { + Node *user = uses[0].user; + if (user->kind() == prim::ConstantChunk) { + return user; + } + } + return nullptr; +} + +static void emitIndexingFor( + std::ostream& out +, const std::string& tensor +, int ndim +, bool last_is_cont) { + TemplateEnv env; + env.s("tensor",tensor); + out << format("IndexType ${tensor}_offset = 0;\n",env); + out << format("IndexType ${tensor}_linearIndex = linearIndex;\n",env); + for (int d = ndim - 1; d >= 0; --d) { + env.d("d",d); + env.s("mod_sizes", d > 0 ? format("% ${tensor}.sizes[${d}]",env) : ""); + env.s("times_stride",(d < ndim - 1 || !last_is_cont) ? 
+ format("* ${tensor}.strides[${d}]",env) : ""); + out << dim_calc.format(env); + if (d > 0) { + out << format("${tensor}_linearIndex /= ${tensor}.sizes[${d}];\n",env); + } + } +} + +// Returns: (input chunk metadata, output concat metadata, is_random) +std::tuple< + std::vector + , std::vector + , bool> + emitCompilationUnit( + std::ostream& out + , const std::string& name + , AnnotatedGraph& agraph + , bool use_cuda) { + bool has_random = false; + Graph& subgraph = *agraph.graph; + TemplateEnv env; + env.s("kernelName", name); + // TODO: handle cases where we need to generate > 2^32 element tensors + env.s("IndexType","unsigned int"); //avoiding slow header includes to get uint32_t + + std::stringstream body; + std::stringstream tensorOffsets; + std::vector formals; + std::vector argument_loads; + auto emitFormal = [&](Value * n, const TensorDesc & desc) { + std::string tensor = "t" + std::to_string(formals.size()); //can't be unique() because Param may be an output + size_t nDim = desc.nDim(); + emitIndexingFor(tensorOffsets, tensor, nDim, desc.lastIsContiguous()); + env.s("tensor",tensor); + env.d("formal_index", formals.size() + 1); // + 1 because the first argument is the linearIndex + env.d("nDim",nDim); + env.s("scalar_type",scalarTypeName(desc.scalar_type)); + formals.push_back(format("TensorInfo<${scalar_type},${nDim}> ${tensor}", env)); + argument_loads.push_back(format("*static_cast*>(args[${formal_index}])", env)); + }; + + std::vector chunk_desc; + std::vector> flat_inputs; + { + size_t input_index = 0; + for(auto p : subgraph.inputs()) { + if (Node * chunk = usedInFusedChunk(p)) { + int64_t dim = chunk->i(attr::dim); + int64_t chunks = chunk->i(attr::chunks); + chunk_desc.emplace_back(agraph.input_desc[input_index++], chunks, dim); + for (auto * o : chunk->outputs()) { + flat_inputs.emplace_back(o, *chunk_desc.back().subtensorDesc); + } + } else { + chunk_desc.emplace_back(); + flat_inputs.emplace_back(p, agraph.input_desc[input_index++]); + } + } + for (auto & input : flat_inputs) { + emitFormal(input.first, input.second); + } + } + + std::vector concat_desc; + std::vector> flat_output_nodes; + { + size_t i = 0; + for(auto o : subgraph.outputs()) { + auto & desc = agraph.output_desc[i++]; + if(o->node()->kind() != prim::FusedConcat) { + emitFormal(o, desc); + concat_desc.emplace_back(); + flat_output_nodes.emplace_back(o, desc); + } else { + auto cat = o->node(); + concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); + for(auto c : cat->inputs()) { + emitFormal(c, *concat_desc.back().subtensorDesc); + flat_output_nodes.emplace_back(c, desc); + } + } + } + } + + #if USE_CUDA_FUSER + bool has_half_tensor = false; + #endif // USE_CUDA_FUSER + size_t formal_count = 0; + for(auto input : flat_inputs) { + auto p = input.first; + env.s("node", valueName(p)); + env.d("formal", formal_count++); + + // Acquires and converts (if needed) inputs + bool is_half = input.second.scalar_type == at::ScalarType::Half; + if (is_half) { + AT_ASSERT(use_cuda); + #if USE_CUDA_FUSER + env.s( + "access" + , format("__half2float(t${formal}.data[t${formal}_offset])", env)); + has_half_tensor = true; + #endif // USE_CUDA_FUSER + } else { + env.s("access", format("t${formal}.data[t${formal}_offset]", env)); + } + + //TODO: actual type propagation rather than relying on auto.. 
+ body << format("auto ${node} = ${access};\n", env); + } + + for (auto n : subgraph.nodes()) { + // FusedConcat nodes work by narrowing the output Tensors before the kernel runs + if (n->kind() == prim::FusedConcat) + continue; + if (n->kind() == prim::ConstantChunk) + continue; + if (n->kind() == aten::rand_like) { + has_random = true; + if (!use_cuda) + throw std::runtime_error("Fusion doesn't support rand on CPU"); + } + env.s("node",valueName(n->output())); + env.s("rhs", encodeRHS(n)); + body << format("auto ${node} = ${rhs};\n",env); + } + + for (auto output : flat_output_nodes) { + auto o = output.first; + env.d("formal",formal_count++); + env.s("access",format("t${formal}.data[t${formal}_offset]",env)); + env.s("node",valueName(o)); + + // Acquires and converts (if needed) outputs + bool is_half = output.second.scalar_type == at::ScalarType::Half; + if (is_half) { + AT_ASSERT(use_cuda); + #if USE_CUDA_FUSER + body << format("${access} = __float2half(${node});\n",env); + has_half_tensor = true; + #endif // USE_CUDA_FUSER + } else { + body << format("${access} = ${node};\n",env); + } + } + + // Includes half support if any half tensors are involved + #if USE_CUDA_FUSER + if (has_half_tensor) { + env.s("HalfHeader", cudafuser::half_support_literal); + } else { + env.s("HalfHeader", ""); + } + + if (has_random) { + env.s("RandHeader", cudafuser::rand_support_literal); + env.s("RandParam", cudafuser::rand_param); + env.s("RandInit", cudafuser::rand_init); + } else { + env.s("RandHeader", ""); + env.s("RandParam", ""); + env.s("RandInit", ""); + } + #endif // USE_CUDA_FUSER + + env.s("tensorOffsets", tensorOffsets.str()); + env.s("kernelBody", body.str()); + env.v("formals", formals); + env.v("argument_loads", argument_loads); + if (use_cuda) { + #if USE_CUDA_FUSER + env.s("type_declarations", cudafuser::type_declarations_template.format(env)); + out << cudafuser::cuda_compilation_unit_template.format(env); + #else + throw std::runtime_error("CUDA Fusion requested but not supported."); + #endif // USE_CUDA_FUSER + } else { + env.s("type_declarations", cpufuser::type_declarations_template.format(env)); + out << cpufuser::cpu_compilation_unit_template.format(env); + } + + return std::make_tuple(std::move(chunk_desc), std::move(concat_desc), has_random); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/common/fused_kernel.h b/torch/csrc/jit/fusers/common/fused_kernel.h new file mode 100644 index 00000000000000..fbdf5d60d0a8cd --- /dev/null +++ b/torch/csrc/jit/fusers/common/fused_kernel.h @@ -0,0 +1,88 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/annotated_graph.h" +#include "torch/csrc/jit/fusers/common/tensor_desc.h" +#include "torch/csrc/jit/fusers/common/partition_desc.h" + +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + +#include +#include +#include + +namespace torch { namespace jit { + +std::tuple, std::vector, bool> emitCompilationUnit( + std::ostream& out +, const std::string& name +, AnnotatedGraph& agraph +, bool use_cuda); + +struct FusedKernel { + TH_DISALLOW_COPY_AND_ASSIGN(FusedKernel); + + FusedKernel( + const std::string& name + , AnnotatedGraph& agraph) + : name{name} + , input_desc{agraph.input_desc} + , output_desc{agraph.output_desc} { } + + virtual ~FusedKernel() = default; + + // expects outputs to be pre-allocated + void launch_with_tensors( + at::ArrayRef inputs + , at::ArrayRef outputs); + + // creates new tensors 
for outputs + void launch( + at::ArrayRef inputs + , std::vector& outputs); + + const std::vector& outputDescriptors() const { + return output_desc; + } + +protected: + + virtual at::Backend backend() const = 0; + + // arguments is a list of pointers to the arguments for the compiled CUDA/CPU + // code. + // The format of arguments is suitable for directly passing to a call to + // cuLaunchKernel as the kernel arguments. + // Currently the first argument is a pointer to numel (for passing to + // CUDA code), and the remainder are pointers to the TensorInfo structs + // that compiled code uses to load Tensor data. + // launch_with_tensors handles packing at::Tensors into this arguments array. + // CPU code uses the same convension so that launch_with_tensors can be shared. + virtual void launch_raw(uint32_t numel, void** arguments) = 0; + + virtual uint64_t get_rand_offset(uint32_t numel) = 0; + bool has_random; + std::string name; + // We keep these around for debugging + std::string compilation_unit; + std::vector input_desc; + std::vector output_desc; + + // same size as output_desc, describes whether + // an output is actually a concatenation of + // many subtensors that the fusion group produces + std::vector concat_desc; + + // same size as input_desc, describes whether an + // input should be broken into subtensors (chunks) + // to be consumed by the fusion group + std::vector chunk_desc; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/fusion_arg_spec.h b/torch/csrc/jit/fusers/common/fusion_arg_spec.h new file mode 100644 index 00000000000000..b8780f8ada9e77 --- /dev/null +++ b/torch/csrc/jit/fusers/common/fusion_arg_spec.h @@ -0,0 +1,46 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/utils/functional.h" // fmap +#include "torch/csrc/utils/hash.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace torch { namespace jit { + +struct FusionArgSpec { + FusionArgSpec(at::TensorList inputs) + : descs_(fmap(inputs)) + , hash_code_(torch::get_hash(inputs.size(), descs_)) {} + + bool operator==(const FusionArgSpec& spec) const { + return hash_code_ == spec.hash_code_ && descs_ == spec.descs_; + } + + bool operator!=(const FusionArgSpec& spec) const { + return !(*this == spec); + } + + static size_t hash(const FusionArgSpec& spec) { + return spec.hash_code_; + } + + const std::vector& descs() const { + return descs_; + } + +private: + std::vector descs_; + size_t hash_code_; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/fusion_handle_impl.cpp b/torch/csrc/jit/fusers/common/fusion_handle_impl.cpp new file mode 100644 index 00000000000000..052a08cac93ede --- /dev/null +++ b/torch/csrc/jit/fusers/common/fusion_handle_impl.cpp @@ -0,0 +1,400 @@ +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_arg_spec.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" +#include "torch/csrc/jit/fusers/common/tensor_desc.h" +#include "torch/csrc/jit/fusers/cpu/fused_kernel.h" +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" +#include "torch/csrc/jit/fusers/cuda/fused_kernel.h" + +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" +#include 
"torch/csrc/jit/custom_operator.h" + +#include "torch/csrc/utils/functional.h" //fmap + +#include "ATen/ATen.h" +#include "ATen/ExpandUtils.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { + +//////////////////////////////////////////////////////////////////////////////// +// FusedKernelCache + +// Note [Run-time shape checking code] +// There are multiple assumptions that our codegen makes, which we can't check +// in the fusion pass, because we don't have the shape information. Most notably, +// that all values (post-input-chunk, and pre-output-concat) have the same shape +// (hereinafter referred to as map size). One way to check this would be to run +// shape propagation for every size configuration we get as an input, but that +// requires a full graph traversal, and might incur unnecessary overhead. The code +// below uses a few nice properties of broadcasting rules and their interactions with +// pointwise operations, and takes a smarter approach, to quickly verify validity of +// the kernel. +// +// Notation: +// - a.s when a is a tensor is a shorthand for a.shape. +// - B is a shorthand for the broadcasting/expanding function. It is used as a +// vararg function. +// - E is a shorthand for expand function. +// - Every pointwise operation can be equivalently rewritten as +// f(a, b) = f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), +// where f^ is a non-broadcasting verison of f. +// - A set of inputs that are used to produce a certain graph output is referred to +// as the output's broadcasting group (see Lemma 2. for explanation why). +// +// Lemma 1. Set of lists of integers (shapes) + { _|_ (bottom/error marker) }, with the +// operation of broadcasting (returning bottom upon shape mismatch) forms a monoid. +// In simpler terms: broadcasting is associative, i.e. B(a, B(b, c)) == B(B(a, b), c). +// +// Proof. Satisfies all monoid laws: +// - Closed under broadcasting (trivial) +// - Empty shape is the identity element: B(a, []) == B([], a) == a +// - Associativity: A simple visual proof is that you can expand 3 tensors +// at the same time by stacking their sizes (with alignment to the right), +// just as you'd do in the case of 2 tensors, but with an intermediate +// (the algorithm ends up being pretty much the same). +// +// Lemma 2. Shape of an output of an arbitrary DAG of pointwise ops depends only on the set +// of inputs used in this DAG and is equal to B([i.shape for i in used_inputs]). +// +// Proof. Let G be any DAG of pointwise ops and < be any valid topological +// ordering on nodes of G. Proof by induction over <. +// Base case (graph input): +// Trivial (input is also an output). +// Step (n = f(q, r)): +// Let QS (RS) be the set of shapes of inputs that q (r) depends on. +// Note that the set of inputs that n depends on is exactly QS + RS. +// shape(n) == shape(f(q, r)) +// (def of f) +// == shape(f^(E(q, B(q.s, r.s)), E(r, B(q.s, r.s)))) +// (output shape of f^ is equal to either of argument shapes) +// == shape(E(q, B(q.s, r.s))) +// (property of expand) +// == B(q.s, r.s) +// (induction assumption) +// == B(B(QS...), B(RS...)) +// (Lemma 1.) +// == B(QS..., RS...) +// (repeated shapes don't matter for broadcasting) +// == B((QS + RS)...) +// +// Lemma 3. Expands are distributive over pointwise ops, i.e. E(f(a, b), s) = f(E(a, s), E(b, s)) +// Lemma 4. Expands can be collapsed, i.e. E(E(x, s1), s2) = E(x, B(s1, s2)). +// Proof. A simple exercise for the reader :) +// +// Theorem. 
If all (pre-concat-)outputs have equal shapes, then we can push the expands to +// (post-chunk-)inputs, and have all intermediates of the same shape +// (no broadcasting happening in the body). +// +// Proof. Using the above lemmas we can easily show that a graph with a single output +// can be easily rewritten by taking the shape given by B applied to all input +// shapes, expanding inputs to it, and using only non-broadcasting operations. +// Example: +// +// let d = f(a, b) in +// let e = h(b, c) in +// g(d, e) +// +// (By def. of broadcasting pointwise ops applied to g, f and h) +// (Lemma 2. for a closed formula for the size of g = gs) +// +// let gs = B(a.s, b.s, c.s) in +// let d' = E(f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), gs) in +// let e' = E(h^(E(b, B(b.s, c.s)), E(c, B(b.s, c.s))), gs) in +// g^(d', e') +// +// (Lemma 3.) +// +// let gs = B(a.s, b.s, c.s) in +// let d' = f^(E(E(a, B(a.s, b.s)), gs), E(E(b, B(a.s, b.s)), gs)) in +// let e' = h^(E(E(b, B(b.s, c.s)), gs), E(E(c, B(b.s, c.s)), gs)) in +// g^(d', e') +// +// (Lemma 4. + Lemma 1. to simplify broadcasting function) +// +// let gs = B(a.s, b.s, c.s) in +// let d' = f^(E(a, gs), E(b, gs)) in +// let e' = h^(E(b, gs), E(c, gs)) in +// g^(d', e') +// +// (Simple rewrite) +// +// let gs = B(a.s, b.s, c.s) in +// let a' = E(a, gs) in +// let b' = E(b, gs) in +// let c' = E(c, gs) in +// let d' = f^(a', b') in +// let e' = h^(b', c') in +// g^(d', e') +// +// This example can be easily formalized to arbitrary DAGs using induction +// over topological ordering, similar to Lemma 2. Now, if broadcasting groups +// for all outputs have the same shape, then performing an expand to this size +// on all inputs will ensure that all intermediates on all paths to outputs +// will have the same shape, proving that the body of the kernel is valid. +// +// This shows the part until post-chunk-inputs. Extending it to pre-chunk-inputs +// is straightforward (needs a simple lemma for moving expands through chunks). + +// Register implementations of fused operators, so that we can reuse the fused graph +// to generate fallback code. 
+RegisterOperators reg_fused_operators({ + Operator( + prim::FusedConcat, + [](Node* node) { + int64_t dim = node->i(attr::dim); + int64_t num_inputs = node->inputs().size(); + return [dim, num_inputs](Stack& stack) { + auto result = at::cat( + fmap(last(stack, num_inputs), [](const IValue& i) { return i.toTensor(); }), + dim + ); + drop(stack, num_inputs); + pack(stack, std::move(result)); + return 0; + }; + }) +}); + +FusionHandleImpl::FusionHandleImpl( + std::shared_ptr _graph +, int device) +: device(device) +, fallback_code(_graph) +, graph(std::move(_graph)) +, input_broadcast_groups(getInputBroadcastGroups()) +, input_chunks(getInputChunkDescriptors()) +, kernels() { } + +std::atomic FusionHandleImpl::next_kernel_id {0}; + +static Node* usedInFusedChunk(Value* input) { + auto uses = input->uses(); + if (uses.size() == 1) { + Node *user = uses[0].user; + if (user->kind() == prim::ConstantChunk) { + return user; + } + } + return nullptr; +} + +auto FusionHandleImpl::getInputChunkDescriptors() -> std::vector { + std::vector descs; + descs.reserve(graph->inputs().size()); + for (Value* input : graph->inputs()) { + if (Node* chunk = usedInFusedChunk(input)) { + descs.emplace_back(chunk->i(attr::chunks), chunk->i(attr::dim)); + } else { + descs.emplace_back(1, 0); + } + } + return descs; +} + +// NB: this vector is really a set, but we want to keep it contiguous in memory for faster access +static std::vector getInputDependencies(Value* output) { + // Run a DFS traversal to find all inputs that affect a given output value + std::vector queue { output }; + std::unordered_set inputs; + std::unordered_set seen; + while (!queue.empty()) { + Value* val = queue.back(); queue.pop_back(); + Node* producer = val->node(); + if (producer->kind() == prim::Param) { + inputs.insert(val); + continue; + } + for (Value* input : producer->inputs()) { + if (/*bool inserted = */seen.insert(input).second) { + queue.push_back(input); + } + } + } + + // Convert Value* into offsets into the graph's input list + std::vector offsets; + offsets.reserve(inputs.size()); + for (Value* input : inputs) { + offsets.push_back(input->offset()); + } + + std::sort(offsets.begin(), offsets.end()); + return offsets; +} + +// See Note [Run-time shape checking code] for more explanation on the algorithm. 
+at::optional> FusionHandleImpl::canRunKernel(at::TensorList args) { + AT_CHECK(args.size() == input_chunks.size(), + "Expected ", input_chunks.size(), " arguments, but got ", args.size()); + + at::optional> map_size; + for (const auto & broadcast_group : input_broadcast_groups) { + if (!map_size) { + map_size = getMapSize(args, broadcast_group); + if (!map_size) { + return at::nullopt; + } + } else { + auto group_map_size = getMapSize(args, broadcast_group); + // NB: this checks that group_map_size is defined AND equal to map_size + if (map_size != group_map_size) { + return at::nullopt; + } + } + } + return map_size; +} + +std::unique_ptr FusionHandleImpl::compileSpec( + const FusionArgSpec& spec +, const std::vector& map_size) { + AnnotatedGraph agraph{*graph, device}; + + agraph.input_desc = spec.descs(); + // XXX: this assumes that fused kernels only operate on floating-point values inside + at::optional scalar_type; + for (TensorDesc& desc : agraph.input_desc) { + if (isFloatingType(desc.scalar_type)) { + scalar_type = desc.scalar_type; + break; + } + } + JIT_ASSERT(scalar_type); + + for (Value * output : graph->outputs()) { + std::vector sizes = map_size; + if (output->node()->kind() == prim::FusedConcat) { + sizes.at(output->node()->i(attr::dim)) *= output->node()->inputs().size(); + } + auto type = CompleteTensorType::create(*scalar_type, device, sizes); + agraph.output_desc.emplace_back(std::move(type)); + } + + std::string name = "kernel_" + std::to_string(next_kernel_id++); + FusedKernel* raw_func; + if (device != kCPUDevice) { + #if USE_CUDA_FUSER + raw_func = new cudafuser::CUDAFusedKernel(name, agraph); + #else + throw std::runtime_error("CUDA Fusion is not supported on this build."); + #endif // USE_CUDA_FUSER + } else { + raw_func = new cpufuser::CPUFusedKernel( + name + , agraph + , cpufuser::getFusionCompiler().getConfig()); + } + return std::unique_ptr(raw_func); +} + +// NB: args are mutated in this call. map_size is mutated too, but is restored to its original +// value before this function returns (it's an optimization). 
+void FusionHandleImpl::expandArgs(std::vector& args, std::vector& map_size) { + for (size_t i = 0; i < args.size(); ++i) { + auto& arg = args[i]; + auto& pdesc = input_chunks[i]; + if (pdesc.nSubtensors == 1) { + if (arg.sizes().equals(map_size)) continue; + arg = arg.expand(map_size); + } else { + map_size.at(pdesc.dim) *= pdesc.nSubtensors; + if (!arg.sizes().equals(map_size)) { + arg = arg.expand(map_size); + } + map_size.at(pdesc.dim) /= pdesc.nSubtensors; + } + } +} + +std::vector> FusionHandleImpl::getInputBroadcastGroups() { + std::unordered_set, torch::hash>> broadcast_groups; + for (Value* output : graph->outputs()) { + broadcast_groups.insert(getInputDependencies(output)); + } + return std::vector>{broadcast_groups.begin(), broadcast_groups.end()}; +} + +void FusionHandleImpl::run(Stack& stack) { + int64_t num_inputs = graph->inputs().size(); + auto args = fmap(last(stack, num_inputs), [](const IValue& i) { + return i.toTensor(); + }); + + auto maybe_map_size = canRunKernel(args); + if (!maybe_map_size) { + return runFallback(stack); + } + expandArgs(args, *maybe_map_size); + + FusionArgSpec spec{args}; + auto it = kernels.find(spec); + if (it == kernels.end()) { + std::tie(it, std::ignore) = kernels.emplace(spec, compileSpec(spec, *maybe_map_size)); + } + auto& fn = it->second; + + std::vector outputs; + fn->launch(args, outputs); + drop(stack, num_inputs); + stack.insert( + stack.end() + , std::make_move_iterator(outputs.begin()) + , std::make_move_iterator(outputs.end())); +} + +at::optional> FusionHandleImpl::getMapSize( + at::TensorList args +, at::IntList arg_subset) { + int64_t dim_after_broadcast = 0; + for (int64_t arg_idx : arg_subset) { + dim_after_broadcast = std::max(dim_after_broadcast, args[arg_idx].dim()); + } + // TODO: this keeps reallocating map_size at every iteration, but we know + // exactly how much storage do we need, so this could be fixed in-place at + // every step. We're just missing a few functions for ATen, but the fix + // should be straightforward. + // NB: we leave this uninitialized, because an empty size is trivially + // broadcastable to any other size. 
+ std::vector map_size; + for (size_t i = 0; i < arg_subset.size(); ++i) { + auto& arg = args.at(arg_subset[i]); + auto& chunk_desc = input_chunks.at(arg_subset[i]); + if (chunk_desc.nSubtensors == 1) { + try { + map_size = at::infer_size(map_size, arg.sizes()); + } catch (std::exception& e) { + return at::nullopt; + } + } else { + auto tensor_sizes = arg.sizes().vec(); + int64_t num_chunks = chunk_desc.nSubtensors; + int64_t dim = at::maybe_wrap_dim(chunk_desc.dim, tensor_sizes.size()); + if (tensor_sizes[dim] % num_chunks != 0) { + return at::nullopt; + } + tensor_sizes[dim] /= num_chunks; + try { + map_size = at::infer_size(map_size, tensor_sizes); + } catch (std::exception& e) { + return at::nullopt; + } + } + } + + return {map_size}; +} + +void FusionHandleImpl::runFallback(Stack& stack) { + InterpreterState(fallback_code).runOneStage(stack); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/common/fusion_handle_impl.h b/torch/csrc/jit/fusers/common/fusion_handle_impl.h new file mode 100644 index 00000000000000..cfc601ca385d21 --- /dev/null +++ b/torch/csrc/jit/fusers/common/fusion_handle_impl.h @@ -0,0 +1,72 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_arg_spec.h" +#include "torch/csrc/jit/fusers/common/fused_kernel.h" + +#include "torch/csrc/jit/stack.h" +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" + +#include "ATen/ATen.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { + +// FusionCompiler has very limited shape information available at the time getOrCompile +// is called, and this is why it can't really prepare the kernels at that time. Instead, +// it returns this object, which will take care of matching the run-time shapes to whatever +// kernels we have compiled already. +// +// Two configurations are considered eligible for the same fused kernel if: +// - the shapes satisfy graph invariants for our fused code (e.g. that all intermediate shapes +// are the same - see fusion_compiler.cpp for more details). 
+// - their FusionArgSpecs compare equal +struct FusionHandleImpl : public FusionHandle { + FusionHandleImpl( + std::shared_ptr _graph + , int device); + + void run(Stack& inputs); + +private: + struct PartitionInfo { + PartitionInfo(int64_t nsub, int64_t dim) + : nSubtensors(nsub), dim(dim) { }; + + int64_t nSubtensors; + int64_t dim; + }; + + void runFallback(Stack& stack); + void expandArgs(std::vector& args, std::vector& map_size); + at::optional> canRunKernel(at::TensorList args); + at::optional> getMapSize(at::TensorList args, at::IntList arg_subset); + std::vector> getInputBroadcastGroups(); + std::vector getInputChunkDescriptors(); + std::unique_ptr compileSpec( + const FusionArgSpec& spec, const std::vector& map_size); + + static std::atomic next_kernel_id; + + int device; + Code fallback_code; + std::shared_ptr graph; + std::vector> input_broadcast_groups; + std::vector input_chunks; + std::unordered_map< + FusionArgSpec + , std::unique_ptr + , torch::hash> kernels; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/partition_desc.h b/torch/csrc/jit/fusers/common/partition_desc.h new file mode 100644 index 00000000000000..b9825155f55ca1 --- /dev/null +++ b/torch/csrc/jit/fusers/common/partition_desc.h @@ -0,0 +1,48 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/jit/assertions.h" + +#include +#include +#include + +namespace torch { namespace jit { + +// Descriptor for chunk-ing an input tensor into subtensors +// OR concat-ing an output tensor from subtensors +struct PartitionDesc { + + PartitionDesc() + : nSubtensors(1), dim(0) {} + + PartitionDesc(const TensorDesc& desc, size_t nSubtensors, size_t dim) + : nSubtensors(nSubtensors), dim(dim) { + JIT_ASSERT(nSubtensors > 1); + std::vector cont = desc.contiguity; + if(dim > 0) { + // when we narrow the concatenated output/chunked input + // we make the size[dim] smaller while keeping the stride[dim] the same, + // meaning: stride[dim - 1] != stride[dim]*size[dim] + // so dim - 1 is no longer contiguous + cont[dim - 1] = false; + } + subtensorDesc.reset(new TensorDesc(desc.scalar_type, cont)); + } + + bool isNoop() const { + return nSubtensors == 1; + } + + size_t nSubtensors; // == 1 for tensors that should not be operated on via chunk/cat + size_t dim; // dimension along which the chunk/concat occurs + std::unique_ptr subtensorDesc; // descriptor for the subtensor, if it exists +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/tensor_desc.cpp b/torch/csrc/jit/fusers/common/tensor_desc.cpp new file mode 100644 index 00000000000000..5b1de58e3129dc --- /dev/null +++ b/torch/csrc/jit/fusers/common/tensor_desc.cpp @@ -0,0 +1,20 @@ +#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/jit/assertions.h" + +namespace torch { namespace jit { + +std::vector TensorDesc::findContiguous( + const at::IntList& sizes +, const at::IntList& strides) { + JIT_ASSERT(sizes.size() == strides.size()); + std::vector cont(sizes.size()); + for(size_t i = 0; i < sizes.size(); ++i) { + int64_t expected_stride = (i + 1 < sizes.size()) ? 
sizes[i+1]*strides[i+1] : 1; + cont[i] = strides[i] == expected_stride; + } + return cont; +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/common/tensor_desc.h b/torch/csrc/jit/fusers/common/tensor_desc.h new file mode 100644 index 00000000000000..ff5d640bfd6cbb --- /dev/null +++ b/torch/csrc/jit/fusers/common/tensor_desc.h @@ -0,0 +1,83 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/type.h" + +#include "torch/csrc/utils/hash.h" + +#include "ATen/ATen.h" + +#include +#include +#include + +namespace torch { namespace jit { + +// type information needed by the compiler for input/outputs +// contiguity[i] is true if the dim i is contiguous with dim i + 1. +// contiguity.back() == true means strides.back() == 1. +struct TensorDesc { + at::ScalarType scalar_type; + std::vector contiguity; + + TensorDesc(const at::ScalarType& type, const std::vector& contiguity) + : scalar_type{type}, contiguity{contiguity} { + if (contiguity.size() == 0) { + nDim_ = 0; + } else { + nDim_ = std::count(contiguity.begin(), contiguity.end(), false) + (lastIsContiguous() ? 1 : 0); + } + } + + TensorDesc(const at::ScalarType& type, const at::IntList& sizes, const at::IntList& strides) + : TensorDesc(type, TensorDesc::findContiguous(sizes, strides)) {} + + TensorDesc(const at::Tensor& t) + : TensorDesc(t.type().scalarType(), t.sizes(), t.strides()) {} + + TensorDesc(CompleteTensorTypePtr type) + : TensorDesc(type->scalarType(), type->sizes(), type->strides()) {} + + // number of dimensions after contiguity compression + size_t nDim() const { + return nDim_; + } + + // do we have inner stride == 1? + bool lastIsContiguous() const { + return contiguity.size() == 0 || contiguity.back(); + } + + static std::vector findContiguous( + const at::IntList& sizes, + const at::IntList& strides); + + bool operator==(const TensorDesc & desc) const { + return scalar_type == desc.scalar_type && contiguity == desc.contiguity; + } + + bool operator!=(const TensorDesc & desc) const { + return !(*this == desc); + } + + static size_t hash(const TensorDesc& spec) { + return torch::get_hash(spec.scalar_type, spec.nDim_, std::hash>{}(spec.contiguity)); + } + +private: + size_t nDim_; +}; + +inline std::ostream& operator<<(std::ostream& out, const TensorDesc& d) { + out << d.scalar_type << "["; + for (auto b : d.contiguity) + out << b << ";"; + out << "]"; + return out; +} + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/tensor_info.h b/torch/csrc/jit/fusers/common/tensor_info.h new file mode 100644 index 00000000000000..9798a44bd877e8 --- /dev/null +++ b/torch/csrc/jit/fusers/common/tensor_info.h @@ -0,0 +1,25 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include + +namespace torch { namespace jit { + +// Host-side view of TensorInfo (that visivle for the kernel is defined above). +// Note dims[0] - we need to dynamically allocate the dims. 
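TensorDesc::findContiguous above flags dim i as contiguous when stride[i] == size[i+1] * stride[i+1] (and the last dim when its stride is 1); nDim() then counts how many dimensions survive after merging contiguous runs. A standalone check of that rule on two concrete layouts (plain vectors, no ATen; not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>

// Same rule as TensorDesc::findContiguous: dim i is "contiguous" when
// stride[i] == size[i+1] * stride[i+1]; the last dim when its stride is 1.
std::vector<bool> findContiguous(const std::vector<int64_t>& sizes,
                                 const std::vector<int64_t>& strides) {
  std::vector<bool> cont(sizes.size());
  for (size_t i = 0; i < sizes.size(); ++i) {
    const int64_t expected =
        (i + 1 < sizes.size()) ? sizes[i + 1] * strides[i + 1] : 1;
    cont[i] = (strides[i] == expected);
  }
  return cont;
}

int main() {
  // Dense {2, 3, 4} tensor: strides {12, 4, 1} -> all flags true,
  // so nDim() compresses it to a single dimension (0 breaks + contiguous tail).
  for (bool b : findContiguous({2, 3, 4}, {12, 4, 1})) std::cout << b << ' ';
  std::cout << '\n';  // prints: 1 1 1

  // Every-other-element view in the last dim: strides {12, 4, 2} ->
  // {true, true, false}; one break and a non-unit tail, so nDim() is still 1,
  // but the compressed stride is 2 rather than 1.
  for (bool b : findContiguous({2, 3, 2}, {12, 4, 2})) std::cout << b << ' ';
  std::cout << '\n';  // prints: 1 1 0
}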
+struct TensorInfo { + + uint32_t* sizes(size_t nDim) { return &sizes_strides[0]; } + uint32_t* strides(size_t nDim) { return &sizes_strides[nDim]; } + + void* data; + #pragma GCC diagnostic ignored "-Wpedantic" + uint32_t sizes_strides[0]; + #pragma GCC diagnostic pop +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/cpu/dynamic_library.h b/torch/csrc/jit/fusers/cpu/dynamic_library.h new file mode 100644 index 00000000000000..ec16698a75b960 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/dynamic_library.h @@ -0,0 +1,42 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "dlfcn.h" + +namespace torch { namespace jit { namespace cpufuser { + +static void* checkDL(void* x) { + if (!x) { + AT_ERROR("error in dlopen or dlsym: ", dlerror()); + } + + return x; +} + +struct DynamicLibrary { + TH_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); + + DynamicLibrary(const char* name) { + handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW)); + } + + void* sym(const char* name) { + JIT_ASSERT(handle); + return checkDL(dlsym(handle, name)); + } + + ~DynamicLibrary() { + if (!handle) return; + dlclose(handle); + } + +private: + void* handle = nullptr; +}; + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/fused_kernel.cpp b/torch/csrc/jit/fusers/cpu/fused_kernel.cpp new file mode 100644 index 00000000000000..277ec7b0c20975 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fused_kernel.cpp @@ -0,0 +1,95 @@ +#include "torch/csrc/jit/fusers/cpu/fused_kernel.h" + +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" +#include "torch/csrc/jit/fusers/cpu/temp_file.h" +#include "torch/csrc/jit/fusers/cpu/dynamic_library.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" + +#include "torch/csrc/jit/assertions.h" +#include "torch/csrc/jit/code_template.h" + +#include +#include +#include +#include +#include + + +namespace torch { namespace jit { namespace cpufuser { + +static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so"; +static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp"; + +// NB: -march=native not supported on PPC64 g++. It's a bit annoying +// to do a configure-style test to decide whether or not the g++ +// actually supports it or not, so we heuristically use the host +// compiler to predict if the runtime compiler supports the option we +// want. This probably won't work if you're cross-compiling. +// NB: -march=native is disabled because it has caused problems where +// compiler and assembler do not agree on what native instruction they +// understand for AVX512. When we need better CPU performance this +// optimization can be re-enabled by tracking down the platforms where +// this error occurs and only selectively disabling it. +static const std::string compile_string = + "\"${cxx}\" -O3 -g " +#ifndef __PPC64__ +// "-march=native " +#endif + "-std=c++11 -fPIC ${fopenmp} -shared \"${cpp_file}\" -o \"${so_file}\" -lm"; + +static void runCompiler( + CPUFusionCompilerConfig& config +, const std::string& cpp_file +, const std::string& so_file) { + TemplateEnv env; + env.s("cxx", config.cxx); + env.s("fopenmp", config.openmp ? 
"-fopenmp" : ""); + env.s("cpp_file",cpp_file); + env.s("so_file",so_file); + std::string result = format(compile_string, env); + int r = system(result.c_str()); + if (config.openmp && r != 0) { + std::cerr << "warning: pytorch jit fuser failed to compile with openmp, trying without it...\n"; + config.openmp = false; // disable for future compiles + return runCompiler(config, cpp_file, so_file); + } + JIT_ASSERTM(r == 0, "Failed to compile a fused CPU kernel"); +} + +static const std::string disas_string = + "objdump -M intel -d \"${so_file}\""; +static void disas(const std::string& so_file) { + TemplateEnv env; + env.s("so_file", so_file); + std::string cmd = format(disas_string, env); + int r = system(cmd.c_str()); + JIT_ASSERT(r == 0); +} + +CPUFusedKernel::CPUFusedKernel( + const std::string& name +, AnnotatedGraph& agraph +, CPUFusionCompilerConfig& config) +: FusedKernel(name, agraph) { + TempFile so_file(so_template, 3); + TempFile cpp_file(cpp_template, 4); + + std::stringstream cu; + std::tie(chunk_desc, concat_desc, has_random) = emitCompilationUnit(cu, name, agraph, false); + JIT_ASSERT(!has_random); + compilation_unit = cu.str(); + cpp_file.write(compilation_unit); + cpp_file.sync(); + runCompiler(config, cpp_file.name(), so_file.name()); + if (config.debug) { + disas(so_file.name()); + } + so_lib.reset(new DynamicLibrary(so_file.name().c_str())); + #pragma GCC diagnostic ignored "-Wpedantic" + kernel = reinterpret_cast(so_lib->sym(name.c_str())); + #pragma GCC diagnostic pop +} + +} // namespace cpufuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cpu/fused_kernel.h b/torch/csrc/jit/fusers/cpu/fused_kernel.h new file mode 100644 index 00000000000000..d32f4c351733bf --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fused_kernel.h @@ -0,0 +1,45 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" +#include "torch/csrc/jit/fusers/cpu/dynamic_library.h" +#include "torch/csrc/jit/fusers/common/fused_kernel.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" + +#include "ATen/ATen.h" + +#include +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +struct CPUFusedKernel : public ::torch::jit::FusedKernel { + CPUFusedKernel( + const std::string& name + , AnnotatedGraph& agraph + , CPUFusionCompilerConfig& config); + +protected: + virtual at::Backend backend() const override { + return at::Backend::CPU; + } + + virtual uint64_t get_rand_offset(uint32_t numel) override { + return numel; + } + + virtual void launch_raw(uint32_t numel, void** arguments) override { + kernel(numel, arguments); + } + + std::unique_ptr so_lib; + void (*kernel)(uint32_t, void**) = nullptr; +}; + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/fusion_compiler.cpp b/torch/csrc/jit/fusers/cpu/fusion_compiler.cpp new file mode 100644 index 00000000000000..9285ae4caa179b --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fusion_compiler.cpp @@ -0,0 +1,85 @@ +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/passes/shape_analysis.h" // EraseShapeInformation +#include "torch/csrc/utils/functional.h" //fmap +#include "torch/csrc/jit/ivalue.h" // IValue +#include "torch/csrc/jit/code_template.h" +#include "torch/csrc/jit/assertions.h" + +#include 
+#include +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +CPUFusionCompiler& getFusionCompiler() { + static CPUFusionCompiler compiler; + return compiler; +} + +static const std::string check_exists_string = "which '${program}' > /dev/null"; +static bool programExists(const std::string& program) { + TemplateEnv env; + env.s("program", program); + std::string cmd = format(check_exists_string, env); + return 0 == system(cmd.c_str()); +} + +CPUFusionCompiler::CPUFusionCompiler() { + const char* cxx_env = getenv("CXX"); + if (cxx_env != nullptr) { + config_.cxx = cxx_env; + } + + if (!programExists(config_.cxx)) { + config_.cxx = ""; + } + + const char* debug_env = getenv("PYTORCH_FUSION_DEBUG"); + config_.debug = debug_env && atoi(debug_env) != 0; +} + +std::shared_ptr CPUFusionCompiler::getFusionHandle(Node* fusion_group) { + int device = fusion_group->i(attr::device); + JIT_ASSERT(device == kCPUDevice); + auto graph = fusion_group->g(attr::Subgraph)->copy(); + EraseShapeInformation(*graph); + std::stringstream key; + key << "device " << device << "\n"; + key << *graph << "\n"; + std::string key_ = key.str(); + auto it = cache_map.find(key_); + if (it == cache_map.end()) { + std::tie(it, std::ignore) = cache_map.emplace(key_, std::make_shared(graph, device)); + } + return it->second; +} + +std::vector CPUFusionCompiler::debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + auto wrapper_graph = std::make_shared(); + Node* fusion_group = wrapper_graph->insertNode(wrapper_graph->createFusionGroup(device)); + fusion_group->g_(attr::Subgraph, graph.copy()); + for (size_t i = 0; i < graph.inputs().size(); ++i) { + fusion_group->addInput(wrapper_graph->addInput()); + } + for (size_t i = 0; i < graph.outputs().size(); ++i) { + wrapper_graph->registerOutput(fusion_group->addOutput()); + } + auto cache = getFusionHandle(fusion_group); + Stack stack = fmap(inputs); + cache->run(stack); + return fmap(stack, [](const IValue& iv) { return iv.toTensor(); }); +} + + + +} // namespace cpufuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cpu/fusion_compiler.h b/torch/csrc/jit/fusers/cpu/fusion_compiler.h new file mode 100644 index 00000000000000..d41fbc7de4b550 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fusion_compiler.h @@ -0,0 +1,55 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +struct CPUFusionCompilerConfig { + std::string cxx = "g++"; // compiler location + bool debug = false; // emit debugging information about fusions + bool openmp = true; +}; + +struct CPUFusionCompiler { + TH_DISALLOW_COPY_AND_ASSIGN(CPUFusionCompiler); + + CPUFusionCompiler(); + + ~CPUFusionCompiler() = default; + + std::shared_ptr getFusionHandle(Node* fusion_group); + + std::vector debugLaunchGraph( + Graph& graph + , int device + , at::ArrayRef inputs); + + CPUFusionCompilerConfig& getConfig() { + return config_; + } + +private: + CPUFusionCompilerConfig config_; + std::unordered_map> cache_map; +}; + +CPUFusionCompiler& getFusionCompiler(); + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/interface.h 
b/torch/csrc/jit/fusers/cpu/interface.h new file mode 100644 index 00000000000000..01f6feb8e2dd8d --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/interface.h @@ -0,0 +1,32 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" + +#include "ATen/ATen.h" + +#include "torch/csrc/jit/ir.h" + +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +inline std::shared_ptr getFusionHandle(Node* fusion_group) { + return getFusionCompiler().getFusionHandle(fusion_group); +} + +std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + return getFusionCompiler().debugLaunchGraph(graph, device, inputs); +} + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/resource_strings.h b/torch/csrc/jit/fusers/cpu/resource_strings.h new file mode 100644 index 00000000000000..60c1c0faaa4fea --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/resource_strings.h @@ -0,0 +1,56 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/code_template.h" + +namespace torch { namespace jit { namespace cpufuser { + +/*with type_as not checking type of its input, a fusion group can have non-fp32 tensor as input. +Correct code for this case is generated, however, nvrtc does not know how to handle int*_t integer types, +so typedefs help it handle those cases*/ + +auto type_declarations_template = CodeTemplate(R"( +typedef ${IndexType} IndexType; +template +struct TensorInfo { + T* data; + IndexType sizes[N]; + IndexType strides[N]; +}; +template +struct TensorInfo { + T * data; +}; +)"); + +auto cpu_compilation_unit_template = CodeTemplate(R"( +#include +#include +#include +${type_declarations} + +#define OMP_THRESHOLD 100000 +static void ${kernelName}_kernel(IndexType totalElements, ${formals}) { + #pragma omp parallel for if(totalElements > OMP_THRESHOLD) + for (IndexType linearIndex = 0; + linearIndex < totalElements; + linearIndex += 1) { + // Convert `linearIndex` into an offset of tensor: + ${tensorOffsets} + // calculate the results + ${kernelBody} + } +} + +extern "C" +void ${kernelName}(IndexType totalElements, void ** args) { + ${kernelName}_kernel(totalElements ${,argument_loads}); +} +)"); + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/temp_file.h b/torch/csrc/jit/fusers/cpu/temp_file.h new file mode 100644 index 00000000000000..2b2c21290c9ff5 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/temp_file.h @@ -0,0 +1,70 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/assertions.h" + +#include "torch/csrc/WindowsTorchApiMacro.h" +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + + +#include "unistd.h" + +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +struct TempFile { + TH_DISALLOW_COPY_AND_ASSIGN(TempFile); + + TempFile(const std::string& t, int suffix) { + // mkstemps edits its first argument in places + // so we make a copy of the string here, including null terminator + std::vector tt(t.c_str(), t.c_str() + t.size() + 1); + int fd = mkstemps(tt.data(), suffix); + JIT_ASSERT(fd != -1); + file_ = fdopen(fd, "r+"); + + // - 1 becuase tt.size() includes the null terminator, + // but std::string does not expect one + name_ 
= std::string(tt.begin(), tt.end() - 1); + } + + const std::string& name() const { + return name_; + } + + void sync() { + fflush(file_); + } + + void write(const std::string & str) { + size_t result = fwrite(str.c_str(), 1, str.size(), file_); + JIT_ASSERT(str.size() == result); + } + + FILE* file() { + return file_; + } + + ~TempFile() { + if (file_ != nullptr) { + // unlink first to ensure another mkstemps doesn't + // race between close and unlink + unlink(name_.c_str()); + fclose(file_); + } + } +private: + FILE* file_ = nullptr; + std::string name_; +}; + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cuda/fused_kernel.cpp b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp new file mode 100644 index 00000000000000..90624307123b56 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp @@ -0,0 +1,105 @@ +#include "torch/csrc/jit/fusers/cuda/fused_kernel.h" + +#include "torch/csrc/jit/resource_guard.h" + +#include "ATen/cuda/CUDAContext.h" +#include "THC/THC.h" +#include "THC/THCGenerator.hpp" +#include "torch/csrc/cuda/cuda_check.h" + +#include "nvrtc.h" +#include "cuda.h" +#include "cuda_runtime.h" + +#include +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +void checkCUDAVersion(const cudaDeviceProp& prop) { + if ((prop.major >= 6 && CUDA_VERSION < 8000) || + (prop.major >= 7 && CUDA_VERSION < 9000)) { + std::stringstream err_string; + err_string << "In CUDAFusedKernel, PyTorch compiled with insufficient CUDA version: " + << CUDA_VERSION << " for the current GPU device " << prop.name + << " with device capability " << prop.major << "." << prop.minor; + throw std::runtime_error(err_string.str()); + } +} + +CUDAFusedKernel::CUDAFusedKernel( + const std::string& name +, AnnotatedGraph& agraph) +: FusedKernel(name, agraph) { + at::DeviceGuard device_guard(agraph.device); + + TORCH_CUDA_CHECK(cudaGetDeviceProperties(&prop, agraph.device)); + checkCUDAVersion(prop); + + std::stringstream cu; + std::tie(chunk_desc, concat_desc, has_random) = emitCompilationUnit(cu, name, agraph, true); + compilation_unit = cu.str(); + nvrtcProgram program; + TORCH_NVRTC_CHECK(nvrtcCreateProgram(&program, compilation_unit.c_str(), nullptr, 0, nullptr, nullptr)); + + std::string compute = "--gpu-architecture=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); + std::vector args = {"--std=c++11", compute.c_str(), "-default-device"}; + nvrtcResult result = nvrtcCompileProgram(program, args.size(), args.data()); + if (result == NVRTC_ERROR_COMPILATION) { + size_t logsize; + nvrtcGetProgramLogSize(program, &logsize); + std::vector log(logsize); + nvrtcGetProgramLog(program, log.data()); + cu << log.data(); + throw std::runtime_error(cu.str()); + } + ResourceGuard holdProgram([&] { + TORCH_NVRTC_CHECK(nvrtcDestroyProgram(&program)); + }); + TORCH_NVRTC_CHECK(result); + + size_t ptx_size; + TORCH_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size)); + ptx.resize(ptx_size); + TORCH_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data())); + + TORCH_CU_CHECK(cuModuleLoadData(&module, ptx.data())); + TORCH_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str())); + + TORCH_CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor( + &maxBlocks, function, 128, 0)); + maxBlocks *= prop.multiProcessorCount; +} + +void CUDAFusedKernel::launch_raw(uint32_t numel, void** arguments) { + int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); + + //std::cout << "maxBlocks = " << 
maxBlocks << " needed blocks: " << ceilDiv(numel,blockSize) + // << " numblocks = " << numBlocks; + + // it is possible that this is the first cuda call on this thread + // so make sure we initialize the Driver API's context + // cudaFree(0) accomplishes this. + CUcontext pctx = 0; + TORCH_CU_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + std::unique_lock cudaFreeMutexLock( + *(THCCachingAllocator_getCudaFreeMutex())); + cudaFree(0); + } + CUstream stream = at::cuda::getCurrentCUDAStream(); + TORCH_CU_CHECK(cuLaunchKernel( + function, + numBlocks, 1, 1, + blockSize, 1, 1, + 0, stream, + arguments, + nullptr)); +} + +} // namespace cudafuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cuda/fused_kernel.h b/torch/csrc/jit/fusers/cuda/fused_kernel.h new file mode 100644 index 00000000000000..bd6a0ec8b95606 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fused_kernel.h @@ -0,0 +1,59 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/fused_kernel.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" + +#include "ATen/ATen.h" + +#include "nvrtc.h" +#include "cuda.h" +#include "cuda_runtime.h" + +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +struct CUDAFusedKernel : public ::torch::jit::FusedKernel { + CUDAFusedKernel(const std::string& name, AnnotatedGraph& agraph); + + virtual ~CUDAFusedKernel() override { + cuModuleUnload(module); + } + +protected: + virtual at::Backend backend() const override { + return at::Backend::CUDA; + } + + int ceilDiv(int a, int b) { + return (a + b - 1) / b; + } + + virtual uint64_t get_rand_offset(uint32_t numel) override { + int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); + return 4 * (ceil(numel/(4 * blockSize * numBlocks)) + 1); + } + + virtual void launch_raw(uint32_t numel, void ** arguments) override; + + std::vector ptx; + CUmodule module; + CUfunction function; + + // we record prop/device so if they are availiable for launch heuristics + // querying at launch is too slow for device properties. 
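The launch configuration above fixes blockSize at 128 and caps the grid at the occupancy-derived maxBlocks (cuOccupancyMaxActiveBlocksPerMultiprocessor times the SM count), so very large tensors loop inside the grid-stride kernel rather than launching ever more blocks. The launch path also calls cudaFree(0) when no driver context is current yet, a conventional way to force lazy CUDA context initialization before using the driver API. A standalone sketch of the sizing arithmetic (the 16 blocks/SM and 20-SM device below are made-up numbers, not taken from the patch):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Mirrors CUDAFusedKernel's grid sizing: enough blocks to cover numel,
// but never more than the occupancy-derived cap.
inline int ceilDiv(int a, int b) { return (a + b - 1) / b; }

int numBlocksFor(uint32_t numel, int blockSize, int maxBlocks) {
  return std::min(maxBlocks, ceilDiv(static_cast<int>(numel), blockSize));
}

int main() {
  const int blockSize = 128;
  // Suppose occupancy allows 16 resident blocks per SM on a 20-SM device.
  const int maxBlocks = 16 * 20;  // 320
  std::cout << numBlocksFor(1 << 12, blockSize, maxBlocks) << '\n';  // 32 blocks cover 4096 elements
  std::cout << numBlocksFor(1 << 20, blockSize, maxBlocks) << '\n';  // capped at 320; the kernel grid-strides
}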
+ int device; + cudaDeviceProp prop; + int blockSize = 128; + int maxBlocks; +}; + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/cuda/fusion_compiler.cpp b/torch/csrc/jit/fusers/cuda/fusion_compiler.cpp new file mode 100644 index 00000000000000..3add518bdeaf21 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fusion_compiler.cpp @@ -0,0 +1,71 @@ +#include "torch/csrc/jit/fusers/cuda/fusion_compiler.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/passes/shape_analysis.h" // EraseShapeInformation +#include "torch/csrc/utils/functional.h" //fmap +#include "torch/csrc/jit/ivalue.h" // IValue + +#include "torch/csrc/jit/assertions.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { +CUDAFusionCompiler& getFusionCompiler() { + static CUDAFusionCompiler compiler; + return compiler; +} + +std::shared_ptr CUDAFusionCompiler::getFusionHandle( + Node* fusion_group) { + // verifies on GPU + const auto device = fusion_group->i(attr::device); + JIT_ASSERT(device != kCPUDevice); + + auto graph = fusion_group->g(attr::Subgraph)->copy(); + EraseShapeInformation(*graph); + std::stringstream key; + key << "device " << device << "\n"; + key << *graph << "\n"; + std::string key_ = key.str(); + auto it = cache_map.find(key_); + if (it == cache_map.end()) { + std::tie(it, std::ignore) = + cache_map.emplace( + key_ + , std::make_shared(graph, device)); + } + + return it->second; +} + +std::vector CUDAFusionCompiler::debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + auto wrapper_graph = std::make_shared(); + Node* fusion_group = + wrapper_graph->insertNode(wrapper_graph->createFusionGroup(device)); + fusion_group->g_(attr::Subgraph, graph.copy()); + + for (size_t i = 0; i < graph.inputs().size(); ++i) { + fusion_group->addInput(wrapper_graph->addInput()); + } + + for (size_t i = 0; i < graph.outputs().size(); ++i) { + wrapper_graph->registerOutput(fusion_group->addOutput()); + } + + auto cache = getFusionHandle(fusion_group); + Stack stack = fmap(inputs); + cache->run(stack); + return fmap(stack, [](const IValue& iv) { return iv.toTensor(); }); +} + +} // namespace cudafuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cuda/fusion_compiler.h b/torch/csrc/jit/fusers/cuda/fusion_compiler.h new file mode 100644 index 00000000000000..800ea2150dfecc --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fusion_compiler.h @@ -0,0 +1,44 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +struct CUDAFusionCompiler { + TH_DISALLOW_COPY_AND_ASSIGN(CUDAFusionCompiler); + + CUDAFusionCompiler() = default; + + ~CUDAFusionCompiler() = default; + + std::shared_ptr getFusionHandle(Node* fusion_group); + + std::vector debugLaunchGraph( + Graph& graph + , int device + , at::ArrayRef inputs); + +private: + std::unordered_map> cache_map; +}; + +CUDAFusionCompiler& getFusionCompiler(); + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER diff --git 
a/torch/csrc/jit/fusers/cuda/interface.h b/torch/csrc/jit/fusers/cuda/interface.h new file mode 100644 index 00000000000000..4a6ee3f8e5aa36 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/interface.h @@ -0,0 +1,32 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/cuda/fusion_compiler.h" + +#include "torch/csrc/jit/ir.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +inline std::shared_ptr getFusionHandle(Node* fusion_group) { + return getFusionCompiler().getFusionHandle(fusion_group); +} + +std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + return getFusionCompiler().debugLaunchGraph(graph, device, inputs); +} + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER \ No newline at end of file diff --git a/torch/csrc/jit/fusers/cuda/resource_strings.h b/torch/csrc/jit/fusers/cuda/resource_strings.h new file mode 100644 index 00000000000000..0063288721d727 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/resource_strings.h @@ -0,0 +1,197 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/code_template.h" + +namespace torch { namespace jit { namespace cudafuser { + +/*with type_as not checking type of its input, a fusion group can have non-fp32 tensor as input. +Correct code for this case is generated, however, nvrtc does not know how to handle int*_t integer types, +so typedefs help it handle those cases*/ + +auto type_declarations_template = CodeTemplate(R"( +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef short int int16_t; +typedef long long int int64_t; +${HalfHeader} +${RandHeader} + +typedef ${IndexType} IndexType; +template +struct TensorInfo { + T* data; + IndexType sizes[N]; + IndexType strides[N]; +}; +template +struct TensorInfo { + T * data; +}; +)"); + +// We rewrite the code for philox RNG from curand as nvrtc couldn't resolve the +// curand header correctly. 
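The inlined Philox class that follows ends with a uniform() helper: the 32-bit draw is scaled by M_RAN_INVM32 = 2.3283064e-10f, which is 2^-32 = 1/4294967296 ≈ 2.3283064365e-10, so counter outputs land in [0, 1). A quick host-side check of that constant (standalone, not part of the generated kernel string):

#include <cstdint>
#include <cstdio>

// 2^-32 as a float literal; this is the M_RAN_INVM32 used by the generated kernel.
constexpr float kInv2Pow32 = 2.3283064e-10f;

float uniform(uint32_t x) { return x * kInv2Pow32; }

int main() {
  std::printf("%.10e\n", 1.0 / 4294967296.0);  // 2.3283064365e-10, i.e. 2^-32
  std::printf("%f\n", uniform(1u << 31));      // 0.500000: half the counter range maps to 0.5
}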
+constexpr auto rand_support_literal = R"( + + class Philox { + public: + __device__ inline Philox(unsigned long long seed, + unsigned long long subsequence, + unsigned long long offset) { + key.x = (unsigned int)seed; + key.y = (unsigned int)(seed >> 32); + counter = make_uint4(0, 0, 0, 0); + counter.z = (unsigned int)(subsequence); + counter.w = (unsigned int)(subsequence >> 32); + STATE = 0; + incr_n(offset / 4); + } + + __device__ inline unsigned long operator()() { + if(STATE == 0) { + uint4 counter_ = counter; + uint2 key_ = key; + for(int i = 0; i < 9; i++) { + counter_ = single_round(counter_, key_); + key_.x += (kPhilox10A); key_.y += (kPhilox10B); + } + output = single_round(counter_, key_); + incr(); + } + unsigned long ret; + switch(STATE) { + case 0: ret = output.x; break; + case 1: ret = output.y; break; + case 2: ret = output.z; break; + case 3: ret = output.w; break; + } + STATE = (STATE + 1) % 4; + return ret; + } + + private: + uint4 counter; + uint4 output; + uint2 key; + unsigned int STATE; + __device__ inline void incr_n(unsigned long long n) { + unsigned int nlo = (unsigned int)(n); + unsigned int nhi = (unsigned int)(n >> 32); + counter.x += nlo; + if (counter.x < nlo) + nhi++; + counter.y += nhi; + if (nhi <= counter.y) + return; + if (++counter.z) + return; + ++counter.w; + } + __device__ inline void incr() { + if (++counter.x) + return; + if (++counter.y) + return; + if (++counter.z) + return; + ++counter.w; + } + __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, + unsigned int *result_high) { + *result_high = __umulhi(a, b); + return a*b; + } + + __device__ inline uint4 single_round(uint4 ctr, uint2 key) { + unsigned int hi0; + unsigned int hi1; + unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); + unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); + + uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; + return ret; + } + + static const unsigned long kPhilox10A = 0x9E3779B9; + static const unsigned long kPhilox10B = 0xBB67AE85; + static const unsigned long kPhiloxSA = 0xD2511F53; + static const unsigned long kPhiloxSB = 0xCD9E8D57; + }; + + // Inverse of 2^32. + #define M_RAN_INVM32 2.3283064e-10f + __device__ __inline__ float uniform(unsigned int x) { + return x * M_RAN_INVM32; + } +)"; + +constexpr auto rand_param = ",unsigned long long seed, unsigned long long offset"; + +constexpr auto rand_init = R"( + int idx = blockIdx.x*blockDim.x + threadIdx.x; + Philox rnd(seed, idx, offset); +)"; + +auto cuda_compilation_unit_template = CodeTemplate(R"( +${type_declarations} + +extern "C" __global__ +void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) { + ${RandInit} + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of tensor: + ${tensorOffsets} + // calculate the results + ${kernelBody} + } +} +)"); + + +// This snippet enables half support in the jit. Following the pattern for +// reductions, fp16 input data is immediately upconverted to float +// with __half2float(). All mathematical operations are done on float +// values, and if needed the intermediate float representation is +// converted to half with __float2half() when writing to a half tensor. 
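The half-support header below implements exactly the pattern described in the comment above. For comparison, the same load-as-half / compute-in-float / store-as-half pattern written against the regular CUDA toolkit header cuda_fp16.h (a sketch for nvcc; the nvrtc-compiled kernel string defines its own __half instead, presumably because toolkit headers are not readily resolvable under nvrtc, as with the curand case above):

#include <cuda_fp16.h>

// Elementwise y = x * 2 + 1 on half data: load as half, upconvert to float,
// do the math in float, downconvert on the store.
__global__ void scale_shift_half(const __half* x, __half* y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  for (; i < n; i += gridDim.x * blockDim.x) {  // grid-stride loop, as in the fused kernels
    float v = __half2float(x[i]);
    y[i] = __float2half(v * 2.0f + 1.0f);
  }
}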
+constexpr auto half_support_literal = R"( +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) +#if defined(__cplusplus) + struct __align__(2) __half { + __host__ __device__ __half() { } + + protected: + unsigned short __x; + }; + + /* All intrinsic functions are only available to nvcc compilers */ + #if defined(__CUDACC__) + /* Definitions of intrinsics */ + __device__ __half __float2half(const float f) { + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; + } + + __device__ float __half2float(const __half h) { + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); + return val; + } + #endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ +#undef __HALF_TO_US +#undef __HALF_TO_CUS + +typedef __half half; +)"; + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER \ No newline at end of file diff --git a/torch/csrc/jit/fusers/interface.cpp b/torch/csrc/jit/fusers/interface.cpp new file mode 100644 index 00000000000000..0b175730f8cfde --- /dev/null +++ b/torch/csrc/jit/fusers/interface.cpp @@ -0,0 +1,81 @@ +#include "torch/csrc/jit/fusers/interface.h" + +#include "torch/csrc/jit/fusers/Config.h" + +#if USE_CPU_FUSER + #include "torch/csrc/jit/fusers/cpu/interface.h" +#endif // USE_CPU_FUSER + +#if USE_CUDA_FUSER + #include "torch/csrc/jit/fusers/cuda/interface.h" +#endif // USE_CUDA_FUSER + +#include + +namespace torch { namespace jit { + +namespace detail { + +bool cpu_fuser_enabled = false; + +} // namespace detail + +// Pure virtual destructor definition +FusionHandle::~FusionHandle() { } + +std::shared_ptr getFusionHandle(Node* fusion_group) { + const auto device = fusion_group->i(attr::device); + if (device == kCPUDevice) { + #if USE_CPU_FUSER + return cpufuser::getFusionHandle(fusion_group); + #endif + throw std::runtime_error("CPU fusion is not supported on this build."); + } + + #if USE_CUDA_FUSER + return cudafuser::getFusionHandle(fusion_group); + #endif // USE_CUDA_FUSER + + throw std::runtime_error("CUDA fusion is not supported on this build."); +} + +bool canFuseOnCPU() { + #if USE_CPU_FUSER + return detail::cpu_fuser_enabled; + #endif // USE_CPU_FUSER + + return false; +} + +bool canFuseOnGPU() { + #if USE_CUDA_FUSER + return true; + #endif // USE_CUDA_FUSER + + return false; +} + +void overrideCanFuseOnCPU(bool value) { + detail::cpu_fuser_enabled = value; +} + +std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + if (device == kCPUDevice) { + #if USE_CPU_FUSER + return cpufuser::debugLaunchGraph(graph, device, inputs); + #endif // USE_CPU_FUSER + throw std::runtime_error("CPU fusion is not supported on this build."); + } + + #if USE_CUDA_FUSER + return cudafuser::debugLaunchGraph(graph, device, inputs); + #endif // USE_CUDA_FUSER + + throw std::runtime_error("CUDA fusion is not supported on this build."); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/interface.h b/torch/csrc/jit/fusers/interface.h new file mode 100644 index 00000000000000..cd4956a8870d87 --- /dev/null +++ b/torch/csrc/jit/fusers/interface.h @@ -0,0 +1,37 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/stack.h" + +#include "torch/csrc/WindowsTorchApiMacro.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace torch { namespace jit { + +constexpr int kCPUDevice = -1; + +struct TORCH_API FusionHandle { + virtual 
void run(Stack& inputs) = 0; + + virtual ~FusionHandle() = 0; +}; + +TORCH_API std::shared_ptr getFusionHandle(Node* fusion_group); + +TORCH_API bool canFuseOnCPU(); +TORCH_API bool canFuseOnGPU(); + +// CPU fuser is disabled by default, but we still want to test it. +TORCH_API void overrideCanFuseOnCPU(bool value); + +TORCH_API std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp deleted file mode 100644 index e21d00f00f40de..00000000000000 --- a/torch/csrc/jit/fusion_compiler.cpp +++ /dev/null @@ -1,1615 +0,0 @@ -#ifndef _WIN32 -#include "torch/csrc/jit/fusion_compiler.h" - -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/code_template.h" -#include "torch/csrc/jit/resource_guard.h" -#include "torch/csrc/jit/constants.h" -#include "torch/csrc/jit/passes/shape_analysis.h" -#include "torch/csrc/jit/custom_operator.h" - -#include "torch/csrc/utils/disallow_copy.h" -#include "torch/csrc/variable_tensor_functions.h" -#include "torch/csrc/utils/hash.h" -#include - -#include "ATen/ATen.h" -#include "ATen/ExpandUtils.h" -#include "ATen/WrapDimUtils.h" - -#ifdef USE_CUDA -#include "ATen/cuda/CUDAContext.h" -#include "THC/THC.h" -#include -#include "torch/csrc/cuda/cuda_check.h" -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef USE_CUDA -THCGenerator* THCRandom_getGenerator(THCState* state); -#endif - -namespace torch { namespace jit { - -std::vector TensorDesc::findContiguous( - const at::IntList& sizes, - const at::IntList& strides) { - JIT_ASSERT(sizes.size() == strides.size()); - std::vector cont(sizes.size()); - for(size_t i = 0; i < sizes.size(); ++i) { - int64_t expected_stride = (i + 1 < sizes.size()) ? 
sizes[i+1]*strides[i+1] : 1; - cont[i] = strides[i] == expected_stride; - } - return cont; -} - -// Descriptor for chunk-ing an input tensor into subtensors -// OR concat-ing an output tensor from subtensors -struct PartitionDesc { - size_t nSubtensors; // == 1 for tensors that should not be operated on via chunk/cat - size_t dim; // dimension along which the chunk/concat occurs - std::unique_ptr subtensorDesc; // descriptor for the subtensor, if it exists - PartitionDesc() - : nSubtensors(1), dim(0) {} - - PartitionDesc(const TensorDesc & desc, size_t nSubtensors, size_t dim) - : nSubtensors(nSubtensors), dim(dim) { - JIT_ASSERT(nSubtensors > 1); - std::vector cont = desc.contiguity; - if(dim > 0) { - // when we narrow the concatenated output/chunked input - // we make the size[dim] smaller while keeping the stride[dim] the same, - // meaning: stride[dim - 1] != stride[dim]*size[dim] - // so dim - 1 is no longer contiguous - cont[dim - 1] = false; - } - subtensorDesc.reset(new TensorDesc(desc.scalar_type, cont)); - } - - bool isNoop() const { - return nSubtensors == 1; - } -}; - -struct FusedKernel { - TH_DISALLOW_COPY_AND_ASSIGN(FusedKernel); - - FusedKernel(const std::string & name, AnnotatedGraph & agraph); - virtual ~FusedKernel() = default; - - // expects outputs to be pre-allocated - void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); - - // creates new tensors for outputs - void launch(at::ArrayRef inputs, std::vector & outputs); - const std::vector & outputDescriptors() const { - return output_desc; - } -protected: - virtual at::Backend backend() const = 0; - - // arguments is a list of pointers to the arguments for the compiled CUDA/CPU - // code. - // The format of arguments is suitable for directly passing to a call to - // cuLaunchKernel as the kernel arguments. - // Currently the first argument is a pointer to numel (for passing to - // CUDA code), and the remainder are pointers to the TensorInfo structs - // that compiled code uses to load Tensor data. - // launch_with_tensors handles packing at::Tensors into this arguments array. - // CPU code uses the same convension so that launch_with_tensors can be shared. - virtual void launch_raw(uint32_t numel, void ** arguments) = 0; - - virtual uint64_t get_rand_offset(uint32_t numel) = 0; - bool has_random; - std::string name; - // We keep these around for debugging - std::string compilation_unit; - std::vector input_desc; - std::vector output_desc; - - // same size as output_desc, describes whether - // an output is actually a concatenation of - // many subtensors that the fusion group produces - std::vector concat_desc; - - // same size as input_desc, describes whether an - // input should be broken into subtensors (chunks) - // to be consumed by the fusion group - std::vector chunk_desc; -}; - - -namespace { - -#ifdef USE_CUDA - -static int ceilDiv(int a, int b) { - return (a + b - 1) / b; -} - -#endif - -Node* usedInFusedChunk(Value * input) { - auto uses = input->uses(); - if (uses.size() == 1) { - Node *user = uses[0].user; - if (user->kind() == prim::ConstantChunk) { - return user; - } - } - return nullptr; -} - -//////////////////////////////////////////////////////////////////////////////// -// Code generation - -namespace codegen { - -/*with type_as not checking type of its input, a fusion group can have non-fp32 tensor as input. 
-Correct code for this case is generated, however, nvrtc does not know how to handle int*_t integer types, -so typedefs help it handle those cases*/ - -auto type_declarations_template = CodeTemplate(R"( -#if defined(__CUDACC_RTC__) -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef short int int16_t; -typedef long long int int64_t; -${HalfHeader} -${RandHeader} -#endif -typedef ${IndexType} IndexType; -template -struct TensorInfo { - T * data; - IndexType sizes[N]; - IndexType strides[N]; -}; -template -struct TensorInfo { - T * data; -}; -)"); - -// We rewrite the code for philox RNG from curand as nvrtc couldn't resolve the -// curand header correctly. -constexpr auto rand_support_literal = R"( - - class Philox { - public: - __device__ inline Philox(unsigned long long seed, - unsigned long long subsequence, - unsigned long long offset) { - key.x = (unsigned int)seed; - key.y = (unsigned int)(seed >> 32); - counter = make_uint4(0, 0, 0, 0); - counter.z = (unsigned int)(subsequence); - counter.w = (unsigned int)(subsequence >> 32); - STATE = 0; - incr_n(offset / 4); - } - - __device__ inline unsigned long operator()() { - if(STATE == 0) { - uint4 counter_ = counter; - uint2 key_ = key; - for(int i = 0; i < 9; i++) { - counter_ = single_round(counter_, key_); - key_.x += (kPhilox10A); key_.y += (kPhilox10B); - } - output = single_round(counter_, key_); - incr(); - } - unsigned long ret; - switch(STATE) { - case 0: ret = output.x; break; - case 1: ret = output.y; break; - case 2: ret = output.z; break; - case 3: ret = output.w; break; - } - STATE = (STATE + 1) % 4; - return ret; - } - - private: - uint4 counter; - uint4 output; - uint2 key; - unsigned int STATE; - __device__ inline void incr_n(unsigned long long n) { - unsigned int nlo = (unsigned int)(n); - unsigned int nhi = (unsigned int)(n >> 32); - counter.x += nlo; - if (counter.x < nlo) - nhi++; - counter.y += nhi; - if (nhi <= counter.y) - return; - if (++counter.z) - return; - ++counter.w; - } - __device__ inline void incr() { - if (++counter.x) - return; - if (++counter.y) - return; - if (++counter.z) - return; - ++counter.w; - } - __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, - unsigned int *result_high) { - *result_high = __umulhi(a, b); - return a*b; - } - - __device__ inline uint4 single_round(uint4 ctr, uint2 key) { - unsigned int hi0; - unsigned int hi1; - unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); - unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); - - uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; - return ret; - } - - static const unsigned long kPhilox10A = 0x9E3779B9; - static const unsigned long kPhilox10B = 0xBB67AE85; - static const unsigned long kPhiloxSA = 0xD2511F53; - static const unsigned long kPhiloxSB = 0xCD9E8D57; - }; - - // Inverse of 2^32. 
- #define M_RAN_INVM32 2.3283064e-10f - __device__ __inline__ float uniform(unsigned int x) { - return x * M_RAN_INVM32; - } -)"; - -constexpr auto rand_param = ",unsigned long long seed, unsigned long long offset"; -constexpr auto rand_init = R"( - int idx = blockIdx.x*blockDim.x + threadIdx.x; - Philox rnd(seed, idx, offset); -)"; -auto cuda_compilation_unit_template = CodeTemplate(R"( -${type_declarations} - -extern "C" __global__ -void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) { - ${RandInit} - for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < totalElements; - linearIndex += gridDim.x * blockDim.x) { - // Convert `linearIndex` into an offset of tensor: - ${tensorOffsets} - // calculate the results - ${kernelBody} - } -} -)"); - -auto cpu_compilation_unit_template = CodeTemplate(R"( -#include -#include -#include -${type_declarations} - -#define OMP_THRESHOLD 100000 -static void ${kernelName}_kernel(IndexType totalElements, ${formals}) { - #pragma omp parallel for if(totalElements > OMP_THRESHOLD) - for (IndexType linearIndex = 0; - linearIndex < totalElements; - linearIndex += 1) { - // Convert `linearIndex` into an offset of tensor: - ${tensorOffsets} - // calculate the results - ${kernelBody} - } -} - -extern "C" -void ${kernelName}(IndexType totalElements, void ** args) { - ${kernelName}_kernel(totalElements ${,argument_loads}); -} -)"); - -// This snippet enables half support in the jit. Following the pattern for -// reductions, fp16 input data is immediately upconverted to float -// with __half2float(). All mathematical operations are done on float -// values, and if needed the intermediate float representation is -// converted to half with __float2half() when writing to a half tensor. 
-constexpr auto half_support_literal = R"( -#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) -#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) -#if defined(__cplusplus) - struct __align__(2) __half { - __host__ __device__ __half() { } - - protected: - unsigned short __x; - }; - - /* All intrinsic functions are only available to nvcc compilers */ - #if defined(__CUDACC__) - /* Definitions of intrinsics */ - __device__ __half __float2half(const float f) { - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; - } - - __device__ float __half2float(const __half h) { - float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); - return val; - } - #endif /* defined(__CUDACC__) */ -#endif /* defined(__cplusplus) */ -#undef __HALF_TO_US -#undef __HALF_TO_CUS - -typedef __half half; -)"; - -// curDimIndex = linearId % sizes[i]; // % sizes[i] is not needed for d == 0, because we already guard for numel outside the index calculation -// offset += curDimIndex*strides[i]; // *strides[i] is optional if list_is_cont becaause strides.back() == 1 -// linearId /= sizes[i]; -auto dim_calc = CodeTemplate(R"( -//printf("tensor ${tensor} sizes[${d}] = %d, strides[${d}] = %d\n", ${tensor}.sizes[${d}],${tensor}.strides[${d}]); -size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes}; -${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride}; -)"); - -static void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) { - TemplateEnv env; - env.s("tensor",tensor); - out << format("IndexType ${tensor}_offset = 0;\n",env); - out << format("IndexType ${tensor}_linearIndex = linearIndex;\n",env); - for(int d = ndim - 1; d >= 0; --d) { - env.d("d",d); - env.s("mod_sizes", d > 0 ? format("% ${tensor}.sizes[${d}]",env) : ""); - env.s("times_stride",(d < ndim - 1 || !last_is_cont) ? - format("* ${tensor}.strides[${d}]",env) : ""); - out << dim_calc.format(env); - if(d > 0) { - out << format("${tensor}_linearIndex /= ${tensor}.sizes[${d}];\n",env); - } - } -} - -static std::string valueName(Value * n) { - return "n" + std::to_string(n->unique()); -} - -static std::string scalarValue(int64_t v) { - return std::to_string(v); -} - -static std::string scalarValue(double v) { - std::ostringstream out; - out << std::scientific << v << "f"; - return out.str(); -} - -static const char * scalarTypeName(at::ScalarType type) { - if (type == at::ScalarType::Half) { - return "half"; - } - - switch(type) { - #define DEFINE_CASE(ctype,name,_) \ - case at::ScalarType::name: return #ctype; - AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(DEFINE_CASE) - #undef DEFINE_CASE - default: - throw std::runtime_error("unknown scalar type"); - } -} - -std::string encodeRHS(Node * n) { - static std::unordered_map simple_map_ops = { - // unary - {aten::abs, "absf(${0})"}, - {aten::sigmoid, "1.f / (1.f + expf(-${0}))"}, - {aten::relu, "${0} < 0 ? 
0.f : ${0} "}, - {aten::log, "logf(${0})"}, - {aten::log10, "log10f(${0})"}, - {aten::log1p, "log1pf(${0})"}, - {aten::log2, "log2f(${0})"}, - {aten::lgamma, "lgammaf(${0})"}, - {aten::exp, "expf(${0})"}, - {aten::expm1, "expm1f(${0})"}, - {aten::cos, "cosf(${0})"}, - {aten::acos, "acosf(${0})"}, - {aten::cosh, "coshf(${0})"}, - {aten::sin, "sinf(${0})"}, - {aten::asin, "asinf(${0})"}, - {aten::sinh, "sinhf(${0})"}, - {aten::tan, "tanf(${0})"}, - {aten::atan, "atanf(${0})"}, - {aten::tanh, "tanhf(${0})"}, - {aten::sqrt, "sqrtf(${0})"}, - {aten::rsqrt, "rsqrtf(${0})"}, - {aten::ceil, "ceilf(${0})"}, - {aten::floor, "floorf(${0})"}, - {aten::round, "roundf(${0})"}, - {aten::trunc, "truncf(${0})"}, - {aten::frac, "fracf(${0})"}, - {aten::reciprocal, "reciprocalf(${0})"}, - {aten::neg, "-${0}"}, - //simple binary - {aten::atan2, "atan2(${0}, ${1})"}, - {aten::min, "fminf(${0}, ${1})"}, - {aten::max, "fmaxf(${0}, ${1})"}, - - //binary with other - // TODO: some of these ops will not get generated because - // we only work on float inputs/outputs, but they are here to record - // that they are valid mappable ops once we handle more type - {aten::__and__, "${0} && ${1}"}, - {aten::__lshift__, "${0} << ${1}"}, - {aten::__or__, "${0} || ${1}"}, - {aten::__rshift__, "${0} >> ${1}"}, - {aten::__xor__, "${0} ^ ${1}"}, - {aten::div, "${0} / ${1}"}, - {aten::eq, "${0} == ${1}"}, - {aten::fmod, "fmodf(${0}, ${1})"}, - {aten::ge, "(${0} >= ${1})"}, - {aten::gt, "${0} > ${1}"}, - {aten::le, "(${0} <= ${1})"}, - {aten::lt, "${0} < ${1}"}, - {aten::type_as, "(${0})"}, //everything is implicitly convertible to float - {aten::mul, "${0} * ${1}"}, - {aten::ne, "${0} != ${1}"}, - {aten::remainder, "remainderf(${0}, ${1})"}, - {aten::pow, "powf(${0}, ${1})"}, - - //alpha - {aten::add, "${0} + ${2}*${1}"}, - {aten::sub, "(${0} - ${2}*${1})"}, - {aten::rand_like, "uniform(rnd())"}, - - // simple derivatives - {aten::_sigmoid_backward, "${0} * ${1} * (1.f - ${1})"}, - {aten::_tanh_backward, "${0} * (1.f - ${1} * ${1})"}, - }; - - if (n->kind() == prim::Constant) { - auto val = toIValue(n->output()).value(); - if (val.isDouble()) { - return scalarValue(val.toDouble()); - } else { - JIT_ASSERT(val.isInt()); - return scalarValue(val.toInt()); - } - } - - TemplateEnv env; - size_t i = 0; - for(auto in : n->inputs()) { - env.s(std::to_string(i++), valueName(in)); - } - - const auto & str = simple_map_ops.at(n->kind()); - return format(str, env); -} - -// Returns: (input chunk metadata, output concat metadata, is_random) -std::tuple,std::vector,bool> emitCompilationUnit( - std::ostream& out, - const std::string& name, - AnnotatedGraph& agraph, - bool use_cuda) { - bool has_random = false; - Graph& subgraph = *agraph.graph; - TemplateEnv env; - env.s("kernelName",name); - // TODO: handle cases where we need to generate > 2^32 element tensors - env.s("IndexType","unsigned int"); //avoiding slow header includes to get uint32_t - - std::stringstream body; - std::stringstream tensorOffsets; - std::vector formals; - std::vector argument_loads; - auto emitFormal = [&](Value * n, const TensorDesc & desc) { - std::string tensor = "t" + std::to_string(formals.size()); //can't be unique() because Param may be an output - size_t nDim = desc.nDim(); - emitIndexingFor(tensorOffsets, tensor, nDim, desc.lastIsContiguous()); - env.s("tensor",tensor); - env.d("formal_index", formals.size() + 1); // + 1 because the first argument is the linearIndex - env.d("nDim",nDim); - env.s("scalar_type",scalarTypeName(desc.scalar_type)); - 
formals.push_back(format("TensorInfo<${scalar_type},${nDim}> ${tensor}",env)); - argument_loads.push_back(format("*static_cast*>(args[${formal_index}])",env)); - }; - - std::vector chunk_desc; - std::vector> flat_inputs; - { - size_t input_index = 0; - for(auto p : subgraph.inputs()) { - if (Node * chunk = usedInFusedChunk(p)) { - int64_t dim = chunk->i(attr::dim); - int64_t chunks = chunk->i(attr::chunks); - chunk_desc.emplace_back(agraph.input_desc[input_index++], chunks, dim); - for (auto * o : chunk->outputs()) { - flat_inputs.emplace_back(o, *chunk_desc.back().subtensorDesc); - } - } else { - chunk_desc.emplace_back(); - flat_inputs.emplace_back(p, agraph.input_desc[input_index++]); - } - } - for (auto & input : flat_inputs) { - emitFormal(input.first, input.second); - } - } - - std::vector concat_desc; - std::vector> flat_output_nodes; - { - size_t i = 0; - for(auto o : subgraph.outputs()) { - auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != prim::FusedConcat) { - emitFormal(o, desc); - concat_desc.emplace_back(); - flat_output_nodes.emplace_back(o, desc); - } else { - auto cat = o->node(); - concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); - for(auto c : cat->inputs()) { - emitFormal(c, *concat_desc.back().subtensorDesc); - flat_output_nodes.emplace_back(c, desc); - } - } - } - } - - bool has_half_tensor = false; - size_t formal_count = 0; - for(auto input : flat_inputs) { - auto p = input.first; - env.s("node",valueName(p)); - env.d("formal",formal_count++); - - // Acquires and converts (if needed) inputs - bool is_half = input.second.scalar_type == at::ScalarType::Half; - if (is_half) { - AT_ASSERT(use_cuda); - env.s( - "access" - , format("__half2float(t${formal}.data[t${formal}_offset])", env)); - has_half_tensor = true; - } else { - env.s("access", format("t${formal}.data[t${formal}_offset]", env)); - } - - //TODO: actual type propagation rather than relying on auto.. 
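This code generator (deleted here; the same emitCompilationUnit moves under torch/csrc/jit/fusers/ elsewhere in the patch) builds the kernel source by filling string templates: each input becomes a TensorInfo formal plus a generated load such as auto n3 = t0.data[t0_offset];. A minimal stand-in for that substitution style (CodeTemplate/TemplateEnv from torch/csrc/jit/code_template.h do the real work; the fill helper below is mine):

#include <iostream>
#include <map>
#include <string>

// Tiny stand-in for CodeTemplate/TemplateEnv: replace every ${key} with env[key].
std::string fill(std::string tmpl, const std::map<std::string, std::string>& env) {
  for (const auto& kv : env) {
    const std::string pat = "${" + kv.first + "}";
    for (auto pos = tmpl.find(pat); pos != std::string::npos; pos = tmpl.find(pat, pos)) {
      tmpl.replace(pos, pat.size(), kv.second);
      pos += kv.second.size();
    }
  }
  return tmpl;
}

int main() {
  // One generated load line per kernel input, as in emitCompilationUnit.
  std::cout << fill("auto ${node} = ${access};\n",
                    {{"node", "n3"},
                     {"access", "t0.data[t0_offset]"}});
  // Prints: auto n3 = t0.data[t0_offset];
}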
- body << format("auto ${node} = ${access};\n",env); - } - - for(auto n : subgraph.nodes()) { - // FusedConcat nodes work by narrowing the output Tensors before the kernel runs - if (n->kind() == prim::FusedConcat) - continue; - if (n->kind() == prim::ConstantChunk) - continue; - if(n->kind() == aten::rand_like) { - has_random = true; - if(!use_cuda) - throw std::runtime_error("Fusion doesn't support rand on CPU"); - } - env.s("node",valueName(n->output())); - env.s("rhs", encodeRHS(n)); - body << format("auto ${node} = ${rhs};\n",env); - } - - for(auto output : flat_output_nodes) { - auto o = output.first; - env.d("formal",formal_count++); - env.s("access",format("t${formal}.data[t${formal}_offset]",env)); - env.s("node",valueName(o)); - - // Acquires and converts (if needed) outputs - bool is_half = output.second.scalar_type == at::ScalarType::Half; - if (is_half) { - AT_ASSERT(use_cuda); - body << format("${access} = __float2half(${node});\n",env); - has_half_tensor = true; - } else { - body << format("${access} = ${node};\n",env); - } - } - - // Includes half support if any half tensors are involved - if (has_half_tensor) { - env.s("HalfHeader", half_support_literal); - } else { - env.s("HalfHeader", ""); - } - - if (has_random) { - env.s("RandHeader", rand_support_literal); - env.s("RandParam", rand_param); - env.s("RandInit", rand_init); - } else { - env.s("RandHeader", ""); - env.s("RandParam", ""); - env.s("RandInit", ""); - } - - env.s("tensorOffsets",tensorOffsets.str()); - env.s("kernelBody",body.str()); - env.v("formals",formals); - env.v("argument_loads",argument_loads); - env.s("type_declarations", type_declarations_template.format(env)); - if(use_cuda) { - out << cuda_compilation_unit_template.format(env); - } else { - out << cpu_compilation_unit_template.format(env); - } - - return std::make_tuple(std::move(chunk_desc), std::move(concat_desc), has_random); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // codegen namespace -} // anonymous namespace - -//////////////////////////////////////////////////////////////////////////////// -// CompiledFunctionFunction - -// Host-side view of TensorInfo (that visivle for the kernel is defined above). -// Note dims[0] - we need to dynamically allocate the dims. -struct TensorInfo { - void * data; -#pragma GCC diagnostic ignored "-Wpedantic" - uint32_t sizes_strides[0]; -#pragma GCC diagnostic pop - - uint32_t* sizes(size_t nDim) { return &sizes_strides[0]; } - uint32_t* strides(size_t nDim) { return &sizes_strides[nDim]; } -}; - -FusedKernel::FusedKernel(const std::string & name, AnnotatedGraph & agraph) - : name(name) - , input_desc(agraph.input_desc) - , output_desc(agraph.output_desc) {} - -namespace { - -// Tries to compress sizes and strides according to cont. Emits the result t -// c_sizes, c_strides and throws an error on failure (if can't compress) -void compressContiguous( - at::IntList sizes, - at::IntList strides, - const std::vector & cont, - uint32_t * c_sizes, - uint32_t * c_strides) { - size_t compressed_dims = 0; - size_t cur = 0; - size_t ndim = sizes.size(); - while(cur < ndim) { - size_t total_size = sizes[cur]; - cur++; - while(cont[cur-1] && cur < ndim) { - JIT_ASSERT(strides[cur-1] == sizes[cur]*strides[cur]); - total_size *= sizes[cur]; - cur++; - } - // cur starts pointing at the beginning of run to compress - // cur ends one _after_ the terminating false or end of list. - // total_size is the size of all dimensions [begin,end) - // examples: - // f = not cont. 
- // t = cont. - // x = don't care, including past end of list - // s = start of cur - // e = end of cur - - - // f x x x - // s e - - // t f x x - // s e - - // t t f x - // s e - - c_sizes[compressed_dims] = total_size; - c_strides[compressed_dims] = strides[cur-1]; - compressed_dims++; - } - if (ndim > 0) { - JIT_ASSERT(!cont.back() || strides.back() == 1); - } -} - -} // anonymous namespace - -// XXX: Assumes that after at::chunk, all inputs are the same size -static std::vector computeMapSize( - const at::Tensor& tensor, - const PartitionDesc& chunkDesc) { - std::vector sizes(tensor.sizes().begin(), tensor.sizes().end()); - // Should have been checked in graph fuser - JIT_ASSERT(sizes[chunkDesc.dim] % chunkDesc.nSubtensors == 0); - sizes[chunkDesc.dim] /= chunkDesc.nSubtensors; - return sizes; -} - -// XXX: this code assumes that inputs are 32-bit addressable -static uint32_t computeNumel(at::ArrayRef sizes) { - uint32_t result = 1; - if (sizes.size() == 0) { - return 1; // scalar tensor - } - for (int64_t size : sizes) { - result *= size; - } - return result; -} - -void FusedKernel::launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs) { - at::DeviceGuard device_guard(inputs); - JIT_ASSERT(inputs.size() == input_desc.size()); - JIT_ASSERT(outputs.size() == output_desc.size()); - size_t flat_inputs_size = 0; - size_t flat_outputs_size = 0; - for(auto & c : chunk_desc) - flat_inputs_size += c.nSubtensors; - for(auto & c : concat_desc) - flat_outputs_size += c.nSubtensors; - // XXX: this code assumes that inputs are 32-bit addressable - // XXX: this code assumes that all inputs are of the same size - JIT_ASSERT(inputs[0].numel() <= std::numeric_limits::max()); - - // Compute map_size, numel from the first input - at::IntList map_size; - uint32_t numel; - std::vector keep_alive_size; - if (chunk_desc[0].isNoop()) { - map_size = inputs[0].sizes(); - numel = inputs[0].numel(); - } else { - keep_alive_size = computeMapSize(inputs[0], chunk_desc[0]); - map_size = keep_alive_size; - numel = computeNumel(map_size); - } - - // Compute the storage needed to store TensorInfo structs for inputs and outputs. - size_t uncompressedDim = input_desc.at(0).contiguity.size(); - size_t maxPossibleTensorInfoSize = sizeof(TensorInfo) + 2 * sizeof(uint32_t) * uncompressedDim; - size_t maxPossibleBufferSize = maxPossibleTensorInfoSize * (flat_inputs_size + flat_outputs_size); - std::vector buffer(maxPossibleBufferSize); - char * buffer_next = buffer.data(); - // A vector of arguments to the kernel. It's (numel, *input_descs, *output_descs) - std::vector arguments; - arguments.reserve(3 + flat_inputs_size + flat_outputs_size); - auto addTensorInfoRaw = [&](TensorDesc & desc, void* data_ptr, at::IntList sizes, at::IntList strides) { - size_t nDim = desc.nDim(); // NOTE: this is the compressed dim - JIT_ASSERT(nDim <= uncompressedDim); // We'd overflow the space otherwise - auto ti = reinterpret_cast(buffer_next); - ti->data = data_ptr; - compressContiguous(sizes, strides, desc.contiguity, ti->sizes(nDim), ti->strides(nDim)); - buffer_next += maxPossibleTensorInfoSize; - arguments.push_back(ti); - }; - // Asserts that t's dims can be compressed in the same way as in desc - // (that's what the kernel assumes), and appends it to the arguments vector. 
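  // A concrete illustration (numbers are only an example): a contiguous float
  // tensor with sizes [2, 3, 4] and strides [12, 4, 1] has contiguity
  // [true, true, true], so compressContiguous collapses all three dims into a
  // single run with c_sizes = {24} and c_strides = {1}; if the outermost dim
  // were padded (say strides [16, 4, 1]) the run would break there, giving
  // c_sizes = {2, 12} and c_strides = {16, 1}.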
- auto addTensorInfo = [&](TensorDesc & desc, const at::Tensor & t) { - addTensorInfoRaw(desc, t.data_ptr(), t.sizes(), t.strides()); - }; - arguments.push_back(&numel); - for (size_t i = 0; i < input_desc.size(); ++i) { - auto & chunk = chunk_desc[i]; - const at::Tensor& tensor = inputs[i]; - if (chunk.isNoop()) { - addTensorInfo(input_desc[i], tensor); - } else { - size_t chunk_offset = map_size[chunk.dim] * tensor.stride(chunk.dim) * elementSize(tensor.type().scalarType()); - char * data_ptr = reinterpret_cast(tensor.data_ptr()); - for (size_t chunks = 0; chunks < chunk.nSubtensors; ++chunks) { - addTensorInfoRaw(*chunk.subtensorDesc, data_ptr, map_size, tensor.strides()); - data_ptr += chunk_offset; - } - } - } - for (size_t i = 0; i < output_desc.size(); ++i) { - auto & c = concat_desc[i]; - at::Tensor o = outputs[i]; - if(c.isNoop()) { - o.resize_(map_size); - addTensorInfo(output_desc[i], outputs[i]); - } else { - size_t small_size = map_size[c.dim]; - std::vector concat_size(map_size.begin(), map_size.end()); - concat_size[c.dim] = small_size * c.nSubtensors; - o.resize_(concat_size); - size_t offset = 0; - for(size_t j = 0; j < c.nSubtensors; ++j) { - // because the concatenated_output stays live, the underlying data - // in this view remains live through the end of this function - // so there is not need to hold onto this tensor - auto view = o.narrow(c.dim, offset, small_size); - addTensorInfo(*c.subtensorDesc, view); - offset += small_size; - } - } - } - - // If the kernel call contains a random op, we need to pass in random seeds as - // well. - #ifdef USE_CUDA - if(has_random && this->backend() == at::Backend::CUDA) { - auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); - uint64_t offset = - gen_->state.philox_seed_offset.fetch_add(this->get_rand_offset(numel)); - arguments.push_back(&gen_->state.initial_seed); - arguments.push_back(&offset); - } - #endif - - launch_raw(numel, arguments.data()); -} - -void FusedKernel::launch(at::ArrayRef inputs, std::vector & outputs) { - at::DeviceGuard guard(inputs.back()); - JIT_ASSERT(inputs.size() > 0); - auto & ref_type = inputs[0].type(); - outputs.clear(); - outputs.reserve(outputDescriptors().size()); - for(auto & od : outputDescriptors()) { - outputs.push_back(ref_type.toScalarType(od.scalar_type).tensor()); - } - launch_with_tensors(inputs, outputs); -} - -//////////////////////////////////////////////////////////////////////////////// -// CUDAFusedKernel - -#ifdef USE_CUDA - -void checkCUDAVersion(const cudaDeviceProp & prop) { - if ((prop.major >= 6 && CUDA_VERSION < 8000) || - (prop.major >= 7 && CUDA_VERSION < 9000)) { - std::stringstream err_string; - err_string << "In CUDAFusedKernel, PyTorch compiled with insufficient CUDA version: " - << CUDA_VERSION << " for the current GPU device " << prop.name - << " with device capability " << prop.major << "." 
<< prop.minor; - throw std::runtime_error(err_string.str()); - } -} - -struct CUDAFusedKernel : public FusedKernel { - CUDAFusedKernel(const std::string & name, AnnotatedGraph & agraph) - : FusedKernel(name, agraph) { - at::DeviceGuard device_guard(agraph.device); - - TORCH_CUDA_CHECK(cudaGetDeviceProperties(&prop, agraph.device)); - checkCUDAVersion(prop); - - std::stringstream cu; - std::tie(chunk_desc, concat_desc, has_random) = codegen::emitCompilationUnit(cu, name, agraph, true); - compilation_unit = cu.str(); - nvrtcProgram program; - TORCH_NVRTC_CHECK(nvrtcCreateProgram(&program, compilation_unit.c_str(), nullptr, 0, nullptr, nullptr)); - - std::string compute = "--gpu-architecture=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); - std::vector args = {"--std=c++11", compute.c_str(), "-default-device"}; - nvrtcResult result = nvrtcCompileProgram(program, args.size(), args.data()); - if (result == NVRTC_ERROR_COMPILATION) { - size_t logsize; - nvrtcGetProgramLogSize(program, &logsize); - std::vector log(logsize); - nvrtcGetProgramLog(program, log.data()); - cu << log.data(); - throw std::runtime_error(cu.str()); - } - ResourceGuard holdProgram([&] { - TORCH_NVRTC_CHECK(nvrtcDestroyProgram(&program)); - }); - TORCH_NVRTC_CHECK(result); - - size_t ptx_size; - TORCH_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size)); - ptx.resize(ptx_size); - TORCH_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data())); - - TORCH_CU_CHECK(cuModuleLoadData(&module, ptx.data())); - TORCH_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str())); - - TORCH_CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor( - &maxBlocks, function, 128, 0)); - maxBlocks *= prop.multiProcessorCount; - } - virtual ~CUDAFusedKernel() override { - TORCH_CU_CHECK(cuModuleUnload(module)); - } -protected: - virtual at::Backend backend() const override { - return at::Backend::CUDA; - } - virtual uint64_t get_rand_offset(uint32_t numel) override { - int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); - return 4 * (ceil(numel/(4 * blockSize * numBlocks)) + 1); - } - virtual void launch_raw(uint32_t numel, void ** arguments) override { - int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); - - //std::cout << "maxBlocks = " << maxBlocks << " needed blocks: " << ceilDiv(numel,blockSize) - // << " numblocks = " << numBlocks; - - // it is possible that this is the first cuda call on this thread - // so make sure we initialize the Driver API's context - // cudaFree(0) accomplishes this. - CUcontext pctx = 0; - TORCH_CU_CHECK(cuCtxGetCurrent(&pctx)); - if (!pctx) { - std::unique_lock cudaFreeMutexLock( - *(THCCachingAllocator_getCudaFreeMutex())); - cudaFree(0); - } - CUstream stream = at::cuda::getCurrentCUDAStream(); - TORCH_CU_CHECK(cuLaunchKernel( - function, - numBlocks, 1, 1, - blockSize, 1, 1, - 0, stream, - arguments, - nullptr)); - } - std::vector ptx; - CUmodule module; - CUfunction function; - - // we record prop/device so if they are availiable for launch heuristics - // querying at launch is too slow for device properties. 
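  // Gloss on the members below: blockSize is fixed at 128 threads per block,
  // and maxBlocks is computed once in the constructor from
  // cuOccupancyMaxActiveBlocksPerMultiprocessor scaled by the SM count, so
  // launch_raw only needs to clamp ceilDiv(numel, blockSize) against it.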
- int device; - cudaDeviceProp prop; - int blockSize = 128; - int maxBlocks; -}; - -#endif - -//////////////////////////////////////////////////////////////////////////////// -// CPUFusedKernel - -struct TempFile { - TH_DISALLOW_COPY_AND_ASSIGN(TempFile); - TempFile(const std::string & t, int suffix) { - // mkstemps edits its first argument in places - // so we make a copy of the string here, including null terminator - std::vector tt(t.c_str(), t.c_str() + t.size() + 1); - int fd = mkstemps(tt.data(), suffix); - JIT_ASSERT(fd != -1); - file_ = fdopen(fd, "r+"); - - // - 1 becuase tt.size() includes the null terminator, - // but std::string does not expect one - name_ = std::string(tt.begin(), tt.end() - 1); - } - const std::string & name() const { - return name_; - } - void sync() { - fflush(file_); - } - void write(const std::string & str) { - size_t result = fwrite(str.c_str(), 1, str.size(), file_); - JIT_ASSERT(str.size() == result); - } - FILE* file() { - return file_; - } - ~TempFile() { - if(file_ != nullptr) { - // unlink first to ensure another mkstemps doesn't - // race between close and unlink - unlink(name_.c_str()); - fclose(file_); - } - } -private: - FILE * file_ = nullptr; - std::string name_; -}; - -static void* checkDL(void * x) { - if(!x) { - AT_ERROR("error in dlopen or dlsym: ", dlerror()); - } - return x; -} - -struct DynamicLibrary { - TH_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); - DynamicLibrary(const char * name) { - handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW)); - } - void * sym(const char * name) { - JIT_ASSERT(handle); - return checkDL(dlsym(handle, name)); - } - ~DynamicLibrary() { - if(!handle) return; - dlclose(handle); - } -private: - void * handle = nullptr; -}; - -static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so"; -static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp"; - -// NB: -march=native not supported on PPC64 g++. It's a bit annoying -// to do a configure-style test to decide whether or not the g++ -// actually supports it or not, so we heuristically use the host -// compiler to predict if the runtime compiler supports the option we -// want. This probably won't work if you're cross-compiling. -// NB: -march=native is disabled because it has caused problems where -// compiler and assembler do not agree on what native instruction they -// understand for AVX512. When we need better CPU performance this -// optimization can be re-enabled by tracking down the platforms where -// this error occurs and only selectively disabling it. -static const std::string compile_string = - "\"${cxx}\" -O3 -g " -#ifndef __PPC64__ -// "-march=native " -#endif - "-std=c++11 -fPIC ${fopenmp} -shared \"${cpp_file}\" -o \"${so_file}\" -lm"; - -static void runCompiler(FusionCompilerConfig & config, const std::string & cpp_file, const std::string & so_file) { - TemplateEnv env; - env.s("cxx", config.cxx); - env.s("fopenmp", config.openmp ? 
"-fopenmp" : ""); - env.s("cpp_file",cpp_file); - env.s("so_file",so_file); - std::string result = format(compile_string,env); - int r = system(result.c_str()); - if(config.openmp && r != 0) { - std::cerr << "warning: pytorch jit fuser failed to compile with openmp, trying without it...\n"; - config.openmp = false; // disable for future compiles - return runCompiler(config, cpp_file, so_file); - } - JIT_ASSERTM(r == 0, "Failed to compile a fused CPU kernel"); -} - - -static const std::string disas_string = - "objdump -M intel -d \"${so_file}\""; -static void disas(const std::string & so_file) { - TemplateEnv env; - env.s("so_file", so_file); - std::string cmd = format(disas_string, env); - int r = system(cmd.c_str()); - JIT_ASSERT(r == 0); -} - -struct CPUFusedKernel : public FusedKernel { - CPUFusedKernel(const std::string & name, AnnotatedGraph & agraph, FusionCompilerConfig & config) - : FusedKernel(name, agraph) { - TempFile so_file(so_template, 3); - TempFile cpp_file(cpp_template, 4); - - std::stringstream cu; - std::tie(chunk_desc, concat_desc, has_random) = codegen::emitCompilationUnit(cu, name, agraph, false); - JIT_ASSERT(!has_random); - compilation_unit = cu.str(); - cpp_file.write(compilation_unit); - cpp_file.sync(); - runCompiler(config, cpp_file.name(), so_file.name()); - if(config.debug) { - disas(so_file.name()); - } - so_lib.reset(new DynamicLibrary(so_file.name().c_str())); -#pragma GCC diagnostic ignored "-Wpedantic" - kernel = reinterpret_cast(so_lib->sym(name.c_str())); -#pragma GCC diagnostic pop - } -protected: - virtual at::Backend backend() const override { - return at::Backend::CPU; - } - virtual uint64_t get_rand_offset(uint32_t numel) override { - return numel; - } - virtual void launch_raw(uint32_t numel, void ** arguments) override { - kernel(numel, arguments); - } - std::unique_ptr so_lib; - void (*kernel)(uint32_t, void**) = nullptr; -}; - -//////////////////////////////////////////////////////////////////////////////// -// FusedKernelCache - -// Note [Run-time shape checking code] -// There are multiple assumptions that our codegen makes, which we can't check -// in the fusion pass, because we don't have the shape information. Most notably, -// that all values (post-input-chunk, and pre-output-concat) have the same shape -// (hereinafter referred to as map size). One way to check this would be to run -// shape propagation for every size configuration we get as an input, but that -// requires a full graph traversal, and might incur unnecessary overhead. The code -// below uses a few nice properties of broadcasting rules and their interactions with -// pointwise operations, and takes a smarter approach, to quickly verify validity of -// the kernel. -// -// Notation: -// - a.s when a is a tensor is a shorthand for a.shape. -// - B is a shorthand for the broadcasting/expanding function. It is used as a -// vararg function. -// - E is a shorthand for expand function. -// - Every pointwise operation can be equivalently rewritten as -// f(a, b) = f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), -// where f^ is a non-broadcasting verison of f. -// - A set of inputs that are used to produce a certain graph output is referred to -// as the output's broadcasting group (see Lemma 2. for explanation why). -// -// Lemma 1. Set of lists of integers (shapes) + { _|_ (bottom/error marker) }, with the -// operation of broadcasting (returning bottom upon shape mismatch) forms a monoid. -// In simpler terms: broadcasting is associative, i.e. B(a, B(b, c)) == B(B(a, b), c). 
-// -// Proof. Satisfies all monoid laws: -// - Closed under broadcasting (trivial) -// - Empty shape is the identity element: B(a, []) == B([], a) == a -// - Associativity: A simple visual proof is that you can expand 3 tensors -// at the same time by stacking their sizes (with alignment to the right), -// just as you'd do in the case of 2 tensors, but with an intermediate -// (the algorithm ends up being pretty much the same). -// -// Lemma 2. Shape of an output of an arbitrary DAG of pointwise ops depends only on the set -// of inputs used in this DAG and is equal to B([i.shape for i in used_inputs]). -// -// Proof. Let G be any DAG of pointwise ops and < be any valid topological -// ordering on nodes of G. Proof by induction over <. -// Base case (graph input): -// Trivial (input is also an output). -// Step (n = f(q, r)): -// Let QS (RS) be the set of shapes of inputs that q (r) depends on. -// Note that the set of inputs that n depends on is exactly QS + RS. -// shape(n) == shape(f(q, r)) -// (def of f) -// == shape(f^(E(q, B(q.s, r.s)), E(r, B(q.s, r.s)))) -// (output shape of f^ is equal to either of argument shapes) -// == shape(E(q, B(q.s, r.s))) -// (property of expand) -// == B(q.s, r.s) -// (induction assumption) -// == B(B(QS...), B(RS...)) -// (Lemma 1.) -// == B(QS..., RS...) -// (repeated shapes don't matter for broadcasting) -// == B((QS + RS)...) -// -// Lemma 3. Expands are distributive over pointwise ops, i.e. E(f(a, b), s) = f(E(a, s), E(b, s)) -// Lemma 4. Expands can be collapsed, i.e. E(E(x, s1), s2) = E(x, B(s1, s2)). -// Proof. A simple exercise for the reader :) -// -// Theorem. If all (pre-concat-)outputs have equal shapes, then we can push the expands to -// (post-chunk-)inputs, and have all intermediates of the same shape -// (no broadcasting happening in the body). -// -// Proof. Using the above lemmas we can easily show that a graph with a single output -// can be easily rewritten by taking the shape given by B applied to all input -// shapes, expanding inputs to it, and using only non-broadcasting operations. -// Example: -// -// let d = f(a, b) in -// let e = h(b, c) in -// g(d, e) -// -// (By def. of broadcasting pointwise ops applied to g, f and h) -// (Lemma 2. for a closed formula for the size of g = gs) -// -// let gs = B(a.s, b.s, c.s) in -// let d' = E(f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), gs) in -// let e' = E(h^(E(b, B(b.s, c.s)), E(c, B(b.s, c.s))), gs) in -// g^(d', e') -// -// (Lemma 3.) -// -// let gs = B(a.s, b.s, c.s) in -// let d' = f^(E(E(a, B(a.s, b.s)), gs), E(E(b, B(a.s, b.s)), gs)) in -// let e' = h^(E(E(b, B(b.s, c.s)), gs), E(E(c, B(b.s, c.s)), gs)) in -// g^(d', e') -// -// (Lemma 4. + Lemma 1. to simplify broadcasting function) -// -// let gs = B(a.s, b.s, c.s) in -// let d' = f^(E(a, gs), E(b, gs)) in -// let e' = h^(E(b, gs), E(c, gs)) in -// g^(d', e') -// -// (Simple rewrite) -// -// let gs = B(a.s, b.s, c.s) in -// let a' = E(a, gs) in -// let b' = E(b, gs) in -// let c' = E(c, gs) in -// let d' = f^(a', b') in -// let e' = h^(b', c') in -// g^(d', e') -// -// This example can be easily formalized to arbitrary DAGs using induction -// over topological ordering, similar to Lemma 2. Now, if broadcasting groups -// for all outputs have the same shape, then performing an expand to this size -// on all inputs will ensure that all intermediates on all paths to outputs -// will have the same shape, proving that the body of the kernel is valid. -// -// This shows the part until post-chunk-inputs. 
Extending it to pre-chunk-inputs -// is straightforward (needs a simple lemma for moving expands through chunks). - -// Register implementations of fused operators, so that we can reuse the fused graph -// to generate fallback code. -RegisterOperators reg_fused_operators({ - Operator( - prim::FusedConcat, - [](Node* node) { - int64_t dim = node->i(attr::dim); - int64_t num_inputs = node->inputs().size(); - return [dim, num_inputs](Stack& stack) { - auto result = at::cat( - fmap(last(stack, num_inputs), [](const IValue& i) { return i.toTensor(); }), - dim - ); - drop(stack, num_inputs); - pack(stack, std::move(result)); - return 0; - }; - }) -}); - -FusedKernelCache::FusedKernelCache(FusionCompiler& compiler, std::shared_ptr _graph, int device) - : device(device) - , fallback_code(_graph) - , compiler(compiler) - , graph(std::move(_graph)) - , input_broadcast_groups(getInputBroadcastGroups()) - , input_chunks(getInputChunkDescriptors()) - , kernels() {} - -std::atomic FusedKernelCache::next_kernel_id {0}; - -auto FusedKernelCache::getInputChunkDescriptors() -> std::vector { - std::vector descs; - descs.reserve(graph->inputs().size()); - for (Value * input : graph->inputs()) { - if (Node * chunk = usedInFusedChunk(input)) { - descs.emplace_back(chunk->i(attr::chunks), chunk->i(attr::dim)); - } else { - descs.emplace_back(1, 0); - } - } - return descs; -} - -// NB: this vector is really a set, but we want to keep it contiguous in memory for faster access -static std::vector getInputDependencies(Value* output) { - // Run a DFS traversal to find all inputs that affect a given output value - std::vector queue { output }; - std::unordered_set inputs; - std::unordered_set seen; - while (!queue.empty()) { - Value * val = queue.back(); queue.pop_back(); - Node * producer = val->node(); - if (producer->kind() == prim::Param) { - inputs.insert(val); - continue; - } - for (Value * input : producer->inputs()) { - if (/*bool inserted = */seen.insert(input).second) { - queue.push_back(input); - } - } - } - - // Convert Value* into offsets into the graph's input list - std::vector offsets; - offsets.reserve(inputs.size()); - for (Value * input : inputs) { - offsets.push_back(input->offset()); - } - std::sort(offsets.begin(), offsets.end()); - return offsets; -} - -std::vector> FusedKernelCache::getInputBroadcastGroups() { - std::unordered_set, torch::hash>> broadcast_groups; - for (Value * output : graph->outputs()) { - broadcast_groups.insert(getInputDependencies(output)); - } - return std::vector>{ broadcast_groups.begin(), broadcast_groups.end() }; -} - -void FusedKernelCache::run(Stack& stack) { - int64_t num_inputs = graph->inputs().size(); - auto args = fmap(last(stack, num_inputs), [](const IValue& i) { - return i.toTensor(); - }); - - auto maybe_map_size = canRunKernel(args); - if (!maybe_map_size) { - return runFallback(stack); - } - expandArgs(args, *maybe_map_size); - - FusedKernelArgSpec spec { args }; - auto it = kernels.find(spec); - if (it == kernels.end()) { - std::tie(it, std::ignore) = kernels.emplace(spec, compileSpec(spec, *maybe_map_size)); - } - auto & fn = it->second; - - std::vector outputs; - fn->launch(args, outputs); - drop(stack, num_inputs); - stack.insert(stack.end(), std::make_move_iterator(outputs.begin()), - std::make_move_iterator(outputs.end())); -} - -at::optional> FusedKernelCache::getMapSize(at::TensorList args, at::IntList arg_subset) { - int64_t dim_after_broadcast = 0; - for (int64_t arg_idx : arg_subset) { - dim_after_broadcast = std::max(dim_after_broadcast, 
args[arg_idx].dim()); - } - // TODO: this keeps reallocating map_size at every iteration, but we know - // exactly how much storage do we need, so this could be fixed in-place at - // every step. We're just missing a few functions for ATen, but the fix - // should be straightforward. - // NB: we leave this uninitialized, because an empty size is trivially - // broadcastable to any other size. - std::vector map_size; - for (size_t i = 0; i < arg_subset.size(); ++i) { - auto & arg = args.at(arg_subset[i]); - auto & chunk_desc = input_chunks.at(arg_subset[i]); - if (chunk_desc.nSubtensors == 1) { - try { - map_size = at::infer_size(map_size, arg.sizes()); - } catch (std::exception& e) { - return at::nullopt; - } - } else { - auto tensor_sizes = arg.sizes().vec(); - int64_t num_chunks = chunk_desc.nSubtensors; - int64_t dim = at::maybe_wrap_dim(chunk_desc.dim, tensor_sizes.size()); - if (tensor_sizes[dim] % num_chunks != 0) { - return at::nullopt; - } - tensor_sizes[dim] /= num_chunks; - try { - map_size = at::infer_size(map_size, tensor_sizes); - } catch (std::exception& e) { - return at::nullopt; - } - } - } - - return {map_size}; -} - -// See Note [Run-time shape checking code] for more explanation on the algorithm. -at::optional> FusedKernelCache::canRunKernel(at::TensorList args) { - AT_CHECK(args.size() == input_chunks.size(), - "Expected ", input_chunks.size(), " arguments, but got ", args.size()); - - at::optional> map_size; - for (const auto & broadcast_group : input_broadcast_groups) { - if (!map_size) { - map_size = getMapSize(args, broadcast_group); - if (!map_size) { - return at::nullopt; - } - } else { - auto group_map_size = getMapSize(args, broadcast_group); - // NB: this checks that group_map_size is defined AND equal to map_size - if (map_size != group_map_size) { - return at::nullopt; - } - } - } - return map_size; -} - -void FusedKernelCache::runFallback(Stack& stack) { - InterpreterState(fallback_code).runOneStage(stack); -} - -// NB: args are mutated in this call. map_size is mutated too, but is restored to its original -// value before this function returns (it's an optimization). 
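// Concretely (sizes are illustrative): with map_size = [2, 4] and an argument
// that feeds a 3-way chunk along dim 0, the chunk dim is scaled back up so the
// tensor is expanded against [6, 4]; map_size is then divided by 3 again,
// restoring [2, 4] before the next argument is processed.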
-void FusedKernelCache::expandArgs(std::vector& args, std::vector& map_size) { - for (size_t i = 0; i < args.size(); ++i) { - auto & arg = args[i]; - auto & pdesc = input_chunks[i]; - if (pdesc.nSubtensors == 1) { - if (arg.sizes().equals(map_size)) continue; - arg = arg.expand(map_size); - } else { - map_size.at(pdesc.dim) *= pdesc.nSubtensors; - if (!arg.sizes().equals(map_size)) { - arg = arg.expand(map_size); - } - map_size.at(pdesc.dim) /= pdesc.nSubtensors; - } - } -} - -std::unique_ptr FusedKernelCache::compileSpec( - const FusedKernelArgSpec& spec, const std::vector& map_size) { - AnnotatedGraph agraph {*graph, device}; - - agraph.input_desc = spec.descs(); - // XXX: this assumes that fused kernels only operate on floating-point values inside - at::optional scalar_type; - for (TensorDesc& desc : agraph.input_desc) { - if (isFloatingType(desc.scalar_type)) { - scalar_type = desc.scalar_type; - break; - } - } - JIT_ASSERT(scalar_type); - - for (Value * output : graph->outputs()) { - std::vector sizes = map_size; - if (output->node()->kind() == prim::FusedConcat) { - sizes.at(output->node()->i(attr::dim)) *= output->node()->inputs().size(); - } - auto type = CompleteTensorType::create(*scalar_type, device, sizes); - agraph.output_desc.emplace_back(std::move(type)); - } - - std::string name = "kernel_" + std::to_string(next_kernel_id++); - FusedKernel * raw_func; - if (device != kCPUDevice) { -#ifdef USE_CUDA - raw_func = new CUDAFusedKernel(name, agraph); -#else - throw std::runtime_error("cannot compile a CUDA fusion group, CUDA is not enabled."); -#endif - } else { - JIT_ASSERT(compiler.canCompileOnCPU()); - raw_func = new CPUFusedKernel(name, agraph, compiler.config_); - } - return std::unique_ptr(raw_func); -} - -//////////////////////////////////////////////////////////////////////////////// -// FusionCompiler - -std::shared_ptr FusionCompiler::getOrCompile(Node* fusion_group) { - int device = fusion_group->i(attr::device); - if (device == kCPUDevice) { - JIT_ASSERT(canCompileOnCPU()); - } else { -#ifndef USE_CUDA - throw std::runtime_error("cannot compile a CUDA fusion group - CUDA is not enabled."); -#endif - } - auto graph = fusion_group->g(attr::Subgraph)->copy(); - EraseShapeInformation(*graph); - std::stringstream key; - key << "device " << device << "\n"; - key << *graph << "\n"; - std::string key_ = key.str(); - auto it = cache_map.find(key_); - if (it == cache_map.end()) { - std::tie(it, std::ignore) = cache_map.emplace(key_, std::make_shared(*this, graph, device)); - } - return it->second; -} - -std::vector FusionCompiler::debugLaunchGraph(Graph & graph, int device, at::ArrayRef inputs) { - auto wrapper_graph = std::make_shared(); - Node * fusion_group = wrapper_graph->insertNode(wrapper_graph->createFusionGroup(device)); - fusion_group->g_(attr::Subgraph, graph.copy()); - for (size_t i = 0; i < graph.inputs().size(); ++i) { - fusion_group->addInput(wrapper_graph->addInput()); - } - for (size_t i = 0; i < graph.outputs().size(); ++i) { - wrapper_graph->registerOutput(fusion_group->addOutput()); - } - auto cache = getOrCompile(fusion_group); - Stack stack = fmap(inputs); - cache->run(stack); - return fmap(stack, [](const IValue& iv) { return iv.toTensor(); }); -} - -static const std::string check_exists_string = - "which '${program}' > /dev/null"; - -static bool programExists(const std::string & program) { - TemplateEnv env; - env.s("program", program); - std::string cmd = format(check_exists_string, env); - return 0 == system(cmd.c_str()); -} - 
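// Tying the Note [Run-time shape checking code] back to the code above, a
// rough worked example (shapes are illustrative): for a broadcast group whose
// inputs have sizes [4, 1, 3] and [1, 5, 3], getMapSize folds them with
//   at::infer_size({4, 1, 3}, {1, 5, 3})  ->  {4, 5, 3}
// and canRunKernel only admits a compiled kernel if every broadcast group
// folds to that same map size; associativity of broadcasting (Lemma 1) is what
// makes the folding order irrelevant.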
-FusionCompiler::FusionCompiler() { - const char * cxx_env = getenv("CXX"); - if(cxx_env != nullptr) { - config_.cxx = cxx_env; - } - if(!programExists(config_.cxx)) { - config_.cxx = ""; - } - const char * debug_env = getenv("PYTORCH_FUSION_DEBUG"); - config_.debug = debug_env && atoi(debug_env) != 0; -} - -//TODO: thread safety -FusionCompiler & sharedFusionCompiler() { - static FusionCompiler compiler; - return compiler; -} - -}} - -# else -// dummy implementations for windows - -#include "torch/csrc/jit/fusion_compiler.h" -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/code_template.h" -#include "torch/csrc/jit/resource_guard.h" -#include "torch/csrc/utils/disallow_copy.h" -#include "ATen/ATen.h" -#ifdef USE_CUDA -#include "torch/csrc/cuda/cuda_check.h" -#include -#include -#include -#endif -#include -#include -#include -#include -#include -#include - -namespace torch { namespace jit { - -struct FusedKernel { - char padding; -}; - -FusedKernelCache::FusedKernelCache(FusionCompiler& compiler, std::shared_ptr graph, int device) - : compiler(compiler) {} -void FusedKernelCache::run(Stack& inputs) {} -void FusedKernelCache::runFallback(Stack& stack) {} -void FusedKernelCache::expandArgs(std::vector& args, std::vector& map_size) {} -at::optional> FusedKernelCache::canRunKernel(at::TensorList args) { return at::nullopt; } -at::optional> FusedKernelCache::getMapSize(at::TensorList args, at::IntList arg_subset) { return at::nullopt; } -std::vector> FusedKernelCache::getInputBroadcastGroups() { return {}; } -auto FusedKernelCache::getInputChunkDescriptors() -> std::vector { return {}; } -std::unique_ptr FusedKernelCache::compileSpec( - const FusedKernelArgSpec& spec, const std::vector& map_size) { return nullptr; } -std::atomic FusedKernelCache::next_kernel_id {0}; - -FusionCompiler::FusionCompiler() {} -std::shared_ptr FusionCompiler::getOrCompile(Node* fusion_group) { return nullptr; } -std::vector FusionCompiler::debugLaunchGraph(Graph & graph, int device, at::ArrayRef inputs) { return {}; } - -FusionCompiler & sharedFusionCompiler() { - throw std::runtime_error("NYI: fuser is not supported on Windows."); -} - -}} - -# endif diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h deleted file mode 100644 index a24fe0817e0517..00000000000000 --- a/torch/csrc/jit/fusion_compiler.h +++ /dev/null @@ -1,188 +0,0 @@ -#pragma once - -#include -#include "torch/csrc/utils/disallow_copy.h" -#include "torch/csrc/utils/hash.h" -#include -#include -#include -#include - -#include "ATen/ATen.h" -#include -#include -#include -#include -#include - -namespace torch { namespace jit { - -struct FusedKernel; -struct FusionCompiler; - -// type information needed by the compiler for input/outputs -// contiguity[i] is true if the dim i is contiguous with dim i + 1. -// contiguity.back() == true means strides.back() == 1. -struct TensorDesc { - at::ScalarType scalar_type; - std::vector contiguity; - - TensorDesc(const at::ScalarType& type, const std::vector& contiguity) - : scalar_type(type), contiguity(contiguity) { - if (contiguity.size() == 0) { - nDim_ = 0; - } else { - nDim_ = std::count(contiguity.begin(), contiguity.end(), false) + (lastIsContiguous() ? 
1 : 0); - } - } - - TensorDesc(const at::ScalarType& type, const at::IntList& sizes, const at::IntList& strides) - : TensorDesc(type, TensorDesc::findContiguous(sizes, strides)) {} - TensorDesc(const at::Tensor& t) - : TensorDesc(t.type().scalarType(), t.sizes(), t.strides()) {} - TensorDesc(CompleteTensorTypePtr type) - : TensorDesc(type->scalarType(), type->sizes(), type->strides()) {} - - // number of dimensions after contiguity compression - size_t nDim() const { - return nDim_; - } - - // do we have inner stride == 1? - bool lastIsContiguous() const { - return contiguity.size() == 0 || contiguity.back(); - } - - static std::vector findContiguous( - const at::IntList& sizes, - const at::IntList& strides); - - bool operator==(const TensorDesc & desc) const { - return scalar_type == desc.scalar_type && contiguity == desc.contiguity; - } - bool operator!=(const TensorDesc & desc) const { - return !(*this == desc); - } - static size_t hash(const TensorDesc& spec) { - return torch::get_hash(spec.scalar_type, spec.nDim_, std::hash>{}(spec.contiguity)); - } - -private: - size_t nDim_; -}; - -inline std::ostream& operator<<(std::ostream & out, const TensorDesc & d) { - out << d.scalar_type << "["; - for(auto b : d.contiguity) - out << b << ";"; - out << "]"; - return out; -} - -struct FusedKernelArgSpec { - FusedKernelArgSpec(at::TensorList inputs) - : descs_(fmap(inputs)) - , hash_code_(torch::get_hash(inputs.size(), descs_)) {} - - bool operator==(const FusedKernelArgSpec & spec) const { - return hash_code_ == spec.hash_code_ && descs_ == spec.descs_; - } - bool operator!=(const FusedKernelArgSpec & spec) const { - return !(*this == spec); - } - static size_t hash(const FusedKernelArgSpec& spec) { - return spec.hash_code_; - } - const std::vector& descs() const { - return descs_; - } - -private: - std::vector descs_; - size_t hash_code_; -}; - -constexpr int kCPUDevice = -1; -struct AnnotatedGraph { - // short-term storage only, so it borrows Graph. - AnnotatedGraph(Graph & graph, int device) - : graph(&graph), device(device) {} - Graph* graph = nullptr; // TODO: this should really be const - int device = kCPUDevice; - std::vector input_desc; - std::vector output_desc; -}; - -// FusionCompiler has very limited shape information available at the time getOrCompile -// is called, and this is why it can't really prepare the kernels at that time. Instead, -// it returns this object, which will take care of matching the run-time shapes to whatever -// kernels we have compiled already. -// -// Two configurations are considered eligible for the same fused kernel if: -// - the shapes satisfy graph invariants for our fused code (e.g. that all intermediate shapes -// are the same - see fusion_compiler.cpp for more details). 
-// - their FusedKernelArgSpecs compare equal -struct FusedKernelCache { - FusedKernelCache(FusionCompiler& compiler, std::shared_ptr graph, int device); - - void run(Stack& inputs); -private: - struct PartitionInfo { - PartitionInfo(int64_t nsub, int64_t dim) - : nSubtensors(nsub), dim(dim) {}; - int64_t nSubtensors; - int64_t dim; - }; - - void runFallback(Stack& stack); - void expandArgs(std::vector& args, std::vector& map_size); - at::optional> canRunKernel(at::TensorList args); - at::optional> getMapSize(at::TensorList args, at::IntList arg_subset); - std::vector> getInputBroadcastGroups(); - std::vector getInputChunkDescriptors(); - std::unique_ptr compileSpec( - const FusedKernelArgSpec& spec, const std::vector& map_size); - - static std::atomic next_kernel_id; - - int device; - Code fallback_code; - FusionCompiler& compiler; - std::shared_ptr graph; - std::vector> input_broadcast_groups; - std::vector input_chunks; - std::unordered_map, torch::hash> kernels; -}; - -struct FusionCompilerConfig { - std::string cxx = "g++"; // compiler location - bool debug = false; // emit debugging information about fusions - bool openmp = true; -}; - -// caching compiler -struct FusionCompiler { - friend struct FusedKernelCache; - - FusionCompiler(); - TH_DISALLOW_COPY_AND_ASSIGN(FusionCompiler); - - // uses type annotations in fusion_group to create Annotated graph - std::shared_ptr getOrCompile(Node * fusion_group); - - // debugging function that lets you do everything from compilation to execution - // in one step. - // this should not be used in the hot path of execution because it has to serialize - // the graph each time - std::vector debugLaunchGraph(Graph & graph, int device, at::ArrayRef inputs); - bool canCompileOnCPU() const { - return config_.cxx.size() > 0; - } -private: - FusionCompilerConfig config_; - std::unordered_map> cache_map; -}; - -FusionCompiler & sharedFusionCompiler(); - -}} diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 5484c04716c406..ab9a9ee4a81518 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -33,6 +33,7 @@ #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/serialization.h" #include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/fusers/interface.h" #include @@ -115,7 +116,8 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_onnx_block", BlockToONNX) .def("_jit_pass_fixup_onnx_loops", FixupONNXLoops) .def("_jit_pass_canonicalize_ops", CanonicalizeOps) - .def("_jit_pass_specialize_undef", specializeUndef) + .def("_jit_pass_specialize_undef", specializeUndef) + .def("_jit_override_can_fuse_on_cpu", &overrideCanFuseOnCPU) .def("_jit_differentiate", [](Graph &g) { // the python binding slightly differs in semantics // it makes a copy of the input Graph, and works on that diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 6864d8ad797087..d7e8c31f8cb5cf 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -6,7 +6,6 @@ #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/fusion_compiler.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/ivalue.h" diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 81a1768f48a302..e8123690789b24 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -1,7 +1,7 @@ #include 
"torch/csrc/jit/passes/graph_fuser.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" #include "torch/csrc/jit/symbolic_variable.h" -#include "torch/csrc/jit/fusion_compiler.h" +#include "torch/csrc/jit/fusers/interface.h" #include "torch/csrc/jit/autodiff.h" #include "torch/csrc/jit/assertions.h" #include "ATen/ExpandUtils.h" @@ -343,7 +343,7 @@ struct GraphFuser { // is that if we're compiling on CPU, the fusion compiler works. if (consumer_device.type() == DeviceType::CPU || producer_device.type() == DeviceType::CPU) { - return sharedFusionCompiler().canCompileOnCPU(); + return canFuseOnCPU(); } return true; } diff --git a/torch/csrc/jit/python_interpreter.cpp b/torch/csrc/jit/python_interpreter.cpp index 85f0c37c4ea7e3..5c2115a20413d8 100644 --- a/torch/csrc/jit/python_interpreter.cpp +++ b/torch/csrc/jit/python_interpreter.cpp @@ -5,7 +5,6 @@ #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/fusion_compiler.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/jit/custom_operator.h" #include "torch/csrc/jit/graph_executor.h" diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 13f5f6dbbad275..42184ac318ad58 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -3,7 +3,7 @@ #include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/fusion_compiler.h" +#include "torch/csrc/jit/fusers/interface.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/operator.h" @@ -53,10 +53,10 @@ RegisterOperators reg({ Operator( prim::FusionGroup, [](Node* node) { - auto kernel_cache = sharedFusionCompiler().getOrCompile(node); - return [kernel_cache](Stack& stack) { + auto handle = getFusionHandle(node); + return [handle](Stack& stack) { autograd::profiler::RecordFunction record("FusionGroup"); - kernel_cache->run(stack); + handle->run(stack); return 0; }; }), diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 7a652970243f34..28bf95847d634f 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -12,7 +12,7 @@ using Catch::StartsWith; #endif #include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/fusion_compiler.h" +#include "torch/csrc/jit/fusers/interface.h" #include "torch/csrc/jit/code_template.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/attributes.h" @@ -136,8 +136,6 @@ Value * appendNewNode(NodeKind kind, Graph& graph, ArrayRef inputs) { static void fusionTests() { - FusionCompiler comp; - auto testSimple = [&] { Graph graph; Var i0 = Var::asNewInput(graph); @@ -147,7 +145,7 @@ static void fusionTests() { auto a = at::rand({3,4}, at::kCUDA); auto b = at::rand({4,3}, at::kCUDA).transpose(0,1); auto o = at::zeros({3,4}, at::kCUDA); - auto outputs = comp.debugLaunchGraph(graph, 0, {a,b}); + auto outputs = debugLaunchGraph(graph, 0, {a,b}); REQUIRE(outputs.size() == 1); auto o2 = a*b; float max_diff = (o2 - outputs[0]).abs().max().toCDouble(); @@ -201,7 +199,7 @@ static void fusionTests() { auto t5 = out1.tanh(); auto out0 = t16*t5; - auto outputs = comp.debugLaunchGraph(graph, 0, inputs); + auto outputs = debugLaunchGraph(graph, 0, inputs); REQUIRE(outputs.size() == graph.outputs().size()); REQUIRE(out0.is_same_size(outputs.front())); float max_diff = 
(outputs.front() - out0).abs().max().toCDouble(); @@ -235,7 +233,7 @@ static void fusionTests() { auto o_r = a*b; auto o2_r = at::cat({a, o_r}, dim); - auto outputs = comp.debugLaunchGraph(graph, 0, {a,b}); + auto outputs = debugLaunchGraph(graph, 0, {a,b}); REQUIRE(outputs.size() == 2); float max_diff = (o_r - outputs[0]).abs().max().toCDouble(); diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index e4543a6be12b87..5f6a1c1e011167 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -315,6 +315,7 @@ def __init__(self, graph_diff_error, tensor_compare_error, extra_msg=None): # Check the traced module against a set of user-provided validation inputs +@torch.no_grad() def _check_trace(check_inputs, func, executor_options, module, check_tolerance): for inputs in check_inputs: if isinstance(inputs, torch.Tensor): From 8e3f8c52e85ee91995f8a9137a4649f5c39dfc3a Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Fri, 14 Sep 2018 15:06:12 -0700 Subject: [PATCH 167/237] Document the Sequential module (#11648) Summary: 1. Document the Sequential module in the C++ API at a high, why-does-this-exist, and low, how-to-use, level 2. Change the Sequential tests to be in a style that makes them easier to convert to gtest. No code changes. ebetica ezyang apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11648 Differential Revision: D9834526 Pulled By: goldsborough fbshipit-source-id: 39f2f5c6cbbf8ed5a1b69986978c8ef127036de1 --- docs/cpp/Doxyfile | 4 +- test/cpp/api/sequential.cpp | 541 +++++++++--------- torch/csrc/api/include/torch/jit.h | 2 +- torch/csrc/api/include/torch/nn/module.h | 8 +- torch/csrc/api/include/torch/nn/modules/any.h | 8 +- .../api/include/torch/nn/modules/sequential.h | 123 +++- 6 files changed, 396 insertions(+), 290 deletions(-) diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile index a21dcf8184ad52..3c942a9025c2dd 100644 --- a/docs/cpp/Doxyfile +++ b/docs/cpp/Doxyfile @@ -419,7 +419,7 @@ EXTRACT_ALL = YES # be included in the documentation. # The default value is: NO. -EXTRACT_PRIVATE = YES +EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. @@ -799,7 +799,7 @@ INPUT_ENCODING = UTF-8 # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, # *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. -FILE_PATTERNS = *.h *.cpp +FILE_PATTERNS = *.h # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. 
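The restructured tests below double as usage documentation for Sequential; a
minimal sketch of the pattern they exercise (layer sizes are illustrative):

    #include <torch/torch.h>

    void sequential_example() {
      // Chain a few standard modules; forward() threads the input through
      // them in order and returns the last module's output.
      torch::nn::Sequential sequential(
          torch::nn::Linear(10, 3),
          torch::nn::Functional(torch::relu),
          torch::nn::Linear(3, 5));

      auto x = torch::randn({8, 10});
      auto y = sequential->forward(x);  // y has shape [8, 5]
    }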
diff --git a/test/cpp/api/sequential.cpp b/test/cpp/api/sequential.cpp index 389393fe480b21..aef1332720e4a4 100644 --- a/test/cpp/api/sequential.cpp +++ b/test/cpp/api/sequential.cpp @@ -1,11 +1,16 @@ #include #include +#include +#include +#include #include +#include #include #include #include +#include #include #include @@ -16,298 +21,304 @@ using namespace torch::test; using Catch::StartsWith; -TEST_CASE("sequential") { - SECTION("construction from shared pointer") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int value; - int forward() { - return value; - } - }; - Sequential sequential( - std::make_shared(1), std::make_shared(2), std::make_shared(3)); - REQUIRE(sequential->size() == 3); - } - SECTION("construction from concrete type") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int value; - int forward() { - return value; - } - }; - - Sequential sequential(M(1), M(2), M(3)); - REQUIRE(sequential->size() == 3); - } - SECTION("construction from module holders") { - struct MImpl : torch::nn::Module { - explicit MImpl(int value_) : value(value_) {} - int forward() { - return value; - } - int value; - }; - - struct M : torch::nn::ModuleHolder { - using torch::nn::ModuleHolder::ModuleHolder; - using torch::nn::ModuleHolder::get; - }; - - Sequential sequential(M(1), M(2), M(3)); - REQUIRE(sequential->size() == 3); - } - SECTION("push_back") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int forward() { - return value; - } - int value; - }; - Sequential sequential; - REQUIRE(sequential->size() == 0); - REQUIRE(sequential->is_empty()); - sequential->push_back(Linear(3, 4)); - REQUIRE(sequential->size() == 1); - sequential->push_back(std::make_shared(1)); - REQUIRE(sequential->size() == 2); - sequential->push_back(M(2)); - REQUIRE(sequential->size() == 3); - } - SECTION("access") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int forward() { - return value; - } - int value; - }; - std::vector> modules = { - std::make_shared(1), std::make_shared(2), std::make_shared(3)}; - - Sequential sequential; - for (auto& module : modules) { - sequential->push_back(module); - } - REQUIRE(sequential->size() == 3); - - SECTION("at()") { - SECTION("returns the correct module for a given index") { - for (size_t i = 0; i < modules.size(); ++i) { - REQUIRE(&sequential->at(i) == modules[i].get()); - } - } - SECTION("throws for a bad index") { - REQUIRE_THROWS_WITH( - sequential->at(modules.size() + 1), - StartsWith("Index out of range")); - REQUIRE_THROWS_WITH( - sequential->at(modules.size() + 1000000), - StartsWith("Index out of range")); - } +TEST_CASE("Sequential/ConstructsFromSharedPointer") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int value; + int forward() { + return value; } + }; + Sequential sequential( + std::make_shared(1), std::make_shared(2), std::make_shared(3)); + REQUIRE(sequential->size() == 3); +} - SECTION("ptr()") { - SECTION("returns the correct module for a given index") { - for (size_t i = 0; i < modules.size(); ++i) { - REQUIRE(sequential->ptr(i).get() == modules[i].get()); - REQUIRE(sequential[i].get() == modules[i].get()); - REQUIRE(sequential->ptr(i).get() == modules[i].get()); - } - } - SECTION("throws for a bad index") { - REQUIRE_THROWS_WITH( - sequential->ptr(modules.size() + 1), - StartsWith("Index out of range")); - REQUIRE_THROWS_WITH( - sequential->ptr(modules.size() + 1000000), - StartsWith("Index out of 
range")); - } +TEST_CASE("Sequential/ConstructsFromConcreteType") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int value; + int forward() { + return value; } - } - SECTION("forward") { - SECTION("calling forward() on an empty sequential is disallowed") { - Sequential empty; - REQUIRE_THROWS_WITH( - empty->forward(), - StartsWith("Cannot call forward() on an empty Sequential")); + }; + + Sequential sequential(M(1), M(2), M(3)); + REQUIRE(sequential->size() == 3); +} +TEST_CASE("Sequential/ConstructsFromModuleHolder") { + struct MImpl : torch::nn::Module { + explicit MImpl(int value_) : value(value_) {} + int forward() { + return value; } + int value; + }; - SECTION("calling forward() on a non-empty sequential chains correctly") { - struct MockModule : torch::nn::Module { - explicit MockModule(int value) : expected(value) {} - int expected; - int forward(int value) { - REQUIRE(value == expected); - return value + 1; - } - }; + struct M : torch::nn::ModuleHolder { + using torch::nn::ModuleHolder::ModuleHolder; + using torch::nn::ModuleHolder::get; + }; - Sequential sequential(MockModule{1}, MockModule{2}, MockModule{3}); + Sequential sequential(M(1), M(2), M(3)); + REQUIRE(sequential->size() == 3); +} - REQUIRE(sequential->forward(1) == 4); +TEST_CASE("Sequential/PushBackAddsAnElement") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int forward() { + return value; } + int value; + }; + Sequential sequential; + REQUIRE(sequential->size() == 0); + REQUIRE(sequential->is_empty()); + sequential->push_back(Linear(3, 4)); + REQUIRE(sequential->size() == 1); + sequential->push_back(std::make_shared(1)); + REQUIRE(sequential->size() == 2); + sequential->push_back(M(2)); + REQUIRE(sequential->size() == 3); +} - SECTION("calling forward() with the wrong return type throws") { - struct M : public torch::nn::Module { - int forward() { - return 5; - } - }; - - Sequential sequential(M{}); - REQUIRE(sequential->forward() == 5); - REQUIRE_THROWS_WITH( - sequential->forward(), - StartsWith("The type of the return value " - "is int, but you asked for type float")); +TEST_CASE("Sequential/AccessWithAt") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int forward() { + return value; } + int value; + }; + std::vector> modules = { + std::make_shared(1), std::make_shared(2), std::make_shared(3)}; + + Sequential sequential; + for (auto& module : modules) { + sequential->push_back(module); + } + REQUIRE(sequential->size() == 3); - SECTION("The return type of forward() defaults to Tensor") { - struct M : public torch::nn::Module { - torch::Tensor forward(torch::Tensor v) { - return v; - } - }; - - Sequential sequential(M{}); - auto variable = torch::ones({3, 3}, torch::requires_grad()); - REQUIRE(sequential->forward(variable).equal(variable)); - } + // returns the correct module for a given index + for (size_t i = 0; i < modules.size(); ++i) { + REQUIRE(&sequential->at(i) == modules[i].get()); } - SECTION("returns the last value") { - torch::manual_seed(0); - Sequential sequential(Linear(10, 3), Linear(3, 5), Linear(5, 100)); + // throws for a bad index + REQUIRE_THROWS_WITH( + sequential->at(modules.size() + 1), StartsWith("Index out of range")); + REQUIRE_THROWS_WITH( + sequential->at(modules.size() + 1000000), + StartsWith("Index out of range")); +} - auto x = torch::randn({1000, 10}, torch::requires_grad()); - auto y = sequential->forward(x); - REQUIRE(y.ndimension() == 2); - REQUIRE(y.size(0) == 1000); - 
REQUIRE(y.size(1) == 100); +TEST_CASE("Sequential/AccessWithPtr") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int forward() { + return value; + } + int value; + }; + std::vector> modules = { + std::make_shared(1), std::make_shared(2), std::make_shared(3)}; + + Sequential sequential; + for (auto& module : modules) { + sequential->push_back(module); } + REQUIRE(sequential->size() == 3); - SECTION("can hold other important modules") { - Sequential sequential( - Linear(10, 3), - Conv2d(1, 2, 3), - Dropout(0.5), - BatchNorm(5), - Embedding(4, 10), - LSTM(4, 5)); + // returns the correct module for a given index + for (size_t i = 0; i < modules.size(); ++i) { + REQUIRE(sequential->ptr(i).get() == modules[i].get()); + REQUIRE(sequential[i].get() == modules[i].get()); + REQUIRE(sequential->ptr(i).get() == modules[i].get()); } - SECTION("converts at::Tensor to torch::Tensor correctly") { - struct M : torch::nn::Module { - torch::Tensor forward(torch::Tensor input) { - return input; - } - }; - - Sequential sequential(M{}); - torch::Tensor variable = torch::ones(5); - REQUIRE(sequential->forward(variable).sum().toCFloat() == 5); - - at::Tensor tensor_that_is_actually_a_variable = variable * 2; - REQUIRE( - sequential->forward(tensor_that_is_actually_a_variable) - .sum() - .toCFloat() == 10); - } - SECTION("extend() pushes modules from other Sequential") { - struct A : torch::nn::Module { - int forward(int x) { - return x; - } - }; - struct B : torch::nn::Module { - int forward(int x) { - return x; - } - }; - struct C : torch::nn::Module { - int forward(int x) { - return x; - } - }; - struct D : torch::nn::Module { - int forward(int x) { - return x; - } - }; - Sequential a(A{}, B{}); - Sequential b(C{}, D{}); - a->extend(*b); - - REQUIRE(a->size() == 4); - REQUIRE(a[0]->as()); - REQUIRE(a[1]->as()); - REQUIRE(a[2]->as()); - REQUIRE(a[3]->as()); - - REQUIRE(b->size() == 2); - REQUIRE(b[0]->as()); - REQUIRE(b[1]->as()); - - std::vector> c = {std::make_shared(), - std::make_shared()}; - b->extend(c); - - REQUIRE(b->size() == 4); - REQUIRE(b[0]->as()); - REQUIRE(b[1]->as()); - REQUIRE(b[2]->as()); - REQUIRE(b[3]->as()); - } - SECTION("has reference semantics") { - Sequential first(Linear(2, 3), Linear(4, 4), Linear(4, 5)); - Sequential second(first); - - REQUIRE(first.get() == second.get()); - REQUIRE(first->size() == second->size()); - REQUIRE(std::equal( - first->begin(), - first->end(), - second->begin(), - [](const AnyModule& first, const AnyModule& second) { - return &first == &second; - })); - } - SECTION("Is cloneable") { - Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3)); - Sequential clone = - std::dynamic_pointer_cast(sequential->clone()); - REQUIRE(sequential->size() == clone->size()); - - for (size_t i = 0; i < sequential->size(); ++i) { - // The modules should be the same kind (type). - REQUIRE(sequential[i]->name() == clone[i]->name()); - // But not pointer-equal (distinct objects). 
- REQUIRE(sequential[i] != clone[i]); + // throws for a bad index + REQUIRE_THROWS_WITH( + sequential->ptr(modules.size() + 1), StartsWith("Index out of range")); + REQUIRE_THROWS_WITH( + sequential->ptr(modules.size() + 1000000), + StartsWith("Index out of range")); +} + +TEST_CASE("Sequential/CallingForwardOnEmptySequentialIsDisallowed") { + Sequential empty; + REQUIRE_THROWS_WITH( + empty->forward(), + StartsWith("Cannot call forward() on an empty Sequential")); +} + +TEST_CASE("Sequential/CallingForwardChainsCorrectly") { + struct MockModule : torch::nn::Module { + explicit MockModule(int value) : expected(value) {} + int expected; + int forward(int value) { + REQUIRE(value == expected); + return value + 1; } + }; - // Verify that the clone is deep, i.e. parameters of modules are cloned too. + Sequential sequential(MockModule{1}, MockModule{2}, MockModule{3}); - torch::NoGradGuard no_grad; + REQUIRE(sequential->forward(1) == 4); +} - auto params1 = sequential->parameters(); - auto params2 = clone->parameters(); - REQUIRE(params1.size() == params2.size()); - for (auto& param : params1) { - REQUIRE(!pointer_equal(param.value, params2[param.key])); - REQUIRE(param->device() == params2[param.key].device()); - REQUIRE(param->allclose(params2[param.key])); - param->add_(2); +TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") { + struct M : public torch::nn::Module { + int forward() { + return 5; } - for (auto& param : params1) { - REQUIRE(!param->allclose(params2[param.key])); + }; + + Sequential sequential(M{}); + REQUIRE(sequential->forward() == 5); + REQUIRE_THROWS_WITH( + sequential->forward(), + StartsWith("The type of the return value " + "is int, but you asked for type float")); +} + +TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") { + struct M : public torch::nn::Module { + torch::Tensor forward(torch::Tensor v) { + return v; } + }; + + Sequential sequential(M{}); + auto variable = torch::ones({3, 3}, torch::requires_grad()); + REQUIRE(sequential->forward(variable).equal(variable)); +} + +TEST_CASE("Sequential/ForwardReturnsTheLastValue") { + torch::manual_seed(0); + Sequential sequential(Linear(10, 3), Linear(3, 5), Linear(5, 100)); + + auto x = torch::randn({1000, 10}, torch::requires_grad()); + auto y = sequential->forward(x); + REQUIRE(y.ndimension() == 2); + REQUIRE(y.size(0) == 1000); + REQUIRE(y.size(1) == 100); +} + +TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") { + Sequential sequential( + Linear(10, 3), + Conv2d(1, 2, 3), + Dropout(0.5), + BatchNorm(5), + Embedding(4, 10), + LSTM(4, 5)); +} + +TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") { + struct A : torch::nn::Module { + int forward(int x) { + return x; + } + }; + struct B : torch::nn::Module { + int forward(int x) { + return x; + } + }; + struct C : torch::nn::Module { + int forward(int x) { + return x; + } + }; + struct D : torch::nn::Module { + int forward(int x) { + return x; + } + }; + Sequential a(A{}, B{}); + Sequential b(C{}, D{}); + a->extend(*b); + + REQUIRE(a->size() == 4); + REQUIRE(a[0]->as()); + REQUIRE(a[1]->as()); + REQUIRE(a[2]->as()); + REQUIRE(a[3]->as()); + + REQUIRE(b->size() == 2); + REQUIRE(b[0]->as()); + REQUIRE(b[1]->as()); + + std::vector> c = {std::make_shared(), + std::make_shared()}; + b->extend(c); + + REQUIRE(b->size() == 4); + REQUIRE(b[0]->as()); + REQUIRE(b[1]->as()); + REQUIRE(b[2]->as()); + REQUIRE(b[3]->as()); +} + +TEST_CASE("Sequential/HasReferenceSemantics") { + Sequential first(Linear(2, 3), Linear(4, 4), Linear(4, 
5)); + Sequential second(first); + + REQUIRE(first.get() == second.get()); + REQUIRE(first->size() == second->size()); + REQUIRE(std::equal( + first->begin(), + first->end(), + second->begin(), + [](const AnyModule& first, const AnyModule& second) { + return &first == &second; + })); +} + +TEST_CASE("Sequential/IsCloneable") { + Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3)); + Sequential clone = + std::dynamic_pointer_cast(sequential->clone()); + REQUIRE(sequential->size() == clone->size()); + + for (size_t i = 0; i < sequential->size(); ++i) { + // The modules should be the same kind (type). + REQUIRE(sequential[i]->name() == clone[i]->name()); + // But not pointer-equal (distinct objects). + REQUIRE(sequential[i] != clone[i]); } + + // Verify that the clone is deep, i.e. parameters of modules are cloned too. + + torch::NoGradGuard no_grad; + + auto params1 = sequential->parameters(); + auto params2 = clone->parameters(); + REQUIRE(params1.size() == params2.size()); + for (auto& param : params1) { + REQUIRE(!pointer_equal(param.value, params2[param.key])); + REQUIRE(param->device() == params2[param.key].device()); + REQUIRE(param->allclose(params2[param.key])); + param->add_(2); + } + for (auto& param : params1) { + REQUIRE(!param->allclose(params2[param.key])); + } +} + +TEST_CASE("Sequential/RegistersElementsAsSubmodules") { + Sequential sequential(Linear(10, 3), Conv2d(1, 2, 3), FeatureDropout(0.5)); + + auto modules = sequential->modules(); + REQUIRE(modules.size() == sequential->children().size()); + + REQUIRE(modules[0]->as()); + REQUIRE(modules[1]->as()); + REQUIRE(modules[2]->as()); } -TEST_CASE("sequential/clone-to-device", "[cuda]") { +TEST_CASE("Sequential/CloneToDevice", "[cuda]") { Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3)); torch::Device device(torch::kCUDA, 0); Sequential clone = diff --git a/torch/csrc/api/include/torch/jit.h b/torch/csrc/api/include/torch/jit.h index 5a4b262101ab68..d43a22c3e9f50d 100644 --- a/torch/csrc/api/include/torch/jit.h +++ b/torch/csrc/api/include/torch/jit.h @@ -17,7 +17,7 @@ namespace jit { /// /// For example: /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// auto module = torch::jit::compile(R"JIT( /// def relu_script(a, b): /// return torch.relu(a + b) diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index ccfa10a90ad2a4..2cb71e3470574e 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -225,8 +225,8 @@ class Module { /// } /// } /// - /// MyModule module; - /// module->modules().apply(initialize_weights); + /// MyModule module; + /// module->modules().apply(initialize_weights); /// \endrst template typename ModuleType::ContainedType* as() noexcept; @@ -244,8 +244,8 @@ class Module { /// } /// } /// - /// MyModule module; - /// module->modules().apply(initialize_weights); + /// MyModule module; + /// module->modules().apply(initialize_weights); /// \endrst template < typename ModuleType, diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 626149469b5035..512e6d2cc1c5af 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -38,7 +38,7 @@ namespace nn { /// Example: /// /// \rst -/// .. code-block:: +/// .. 
code-block:: cpp /// struct GenericTrainer { /// torch::nn::AnyModule module; /// @@ -57,7 +57,7 @@ namespace nn { /// `AnyModule` will compile, but throw an exception at runtime: /// /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); /// // Linear takes a tensor as input, but we are passing an integer. /// // This will compile, but throw a `torch::Error` exception at runtime. @@ -79,7 +79,7 @@ namespace nn { /// for example. /// /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); /// auto output = module.forward(torch::ones({2, 3})); /// @@ -97,7 +97,7 @@ namespace nn { /// using `.get()` where `T` is the concrete module type. /// /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); /// std::shared_ptr ptr = module.ptr(); /// torch::nn::Linear linear(module.get()); diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index 384afd1b7814bb..e4839ac41a910b 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -18,18 +18,81 @@ namespace torch { namespace nn { -/// A `Sequential` module is a container for any number of other modules. Its -/// `forward()` method chains outputs to inputs and returns the final output. -/// The `Sequential` class reference semantics. + +/// A list of `Module`s that acts as a `Module` itself. +/// +/// A `Sequential` is fundamentally a list of `Module`s, each with a `forward()` +/// method. `Sequential` provides a `forward()` method of its own, which accepts +/// any input and forwards it to the first module it stores. It then "chains" +/// outputs to inputs sequentially for each subsequent module, finally returning +/// the output of the last module. For example: +/// +/// \rst +/// .. code-block:: cpp +/// +/// torch::nn::Sequential seq( +/// torch::nn::Linear(3, 4), +/// torch::nn::BatchNorm(4), +/// torch::nn::Dropout(0.5) +/// ); +/// +/// auto output = seq->forward(torch::ones(3)); +/// +/// \endrst +/// +/// This can conceptually be thought of as the following loop (using Python as +/// pseudocode): +/// +/// \rst +/// .. code-block:: python +/// +/// def forward(sequential, input): +/// for module in sequential: +/// input = module(input) +/// return input +/// +/// \endrst +/// +/// Why should you use `Sequential` instead of a simple `std::vector`? The value +/// a `Sequential` provides over manually calling a sequence of modules is that +/// it allows treating the whole container *as a single module*, such that +/// performing a transformation on the `Sequential` applies to each of the +/// modules it stores (which are each a registered submodule of the +/// `Sequential`). For example, calling +/// `.to(torch::kCUDA)` on a `Sequential` will move each module in the list to +/// CUDA memory. For example: +/// +/// \rst +/// .. code-block:: cpp +/// +/// torch::nn::Sequential seq( +/// torch::nn::Linear(3, 4), +/// torch::nn::BatchNorm(4), +/// torch::nn::Dropout(0.5) +/// ); +/// +/// // Convert all modules to CUDA. +/// seq->to(torch::kCUDA); +/// +/// \endrst +/// +/// Finally, `Sequential` provides a lightweight container API, such as allowing +/// iteration over submodules, positional access, adding a new module after +/// construction via `push_back`, as well as joining two `Sequential`s via +/// `extend`. +/// +/// \rst +/// .. 
attention:: +/// One current limitation of `Sequential` is that all except the first module +/// must accept a single argument. If your modules need to take multiple +/// arguments, you should define them to take and return tuples. +/// \endrst class SequentialImpl : public Cloneable { public: using Iterator = std::vector::iterator; using ConstIterator = std::vector::const_iterator; - /// Constructs the `Sequential` from a pack of modules. Each module can either - /// be a plain value (e.g. `Linear`) or a boxed value (e.g. - /// `shared_ptr`). Unboxed modules will be moved into `shared_ptr`s - /// internally. + /// Constructs the `Sequential` from a variadic list of modules. template explicit SequentialImpl(Modules&&... modules) { modules_.reserve(sizeof...(Modules)); @@ -47,19 +110,43 @@ class SequentialImpl : public Cloneable { return clone; } - /// `reset()` is empty for `Sequential`, since it does not have parameter of + /// `reset()` is empty for `Sequential`, since it does not have parameters of /// its own. void reset() override {} - /// Feeds the `inputs` to the first module, then chains the output of each - /// module with the input of the next, in order of construction. - template - ReturnType forward(ArgumentTypes&&... arguments) { + /// Feeds `inputs` to the first module and then chains outputs to inputs, + /// returning the last output. + /// + /// Conceptually the following loop in Python: + /// + /// \rst + /// .. code-block:: python + /// + /// def forward(sequential, input): + /// for module in sequential: + /// input = module(input) + /// return input + /// + /// \endrst + /// + /// The return type is taken as the first template parameter. It defaults to + /// `Tensor`. If the last module in the `Sequential` returns another type `T`, + /// you should call `forward(inputs)` instead of just `forward(inputs)`: + /// + /// \rst + /// .. code-block:: cpp + /// + /// torch::Tensor tensor = sequential1->forward(inputs); + /// int integer = sequential2->forward(inputs); + /// float value = sequential3->forward(inputs); + /// + /// \endrst + template + ReturnType forward(InputTypes&&... inputs) { AT_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); auto iterator = modules_.begin(); - auto input = - iterator->any_forward(std::forward(arguments)...); + auto input = iterator->any_forward(std::forward(inputs)...); for (++iterator; iterator != modules_.end(); ++iterator) { input = iterator->any_forward(std::move(input)); @@ -126,6 +213,8 @@ class SequentialImpl : public Cloneable { Iterator begin() { return modules_.begin(); } + + /// Returns a const iterator to the start of the `Sequential`. ConstIterator begin() const { return modules_.begin(); } @@ -134,6 +223,8 @@ class SequentialImpl : public Cloneable { Iterator end() { return modules_.end(); } + + /// Returns a const iterator to the end of the `Sequential`. ConstIterator end() const { return modules_.end(); } @@ -227,6 +318,10 @@ class SequentialImpl : public Cloneable { std::vector modules_; }; +/// A `ModuleHolder` subclass for `SequentialImpl`. +/// See the documentation for `SequentialImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. 
TORCH_MODULE(Sequential); } // namespace nn } // namespace torch From d24bcfd9304f0e38c4024620de4da8f2379f6956 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Fri, 14 Sep 2018 15:32:17 -0700 Subject: [PATCH 168/237] Suppress hiprand "duplicate-decl-specifier" warning (#11698) Summary: Otherwise each build produces 65MB of warnings log, which makes the CI hard to debug. iotamudelta Jorghi12 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11698 Differential Revision: D9840356 Pulled By: bddppq fbshipit-source-id: b69bf6a5c38a97b188221f9c084c608ffc9b37c8 --- cmake/Dependencies.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 551a84180b087a..dbec0d07ebf6e3 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -542,6 +542,7 @@ if(NOT BUILD_ATEN_MOBILE) set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-negative") set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-overflow") set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-unused-command-line-argument") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-duplicate-decl-specifier") set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDES}) From 8e76dcf173f7fb28c2736b86b38a829531496bab Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Fri, 14 Sep 2018 15:55:35 -0700 Subject: [PATCH 169/237] Prevent raising KeyboardInterrupt in worker (#11718) Summary: Current behavior is that each process (main and workers) will print trace from `KeyboardInterrupt`. And the main process will also print ``` RuntimeError: DataLoader worker (pid 46045) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with nm_workers=0 may give better error trace. ``` due to our SIGCLD handler. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11718 Differential Revision: D9840844 Pulled By: SsnL fbshipit-source-id: 1a05060bb02907fef5aac3f274d2c84f9f42d187 --- torch/utils/data/dataloader.py | 76 ++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 0874fd1185028d..4618e731406e68 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -73,49 +73,53 @@ def is_alive(self): def _worker_loop(dataset, index_queue, data_queue, done_event, collate_fn, seed, init_fn, worker_id): - global _use_shared_memory - _use_shared_memory = True + try: + global _use_shared_memory + _use_shared_memory = True - # Intialize C side signal handlers for SIGBUS and SIGSEGV. Python signal - # module's handlers are executed after Python returns from C low-level - # handlers, likely when the same fatal signal happened again already. - # https://docs.python.org/3/library/signal.html Sec. 18.8.1.1 - _set_worker_signal_handlers() + # Intialize C side signal handlers for SIGBUS and SIGSEGV. Python signal + # module's handlers are executed after Python returns from C low-level + # handlers, likely when the same fatal signal happened again already. + # https://docs.python.org/3/library/signal.html Sec. 18.8.1.1 + _set_worker_signal_handlers() - torch.set_num_threads(1) - random.seed(seed) - torch.manual_seed(seed) + torch.set_num_threads(1) + random.seed(seed) + torch.manual_seed(seed) - # Do not wait for putting thread to join when this worker exits. 
Otherwise, - # this worker may always be waiting to put and doesn't check index_queue - # and done_event for termination signal. - data_queue.cancel_join_thread() + # Do not wait for putting thread to join when this worker exits. + # Otherwise, this worker may always be waiting to put and doesn't check + # index_queue and done_event for termination signal. + data_queue.cancel_join_thread() - if init_fn is not None: - init_fn(worker_id) + if init_fn is not None: + init_fn(worker_id) - watchdog = ManagerWatchdog() + watchdog = ManagerWatchdog() - while True: - try: - r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL) - except queue.Empty: - if watchdog.is_alive() and not done_event.is_set(): - continue - else: + while True: + try: + r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL) + except queue.Empty: + if watchdog.is_alive() and not done_event.is_set(): + continue + else: + break + # use done_event so that we can get faster exiting signal even if there + # are still indices in index_queue + if r is None or done_event.is_set(): break - # use done_event so that we can get faster exiting signal even if there - # are still indices in index_queue - if r is None or done_event.is_set(): - break - idx, batch_indices = r - try: - samples = collate_fn([dataset[i] for i in batch_indices]) - except Exception: - data_queue.put((idx, ExceptionWrapper(sys.exc_info()))) - else: - data_queue.put((idx, samples)) - del samples + idx, batch_indices = r + try: + samples = collate_fn([dataset[i] for i in batch_indices]) + except Exception: + data_queue.put((idx, ExceptionWrapper(sys.exc_info()))) + else: + data_queue.put((idx, samples)) + del samples + except KeyboardInterrupt: + # Main process will raise KeyboardInterrupt anyways. + pass def _pin_memory_loop(in_queue, out_queue, done_event, pin_memory, device_id): From 2c8a1b957ea19c25d672ccc5e51047279e68d85d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 16:34:15 -0700 Subject: [PATCH 170/237] Back out "Refactor Tensor/TensorImpl constructors." Summary: Original commit changeset: 7501b54fe5f3 Reviewed By: gchanan Differential Revision: D9838097 fbshipit-source-id: 093e4c47d5574ce99f706b0683ef369a89b62b38 --- caffe2/core/tensor.h | 76 +++++++++++---------------------------- caffe2/core/tensor_impl.h | 71 +++++++++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 56 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index a167658060e7fc..c17667c006f54c 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -50,75 +50,45 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(Storage storage) - : impl_(c10::make_intrusive(std::move(storage))) {} - - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ + explicit Tensor(DeviceType type) + : impl_(c10::make_intrusive(type)) {} + explicit Tensor(const vector& dims, DeviceType type) - : Tensor(Storage(type)) { - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? 
- Resize(dims); - } + : impl_( + c10::make_intrusive(dims, type)) {} explicit Tensor(const vector& dims, DeviceType type) - : Tensor(Storage(type)) { - Resize(dims); - } + : impl_( + c10::make_intrusive(dims, type)) {} - /** - * context_for_copy is required to have the same DeviceType as src - */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : Tensor(Storage(type)) { - CopyFrom(src, context_for_copy); - } + : impl_(c10::make_intrusive( + *src.impl_, + context_for_copy, + type)) {} - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ Tensor(const Tensor& src, DeviceType type) - : Tensor(Storage(type)) { - CopyFrom(src); - } + : impl_(c10::make_intrusive( + *src.impl_, + type)) {} - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : Tensor(Storage(context->device_type(), TypeMeta::Make())) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); - context->CopyItemsFromCPU( - storage().dtype(), size(), values.data(), mutable_data()); - } + : impl_(c10::make_intrusive( + dims, + values, + context)) {} - /** - * @brief Creates a scalar tensor, and fills its content with the given value. - * The type of tensor will be decided by the context parameter - */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : Tensor(Storage(context->device_type(), TypeMeta::Make())) { - Resize(std::vector{}); - context->CopyItemsFromCPU( - storage().dtype(), size(), &value, mutable_data()); - } + : impl_(c10::make_intrusive( + value, + context)) {} Tensor Clone() const { Tensor x(GetDeviceType()); @@ -306,10 +276,6 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } - - const Storage& storage() { - return impl_->storage(); - } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 2e01c55318d88f..4cdf612a1fb091 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -88,8 +88,77 @@ inline int canonical_axis_index_(int axis_index, int ndims) { class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; + explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? 
+ explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + /* Now we require that context_for_copy has the same device type as src since + * template is removed + */ + TensorImpl( + const TensorImpl& src, + at::BaseContext* context_for_copy, + at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src, context_for_copy); + } + + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ + TensorImpl(const TensorImpl& src, at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src); + } + + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ + template + TensorImpl( + const std::vector& dims, + const std::vector& values, + at::BaseContext* context) + : storage_(context->device_type(), TypeMeta::Make()) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, values.data(), mutable_data()); + } + + /** + * @brief Creates a scalar tensor, and fills its content with the given value. + * The type of tensor will be decided by the context parameter + */ + template < + typename T, + typename = typename std::enable_if::value>::type> + TensorImpl(const T& value, at::BaseContext* context) + : storage_(context->device_type(), TypeMeta::Make()) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, &value, mutable_data()); + } TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; From f4d9fe395d59f140d1a52c6becf89f6ca59f303b Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Fri, 14 Sep 2018 16:54:22 -0700 Subject: [PATCH 171/237] Remove intrusive_ptr::reclaim() in Storage (#11352) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11352 Pushing manual refcounting further back, making things safer. Reviewed By: ezyang Differential Revision: D9694327 fbshipit-source-id: befdbcac199225383a93520472ee7c6511a0e9cd --- aten/src/ATen/templates/TypeDefault.cpp | 2 +- aten/src/TH/THTensor.cpp | 2 +- aten/src/TH/generic/THTensor.cpp | 18 +++++++++++++++--- aten/src/THC/generic/THCTensor.cpp | 24 ++++++++++++++++++------ torch/csrc/utils/python_arg_parser.h | 2 +- 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 5e68470ecc0abd..0891f6d9f4f492 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -120,7 +120,7 @@ Storage TypeDefault::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain && th_pointer) { c10::raw::intrusive_ptr::incref(static_cast(th_pointer)); } - return Storage(static_cast(th_pointer)); + return Storage(c10::intrusive_ptr::reclaim(static_cast(th_pointer))); } diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 2d1200eebd37aa..887b0dcf2dff97 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -202,5 +202,5 @@ void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { // Caffe2 might have tensors whose storages are null, but we // don't allow it in PyTorch. 
AT_ASSERT(storage); - tensor->storage_ = at::Storage(storage); + tensor->storage_ = at::Storage(c10::intrusive_ptr::reclaim(storage)); } diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 5eac1e319347d1..9a7bd0be736f25 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -54,13 +54,21 @@ scalar_t *THTensor_(data)(const THTensor *self) { /* Empty init */ THTensor *THTensor_(new)(void) { - return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), @@ -75,7 +83,11 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 483a1fbec593e7..a7779047863466 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -63,13 +63,21 @@ scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self) /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, THTensor_getStoragePtr(tensor), @@ -85,7 +93,11 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -594,13 +606,13 @@ int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) 
 }
 
     const int tensorDev = THCTensor_(getDevice)(state, tensor);
-
+
     // Skips CPU tensors
     if (tensorDev == -1) { continue; }
 
     // Checks all tensors are on the same device
-    if (tensorDev != curDev) {
-      valid = 0;
+    if (tensorDev != curDev) {
+      valid = 0;
       break;
     }
   }
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index 294819a5a11550..9ff25d2d4e5134 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -434,7 +434,7 @@ inline at::Generator* PythonArgs::generator(int i) {
 }
 
 inline at::Storage PythonArgs::storage(int i) {
-  if (!args[i]) return nullptr;
+  if (!args[i]) return at::Storage();
   return createStorage(args[i]);
 }
 
From 270fb22bd84e915d76fd6316bf4f22a8c7882da6 Mon Sep 17 00:00:00 2001
From: Sebastian Messmer
Date: Fri, 14 Sep 2018 16:54:22 -0700
Subject: [PATCH 172/237] Remove intrusive_ptr::reclaim() in Storage (2/2) (#11547)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11547
Pushing manual refcounting further back, making things safer.

Reviewed By: ezyang
Differential Revision: D9778042
fbshipit-source-id: c9572edc440c5ce5ea1b2355b5c54f87078ea28e
---
 aten/src/ATen/core/Storage.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h
index 23a3327da1cafc..ab201be88d630e 100644
--- a/aten/src/ATen/core/Storage.h
+++ b/aten/src/ATen/core/Storage.h
@@ -7,9 +7,7 @@ namespace at {
 struct AT_API Storage {
  public:
   Storage() {}
-  Storage(StorageImpl* storage_impl) : storage_impl_(c10::intrusive_ptr<StorageImpl>::reclaim(storage_impl)) {}
-  Storage(const c10::intrusive_ptr<StorageImpl>& ptr) : storage_impl_(ptr) {}
-  Storage(c10::intrusive_ptr<StorageImpl>&& ptr) : storage_impl_(std::move(ptr)) {}
+  Storage(c10::intrusive_ptr<StorageImpl> ptr) : storage_impl_(std::move(ptr)) {}
   Storage(
       caffe2::TypeMeta data_type,
       size_t size,

From 690c999bba2da3bb25370fe49181e7d0eb3e4b69 Mon Sep 17 00:00:00 2001
From: Sebastian Messmer
Date: Fri, 14 Sep 2018 16:54:26 -0700
Subject: [PATCH 173/237] Simplify union payload copying (#11353)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11353

Before, there was one extra member in the union that had to be at least as
large as the largest other member, because it was used for copying. Now, this
isn't needed anymore and we copy the union directly.
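As a rough illustration only (an editorial sketch, not code from this patch; the type and member names below are made up), copying the union object itself removes the need for a dedicated "largest member" that exists purely to implement the copy:

```cpp
#include <cstdint>

// Hypothetical tagged-value type; `Payload` mirrors the idea of a union whose
// copy constructor simply copies the union object as a whole.
struct Value {
  union Payload {
    int64_t as_int;
    double as_double;
    void* as_ptr;
  } payload;

  explicit Value(int64_t i) { payload.as_int = i; }
  // Copying the union directly copies whichever member is active; no extra
  // "at least as large as everything else" member is needed for this.
  Value(const Value& rhs) : payload(rhs.payload) {}
};

int main() {
  Value a(42);
  Value b(a);  // direct union copy
  return b.payload.as_int == 42 ? 0 : 1;
}
```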
Reviewed By: ezyang Differential Revision: D9694326 fbshipit-source-id: 42b2f7d51ac5d4ea5ebafea3a598b018e10fed68 --- torch/csrc/jit/ivalue.h | 47 +++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index d32a24b54917a8..ffff496fdd6386 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -69,7 +69,7 @@ using DoubleList = ConstantList; struct TORCH_API IValue final { IValue() - : payload(0) + : payload{0} , tag(Tag::None) , is_intrusive_ptr(false) {} IValue(const IValue& rhs) @@ -77,7 +77,7 @@ struct TORCH_API IValue final { tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(as_intrusive_ptr); + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); } } IValue(IValue&& rhs) noexcept : IValue() { @@ -85,7 +85,7 @@ struct TORCH_API IValue final { } ~IValue() { if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(as_intrusive_ptr); + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); } } IValue & operator=(IValue && rhs) & noexcept { @@ -113,19 +113,19 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. - as_tensor_impl = t.unsafeReleaseTensorImpl(); + payload.as_tensor_impl = t.unsafeReleaseTensorImpl(); } bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); + at::Tensor t(c10::intrusive_ptr::reclaim(payload.as_tensor_impl)); clearToNone(); return t; } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != at::UndefinedTensorImpl::singleton())); - auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); + JIT_ASSERT(is_intrusive_ptr == (payload.as_tensor_impl != at::UndefinedTensorImpl::singleton())); + auto tensor_impl = c10::intrusive_ptr::reclaim(payload.as_tensor_impl); if (is_intrusive_ptr) { c10::raw::intrusive_ptr::incref(tensor_impl.get()); } @@ -147,18 +147,18 @@ struct TORCH_API IValue final { // Double IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { - as_double = d; + payload.as_double = d; } bool isDouble() const { return Tag::Double == tag; } double toDouble() const { JIT_ASSERT(isDouble()); - return as_double; + return payload.as_double; } // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { - as_int = i; + payload.as_int = i; } // allow you to pass literals (3, 4) without ambiguity @@ -171,7 +171,7 @@ struct TORCH_API IValue final { int64_t toInt() const { JIT_ASSERT(isInt()); - return as_int; + return payload.as_int; } // IntList @@ -301,31 +301,28 @@ struct TORCH_API IValue final { template c10::intrusive_ptr moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(as_intrusive_ptr)); + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); clearToNone(); return t; } template c10::intrusive_ptr toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(as_intrusive_ptr)); + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); auto p = r; r.release(); return p; } void clearToNone() { - payload = 0; + payload.as_int = 0; tag = Tag::None; is_intrusive_ptr = false; } union { + int64_t as_int; + double as_double; at::TensorImpl* as_tensor_impl; c10::intrusive_ptr_target* as_intrusive_ptr; - double 
as_double; - int64_t as_int; - // this type should be as big as all the other types because it will - // be used to copy the union's value in certain cases - int64_t payload; - }; + } payload; Tag tag; bool is_intrusive_ptr; }; @@ -389,33 +386,33 @@ DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::IntList), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::vector v) : IValue(IntList::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::string v) : IValue(ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::DoubleList), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::TensorList), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::vector v) : IValue(TensorList::create(std::move(v))) {} From bb6f18c44fbad0926e1eaf963375a8f934bc3c6b Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Fri, 14 Sep 2018 16:54:29 -0700 Subject: [PATCH 174/237] Simplify IValue::toTensor() (#11355) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11355 There is no reason to implement refcounting manually in this case. Given the correct NullType, toIntrusivePtr() and moveToIntrusivePtr() will do the right thing. Reviewed By: ezyang Differential Revision: D9694918 fbshipit-source-id: 8aae4d66aec32ca5f85c438d66339bd80b72b656 --- torch/csrc/jit/ivalue.h | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index ffff496fdd6386..16af0c9b292d5f 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -113,23 +113,16 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. 
- payload.as_tensor_impl = t.unsafeReleaseTensorImpl(); + payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); } bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(c10::intrusive_ptr::reclaim(payload.as_tensor_impl)); - clearToNone(); - return t; + return at::Tensor(moveToIntrusivePtr()); } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - JIT_ASSERT(is_intrusive_ptr == (payload.as_tensor_impl != at::UndefinedTensorImpl::singleton())); - auto tensor_impl = c10::intrusive_ptr::reclaim(payload.as_tensor_impl); - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(tensor_impl.get()); - } - return at::Tensor(std::move(tensor_impl)); + return at::Tensor(toIntrusivePtr()); } // Tuple @@ -299,15 +292,15 @@ struct TORCH_API IValue final { #undef DEFINE_TAG }; - template - c10::intrusive_ptr moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + template> + c10::intrusive_ptr moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); clearToNone(); return t; } - template - c10::intrusive_ptr toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + template> + c10::intrusive_ptr toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); auto p = r; r.release(); return p; @@ -320,7 +313,6 @@ struct TORCH_API IValue final { union { int64_t as_int; double as_double; - at::TensorImpl* as_tensor_impl; c10::intrusive_ptr_target* as_intrusive_ptr; } payload; Tag tag; From f09054f8d09cca58e3ef63568b18977e66ca56a6 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Fri, 14 Sep 2018 17:45:51 -0700 Subject: [PATCH 175/237] Remove deprecate warning for Upsampling (#11568) Summary: Fixes #11452 . Based on the discussion with SsnL and soumith , we want to bring back Upsample as a module instead of introducing a new nn.interpolate module for now. If anyone want to do downsample, they should use `nn.functional.interpolate ` instead. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11568 Differential Revision: D9804359 Pulled By: ailzhang fbshipit-source-id: 2b232d55fc83c2b581bf336f1ee8d1cf1c1159ca --- torch/nn/modules/upsampling.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch/nn/modules/upsampling.py b/torch/nn/modules/upsampling.py index e416326229a142..5b2307470a6060 100644 --- a/torch/nn/modules/upsampling.py +++ b/torch/nn/modules/upsampling.py @@ -50,8 +50,8 @@ class Upsample(Module): 0.3.1. Since then, the default behavior is ``align_corners = False``. See below for concrete examples on how this affects the outputs. - .. warning:: - This class is deprecated in favor of :func:`~nn.functional.interpolate`. + .. note:: + If you want downsampling/general resizing, you should use :func:`~nn.functional.interpolate`. Examples:: @@ -121,7 +121,6 @@ def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=N self.align_corners = align_corners def forward(self, input): - warnings.warn("nn.Upsampling is deprecated. 
Use nn.functional.interpolate instead.") return F.interpolate(input, self.size, self.scale_factor, self.mode, self.align_corners) def extra_repr(self): From eb3c47bdd5136979f3ecb52a7c2f5c241515f2fe Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Sat, 15 Sep 2018 18:06:08 -0700 Subject: [PATCH 176/237] max -> fmaxf in cross_entropy kernel (#11733) Summary: Changing `max` to `fmaxf` in `LabelCrossEntropy` kernel for hip to work correctly. bddppq petrex Pull Request resolved: https://github.com/pytorch/pytorch/pull/11733 Differential Revision: D9846783 Pulled By: bddppq fbshipit-source-id: c1b394d2ba7ee0e819f7bf3b36b53d1962de5522 --- caffe2/operators/cross_entropy_op.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu index 7030e846b714e2..df7a124d29711f 100644 --- a/caffe2/operators/cross_entropy_op.cu +++ b/caffe2/operators/cross_entropy_op.cu @@ -13,7 +13,7 @@ __global__ void LabelCrossEntropyKernel( const float log_threshold, float* Ydata) { CUDA_1D_KERNEL_LOOP(i, N) { CUDA_KERNEL_ASSERT(labeldata[i] >= 0 && labeldata[i] < D); - Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold)); + Ydata[i] = -logf(fmaxf(Xdata[i * D + labeldata[i]], log_threshold)); } } __global__ void LabelCrossEntropyGradientKernel( @@ -21,7 +21,7 @@ __global__ void LabelCrossEntropyGradientKernel( const float* dYdata, const float log_threshold, float* dXdata) { CUDA_1D_KERNEL_LOOP(i, N) { int idx = i * D + labeldata[i]; - dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold); + dXdata[idx] = - dYdata[i] / fmaxf(Xdata[idx], log_threshold); } } } // namespace From b3e726042c1746824bb1cd0152d203a0ec00fd12 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Sat, 15 Sep 2018 18:14:50 -0700 Subject: [PATCH 177/237] Do not use FixedDivisor in ROCM order switch op (#11697) Summary: Fix the recent order_switch_test failure in ROCM CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/11697 Reviewed By: BIT-silence Differential Revision: D9831039 Pulled By: bddppq fbshipit-source-id: 2368fd1ac7b1bab335ff3377071246cfd3392f3f --- caffe2/operators/order_switch_ops.cu | 61 ++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu index c213b7cc848d0d..f63a7d87fa88fe 100644 --- a/caffe2/operators/order_switch_ops.cu +++ b/caffe2/operators/order_switch_ops.cu @@ -8,20 +8,42 @@ namespace caffe2 { template __global__ void NHWC2NCHWCUDAKernel( const int size, +#ifndef __HIPCC__ const FixedDivisor C, const FixedDivisor HxW, +#else + const int C, + const int HxW, +#endif const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, size) { int n; int c; int hxw; + + int c_d; + int hxw_d; +#ifndef __HIPCC__ HxW.DivMod(i, &c, &hxw); C.DivMod(c, &n, &c); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + c = i / HxW; + hxw = i % HxW; + n = c / C; + c = c % C; + + c_d = C; + hxw_d = HxW; +#endif + #if __CUDA_ARCH__ >= 350 - Y[i] = __ldg(X + (n * HxW.d() + hxw) * C.d() + c); + Y[i] = __ldg(X + (n * hxw_d + hxw) * c_d + c); #else - Y[i] = X[(n * HxW.d() + hxw) * C.d() + c]; + Y[i] = X[(n * hxw_d + hxw) * c_d + c]; #endif } } @@ -29,20 +51,41 @@ __global__ void NHWC2NCHWCUDAKernel( template __global__ void NCHW2NHWCCUDAKernel( const int size, +#ifndef __HIPCC__ const FixedDivisor C, const FixedDivisor HxW, +#else + const int C, + const int HxW, +#endif const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, size) { int n; int c; int hxw; + + int c_d; + 
int hxw_d; +#ifndef __HIPCC__ C.DivMod(i, &hxw, &c); HxW.DivMod(hxw, &n, &hxw); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + hxw = i / C; + c = i % C; + n = hxw / HxW; + hxw = hxw % HxW; + + c_d = C; + hxw_d = HxW; +#endif #if __CUDA_ARCH__ >= 350 - Y[i] = __ldg(X + (n * C.d() + c) * HxW.d() + hxw); + Y[i] = __ldg(X + (n * c_d + c) * hxw_d + hxw); #else - Y[i] = X[(n * C.d() + c) * HxW.d() + hxw]; + Y[i] = X[(n * c_d + c) * hxw_d + hxw]; #endif } } @@ -71,8 +114,13 @@ bool NHWC2NCHWOp::RunOnDevice() { 0, context_.cuda_stream()>>>( size, +#ifndef __HIPCC__ FixedDivisor(C), FixedDivisor(HxW), +#else + C, + HxW, +#endif X.data(), Y->template mutable_data()); return true; @@ -102,8 +150,13 @@ bool NCHW2NHWCOp::RunOnDevice() { 0, context_.cuda_stream()>>>( size, +#ifndef __HIPCC__ FixedDivisor(C), FixedDivisor(HxW), +#else + C, + HxW, +#endif X.data(), Y->template mutable_data()); return true; From ca6f08f359d7dfc2da536baf3001433dff2c8f5b Mon Sep 17 00:00:00 2001 From: Jiyan Yang Date: Sat, 15 Sep 2018 23:31:42 -0700 Subject: [PATCH 178/237] Set correct dtype for fp16 op inference function (#11693) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11693 as desc. Reviewed By: hyuen Differential Revision: D9829061 fbshipit-source-id: 0f4c8a9d2b95d4cf5fa20a2aefd5671f273a8e76 --- caffe2/operators/half_float_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/operators/half_float_ops.h b/caffe2/operators/half_float_ops.h index e6698a0b7283d1..b8d5dacf69472b 100644 --- a/caffe2/operators/half_float_ops.h +++ b/caffe2/operators/half_float_ops.h @@ -76,7 +76,7 @@ inline std::vector Float16FillerTensorInference( vector out(1); ArgumentHelper helper(def); out[0].set_data_type(static_cast( - helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT))); + helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT16))); auto shape = helper.GetRepeatedArgument("shape"); for (int d : shape) { out[0].add_dims(d); From 10c29c8970a1468cd157d371a0d0b9879cb703a9 Mon Sep 17 00:00:00 2001 From: peter Date: Sun, 16 Sep 2018 07:55:15 -0700 Subject: [PATCH 179/237] Fix CUDA 8 build on Windows (#11729) Summary: Tested via https://github.com/pytorch/pytorch/pull/11374. Upstream PR: https://gitlab.kitware.com/cmake/cmake/merge_requests/2391 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11729 Differential Revision: D9847807 Pulled By: orionr fbshipit-source-id: 69af3e6c5bba0abcbc8830495e867a0b1b399c22 --- README.md | 5 ++--- cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4b996cc1b440a6..b23bc60aa19de6 100644 --- a/README.md +++ b/README.md @@ -201,9 +201,8 @@ set DISTUTILS_USE_SDK=1 REM The following two lines are needed for Python 2.7, but the support for it is very experimental. set MSSdk=1 set FORCE_PY27_BUILD=1 -REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following two lines. -set "PREBUILD_COMMAND=%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" -set PREBUILD_COMMAND_ARGS=x64 +REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following line. 
+set "CUDA_HOST_COMPILER=%VS140COMNTOOLS%\..\..\VC\bin\amd64\cl.exe" call "%VS150COMNTOOLS%\vcvarsall.bat" x64 -vcvars_ver=14.11 python setup.py install diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake index edc9b3ab3fda74..b2ca36a9677771 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake @@ -531,7 +531,9 @@ option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON) # Extra user settable flags cmake_initialize_per_config_variable(CUDA_NVCC_FLAGS "Semi-colon delimit multiple arguments.") -if(CMAKE_GENERATOR MATCHES "Visual Studio") +if(DEFINED ENV{CUDA_HOST_COMPILER}) + set(CUDA_HOST_COMPILER "$ENV{CUDA_HOST_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC") +elseif(CMAKE_GENERATOR MATCHES "Visual Studio") set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)Tools/MSVC/$(VCToolsVersion)/bin/Host$(Platform)/$(PlatformTarget)") if(MSVC_VERSION LESS 1910) set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)bin") From 6f6b03566ba3c4828f6ee87a772f9d161be0bae7 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Sun, 16 Sep 2018 20:30:14 -0700 Subject: [PATCH 180/237] Vectorize grid sample 2d CPU kernels (#10980) Summary: This PR vectorizes the CPU grid sample 2d forward and backward kernels. Specifically, 1. add `.data()` in `TensorAccessor` 2. support non-void return value for declaring CPU kernel stub 2. add `bool at:: geometry_is_contiguous(IntList sizes, IntList strides)` 1. The following vectorized CPU primitives are added: + `gather(baseaddr, vindex)`: `result[i] = baseaddr[vindex[i] * scale]` + `mask_gather(src, baseaddr, vindex, mask)`: `result[i] = mask[i] ? baseaddr[vindex[i] * scale] : src[i]`. + comparison ops + binary logical ops + `min(a, b)` + `cast(src_vec)`: changing dtype but keeping the bit representation + `blendv(a, b, mask)`: `result[i] = mask[i] ? b[i] : a[i]`. + ctor with multiple values (i.e., `setr`) + `arange(start = 0, step = 1)`: constructs a vector with values specified by the arange parameters + `convert_to_int_of_same_size(vec)`: convert floating point vector to corresponding integral type of same size + `interleave2(a, b)` & `deinterleave2(x, y)`: interleave or deinterleaves two vectors. E.g., for `interleave`: ``` inputs: {a0, a1, a2, a3, a4, a5, a6, a7} {b0, b1, b2, b3, b4, b5, b6, b7} outputs: {a0, b0, a1, b1, a2, b2, a3, b3} {a4, b4, a5, b5, a6, b6, a7, b7} ``` 2. Grid sample CPU kernel implementations are described in the following note (also in `GridSampleKernel.cpp`: ``` NOTE [ Grid Sample CPU Kernels ] Implementation of vectorized grid sample CPU kernels is divided into three parts: 1. `ComputeLocation` struct Transforms grid values into interpolation locations of the input tensor for a particular spatial dimension, basing on the size of that dimension in input tensor, and the padding mode. ``` ```cpp template struct ComputeLocation { using Vec = Vec256; // ctor ComputeLocation(int64_t size); // Given grid values `in`, return the interpolation locations after // un-normalization and padding mechanism (elementwise). Vec apply(const Vec &in) const; // Similar to `apply`, but also returns `d apply(in) / d in` // (elementwise). // this is often used in gradient computation. std::pair apply_get_grad(const Vec &in) const; }; ``` ``` 2. `ApplyGridSample` struct Owns N `ComputeLocation` structs, where N is the number of spatial dimensions. 
Given N input grid vectors (one for each spatial dimension) and spatial offset, it gets the interpolation locations from `ComputeLocation`s, applies interpolation procedure, and then writes to the output (or grad_input & grad_grid in backward). ``` ```cpp template struct ApplyGridSample { // ctor ApplyGridSample(const TensorAccessor& input); // Applies grid sampling (forward) procedure: // 1. computes interpolation locations from grid values `grid_x` and // `grid_y`, // 2. interpolates output values using the locations and input data // in `inp_slice`, and // 3. writes the first `len` values in the interpolated vector to // `out_slice` with spatial offset being `offset`. // // This assimes that `grid_x` and `grid_y` all contain valid grid // values \in [-1, 1], even at indices greater than `len`. // // The `*_slice` argument namess mean samples within a batch (i.e., // with the batch dimension sliced out). void forward(TensorAccessor& out_slice, const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const; // Applies grid sampling (backward) procedure. Arguments semantics // and strategy are similar to those of `forward`. void backward(TensorAccessor& gInp_slice, TensorAccessor& gGrid_slice, const TensorAccessor& gOut_slice, const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const; } ``` ``` 3. `grid_sample_2d_grid_slice_iterator` function Among the tensors we work with, we know that the output tensors are contiguous (i.e., `output` in forward, and `grad_input` & `grad_grid` in backward), we need to randomly read `input` anyways, and `grad_output` usually comes from autograd and is often contiguous. So we base our iterating strategy on the geometry of grid. `grid_sample_2d_grid_slice_iterator` function provides an abstract to efficiently iterates through a `grid` slice (without batch dimension). See comments of that function on the specific cases and strategies used. ``` ```cpp template void grid_sample_2d_grid_slice_iterator( const TensorAccessor& grid_slice, const ApplyFn &apply_fn); // `apply_fn` is a function/lambda that can be called as if it has // declaration: // void apply_fn(const Vec256& grid_x, // const Vec256& grid_y, // int64_t spatial_offset, int64_t len); ``` ``` `apply_fn` will be called multiple times, and together cover the entire output spatial space. 
Therefore, e.g., to implement forward 2d grid sample, we can do ``` ```cpp ApplyGridSample grid_sample(input_accessor); for (int n = 0; n < input_accessor.size(0); n++) { grid_sample_2d_grid_slice_iterator( grid_accessor[n], [&](const Vec256& grid_x, const Vec256& grid_y, int64_t spatial_offset, int64_t len) { grid_sample.forward(out_accessor[n], input_accessor[n], spatial_offset, grid_x, grid_y, len); }); } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/10980 Differential Revision: D9564867 Pulled By: SsnL fbshipit-source-id: 5b7c3c7ea63af00eec230ae9ee1c3e6c6c9679b4 --- aten/src/ATen/Parallel.h | 2 +- aten/src/ATen/TensorGeometry.cpp | 11 +- aten/src/ATen/TensorUtils.cpp | 20 + aten/src/ATen/TensorUtils.h | 7 + aten/src/ATen/core/TensorAccessor.h | 29 +- aten/src/ATen/cpu/vec256/intrinsics.h | 2 +- aten/src/ATen/cpu/vec256/vec256.h | 190 ++++ aten/src/ATen/cpu/vec256/vec256_base.h | 228 ++++- aten/src/ATen/cpu/vec256/vec256_double.h | 60 +- aten/src/ATen/cpu/vec256/vec256_float.h | 65 +- aten/src/ATen/cpu/vec256/vec256_int.h | 164 +++- aten/src/ATen/native/DispatchStub.h | 23 +- aten/src/ATen/native/GridSampler.cpp | 301 +----- aten/src/ATen/native/GridSampler.h | 2 + .../src/ATen/native/cpu/GridSamplerKernel.cpp | 890 ++++++++++++++++++ aten/src/ATen/native/cpu/GridSamplerKernel.h | 18 + test/test_nn.py | 80 +- 17 files changed, 1721 insertions(+), 371 deletions(-) create mode 100644 aten/src/ATen/native/cpu/GridSamplerKernel.cpp create mode 100644 aten/src/ATen/native/cpu/GridSamplerKernel.h diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 6aadd62eb1d3fd..a4c8b50abe8263 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -27,7 +27,7 @@ inline void parallel_for( const int64_t grain_size, const F& f) { #ifdef _OPENMP -#pragma omp parallel if ((end - begin) >= grain_size) +#pragma omp parallel if (!omp_in_parallel() && ((end - begin) >= grain_size)) { int64_t num_threads = omp_get_num_threads(); int64_t tid = omp_get_thread_num(); diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index ca3fcd961feda0..b11c7bb159900b 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -8,15 +9,7 @@ bool TensorGeometry::is_contiguous() const { if (numel_ == 0) { return true; } - int64_t dim = sizes_.size(); - int64_t expected_stride = 1; - for (int64_t i = dim - 1; i >= 0; i--) { - if (sizes_[i] != 1 && strides_[i] != expected_stride) { - return false; - } - expected_stride *= sizes_[i]; - } - return true; + return at::geometry_is_contiguous(sizes_, strides_); } Tensor TensorGeometry::zeros_with_stride(const Type& type) const { diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 454ab9e91fd29e..4ec8e374c2e515 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -215,4 +215,24 @@ void * maybe_data_ptr(const Tensor& tensor) { void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; } + +// See TensorUtils.h on why this is useful now that we cache is_contiguous. 
+bool geometry_is_contiguous(IntList sizes, IntList strides) { + int64_t dim = sizes.size(); + int64_t expected_stride = 1; + bool contig_if_nonempty = true; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes[i] == 0) { + return true; + } + if (contig_if_nonempty) { + if (sizes[i] != 1 && strides[i] != expected_stride) { + contig_if_nonempty = false; + } + expected_stride *= sizes[i]; + } + } + return contig_if_nonempty; +} + } diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index cc7453f77375f6..2443bde4b482cb 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -78,4 +78,11 @@ AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend back AT_API void * maybe_data_ptr(const Tensor& tensor); AT_API void * maybe_data_ptr(const TensorArg& tensor); +// Return if the tensor geometry represented by `sizes` and `strides` is contiguous +// Although we cache is_contiguous in tensor now, this is till useful because it +// allows checking if a particular geometry is contiguous without explicitly +// constructing a tensor, e.g., when you want to choose a kernel strategy based +// on whether a subgeometry is contiguous. +AT_API bool geometry_is_contiguous(IntList sizes, IntList strides); + } diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index cbd130ed775d09..f686f0797b079d 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -44,14 +44,16 @@ class TensorAccessorBase { AT_HOSTDEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} - AT_HOST IntList sizes() { + AT_HOST IntList sizes() const { return IntList(sizes_,N); } - AT_HOST IntList strides() { + AT_HOST IntList strides() const { return IntList(strides_,N); } - AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } + AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOSTDEVICE T *data() { return data_; } + AT_HOSTDEVICE const T *data() const { return data_; } protected: PtrType data_; const int64_t* sizes_; @@ -73,6 +75,10 @@ class TensorAccessor : public TensorAccessorBase { AT_HOSTDEVICE TensorAccessor operator[](int64_t i) { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } + + AT_HOSTDEVICE const TensorAccessor operator[](int64_t i) const { + return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); + } }; template class PtrTraits> @@ -89,7 +95,7 @@ class TensorAccessor : public TensorAccessorBase { // PackedTensorAccessorBase and PackedTensorAccessor are used on for CUDA `Tensor`s on the host -// and as +// and as // In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) // in order to transfer them on the device when calling kernels. // On the device, indexing of multidimensional tensors gives to `TensorAccessor`s. 
@@ -106,8 +112,8 @@ class PackedTensorAccessorBase { std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); std::copy(strides_, strides_ + N, std::begin(this->strides_)); } - AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } + AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } protected: PtrType data_; int64_t sizes_[N]; @@ -127,6 +133,12 @@ class PackedTensorAccessor : public PackedTensorAccessorBase { int64_t* new_strides = this->strides_+1; return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); } + + AT_DEVICE const TensorAccessor operator[](int64_t i) const { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } }; template class PtrTraits> @@ -139,6 +151,9 @@ class PackedTensorAccessor : public PackedTensorAccessorBasedata_[this->strides_[0]*i]; } + AT_DEVICE const T& operator[](int64_t i) const { + return this->data_[this->strides_[0]*i]; + } }; } diff --git a/aten/src/ATen/cpu/vec256/intrinsics.h b/aten/src/ATen/cpu/vec256/intrinsics.h index 442e8fd0511fc7..76779aada7a0b3 100644 --- a/aten/src/ATen/cpu/vec256/intrinsics.h +++ b/aten/src/ATen/cpu/vec256/intrinsics.h @@ -19,7 +19,7 @@ /* GCC-compatible compiler, targeting ARM with WMMX */ #include #elif (defined(__GNUC__) || defined(__xlC__)) && \ - (defined(__VEC__) || defined(__ALTIVEC__)) + (defined(__VEC__) || defined(__ALTIVEC__)) /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ #include #elif defined(__GNUC__) && defined(__SPE__) diff --git a/aten/src/ATen/cpu/vec256/vec256.h b/aten/src/ATen/cpu/vec256/vec256.h index 98f1158465f2f7..71688bd48e5090 100644 --- a/aten/src/ATen/cpu/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec256/vec256.h @@ -32,4 +32,194 @@ std::ostream& operator<<(std::ostream& stream, const Vec256& vec) { return stream; } + +#if defined(__AVX__) && !defined(_MSC_VER) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castpd_ps(src); +} + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castps_pd(src); +} + +#if defined(__AVX2__) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#define DEFINE_FLOAT_INT_CAST(int_t, float_t, float_ch) \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castp ## float_ch ## _si256(src); \ +} \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castsi256_p ## float_ch (src); \ +} + +DEFINE_FLOAT_INT_CAST(int64_t, double, d) +DEFINE_FLOAT_INT_CAST(int32_t, double, d) +DEFINE_FLOAT_INT_CAST(int16_t, double, d) +DEFINE_FLOAT_INT_CAST(int64_t, float, s) +DEFINE_FLOAT_INT_CAST(int32_t, float, s) +DEFINE_FLOAT_INT_CAST(int16_t, float, s) + +#undef DEFINE_FLOAT_INT_CAST + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline gather(const double* base_addr, const Vec256& vindex) { + return _mm256_i64gather_pd(base_addr, vindex, scale); +} + +template +c10::guts::enable_if_t> +inline gather(const float* base_addr, const Vec256& vindex) { + return _mm256_i32gather_ps(base_addr, vindex, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline 
mask_gather(const Vec256& src, const double* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, const float* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) + ); +} + +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + return _mm256_cvttps_epi32(src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a3, a3} + // b = {b0, b1, b2, b3} + + // swap lanes: + // a_swapped = {a0, a1, b0, b1} + // b_swapped = {a2, a3, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + return std::make_pair(_mm256_permute4x64_pd(a_swapped, group_ctrl), + _mm256_permute4x64_pd(b_swapped, group_ctrl)); +} + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + + // swap lanes: + // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl), + _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + + // group cols crossing lanes: + // a_grouped = {a0, a1, b0, b1} + // b_grouped = {a2, a3, b2, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + auto a_grouped = _mm256_permute4x64_pd(a, group_ctrl); + auto b_grouped = _mm256_permute4x64_pd(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 
4 bits apart + return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_b)); +} + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + + // group cols crossing lanes: + // a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl); + auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_b)); +} + +#endif // defined(__AVX2__) + +#endif // defined(__AVX__) && !defined(_MSC_VER) + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index fa3c4e550d6a32..f1eba7e2d3c428 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -3,8 +3,11 @@ #include #include #include +#include +#include #include "ATen/Utils.h" +#include "ATen/core/C++17.h" #if defined(__GNUC__) #define __at_align32__ __attribute__((aligned(32))) @@ -18,6 +21,21 @@ namespace at { namespace vec256 { namespace { +template struct int_of_size; + +#define DEFINE_INT_OF_SIZE(int_t) \ +template<> struct int_of_size { using type = int_t; } + +DEFINE_INT_OF_SIZE(int64_t); +DEFINE_INT_OF_SIZE(int32_t); +DEFINE_INT_OF_SIZE(int16_t); +DEFINE_INT_OF_SIZE(int8_t); + +#undef DEFINE_INT_OF_SIZE + +template +using int_same_size_t = typename int_of_size::type; + // NOTE: If you specialize on a type, you must define all operations! // emulates vectorized types @@ -33,8 +51,13 @@ struct Vec256 { values[i] = val; } } + template> + Vec256(Args... vals) { + values = { vals... 
}; + } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { int64_t mask = mask_; Vec256 vec; for (int64_t i = 0; i < size; i++) { @@ -47,7 +70,29 @@ struct Vec256 { } return vec; } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + Vec256 vec; + int_same_size_t buffer[size]; + mask.store(buffer); + for (int64_t i = 0; i < size; i++) { + if (buffer[i] & 0x01) + { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + } + return vec; + } + static Vec256 arange(T base = static_cast(0), T step = static_cast(1)) { + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + vec.values[i] = base + i * step; + } + return vec; + } + static Vec256 set(const Vec256& a, const Vec256& b, int64_t count = size) { Vec256 vec; for (int64_t i = 0; i < size; i++) { if (i < count) { @@ -173,9 +218,28 @@ struct Vec256 { } return ret; } +#define DEFINE_COMP(binary_pred) \ + Vec256 operator binary_pred(const Vec256 &other) const { \ + Vec256 vec; \ + for (int64_t i = 0; i != size; i++) { \ + if (values[i] binary_pred other.values[i]) { \ + std::memset(static_cast(vec.values + i), 0xFF, sizeof(T)); \ + } else { \ + std::memset(static_cast(vec.values + i), 0, sizeof(T)); \ + } \ + } \ + return vec; \ + } + DEFINE_COMP(==) + DEFINE_COMP(!=) + DEFINE_COMP(>=) + DEFINE_COMP(<=) + DEFINE_COMP(>) + DEFINE_COMP(<) +#undef DEFINE_COMP }; -template Vec256 operator+(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator+(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] + b[i]; @@ -183,7 +247,7 @@ template Vec256 operator+(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator-(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator-(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] - b[i]; @@ -191,7 +255,7 @@ template Vec256 operator-(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator*(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator*(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] * b[i]; @@ -199,7 +263,7 @@ template Vec256 operator*(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { +template Vec256 inline operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] / b[i]; @@ -207,7 +271,8 @@ template Vec256 operator/(const Vec256 &a, const Vec256 &b) _ return c; } -template Vec256 max(const Vec256 &a, const Vec256 &b) { + +template Vec256 inline max(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::max(a[i], b[i]); @@ -215,7 +280,7 @@ template Vec256 max(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 min(const Vec256 &a, const Vec256 &b) { +template Vec256 inline min(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::min(a[i], b[i]); @@ -223,9 +288,154 @@ template Vec256 min(const Vec256 &a, const Vec256 &b) { return c; } +#define DEFINE_BITWISE_OP(op) \ +template \ +Vec256 inline operator op(const Vec256 &a, const Vec256 &b) { \ + using iT = int_same_size_t; \ + iT 
buffer[Vec256::size]; \ + for (int64_t i = 0; i != Vec256::size; i++) { \ + auto a_val = a[i]; \ + auto b_val = b[i]; \ + iT *i_a_ptr = reinterpret_cast(&a_val); \ + iT *i_b_ptr = reinterpret_cast(&b_val); \ + buffer[i] = *i_a_ptr op *i_b_ptr; \ + } \ + return Vec256::loadu(buffer); \ +} +DEFINE_BITWISE_OP(&) +DEFINE_BITWISE_OP(|) +DEFINE_BITWISE_OP(^) +#undef DEFINE_BITWISE_OP + template -T fmadd(const T& a, const T& b, const T& c) { +inline T fmadd(const T& a, const T& b, const T& c) { return a * b + c; } +template +c10::guts::enable_if_t> +inline gather(T const* base_addr, const Vec256>& vindex) { + static constexpr int size = Vec256::size; + int_same_size_t index_arr[size]; + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } + return Vec256::loadu(static_cast(buffer)); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, T const* base_addr, + const Vec256>& vindex, Vec256& mask) { + static constexpr int size = Vec256::size; + T src_arr[size]; + int_same_size_t mask_arr[size]; // use int type so we can logical and + int_same_size_t index_arr[size]; + src.store(static_cast(src_arr)); + mask.store(static_cast(mask_arr)); + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + if (mask_arr[i] & 0x01) { // check highest bit + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } else { + buffer[i] = src_arr[i]; + } + } + mask = Vec256(); // "zero out" mask + return Vec256::loadu(static_cast(buffer)); +} + +// Cast a given vector to another type without changing the bits representation. +// So a Vec of 256 bits containing all ones can be cast to a +// Vec of 256 bits containing all ones (i.e., four negative 1s). +namespace { + // There is a struct here because we don't have static_if and I can't + // partially specialize a templated function. 
+ template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + src_t src_arr[Vec256::size]; + src.store(static_cast(src_arr)); + return Vec256::loadu(static_cast(src_arr)); + } + }; + + template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + return src; + } + }; +} +template +Vec256 cast(const Vec256& src) { + return CastImpl::apply(src); +} + +template +inline Vec256> convert_to_int_of_same_size(const Vec256& src) { + static constexpr int size = Vec256::size; + T src_arr[size]; + src.store(static_cast(src_arr)); + int_same_size_t buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = static_cast>(src_arr[i]); + } + return Vec256>::loadu(static_cast(buffer)); +} + +// E.g., inputs: a Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// b Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +// returns: Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +deinterleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i] = a_arr[i * 2]; + buffer1[half_size + i] = b_arr[i * 2]; + buffer2[i] = a_arr[i * 2 + 1]; + buffer2[half_size + i] = b_arr[i * 2 + 1]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + +// inverse operation of deinterleave2 +// E.g., inputs: a Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// b Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +// returns: Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +interleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i * 2] = a_arr[i]; + buffer1[i * 2 + 1] = b_arr[i]; + buffer2[i * 2] = a_arr[half_size + i]; + buffer2[i * 2 + 1] = b_arr[half_size + i]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index 05c21634659754..ced6fa6a37b6aa 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -22,14 +22,25 @@ template <> class Vec256 { Vec256(double val) { values = _mm256_set1_pd(val); } + Vec256(double val1, double val2, double val3, double val4) { + values = _mm256_setr_pd(val1, val2, val3, val4); + } operator __m256d() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_pd(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_pd(a.values, b.values, mask.values); + } + static Vec256 arange(double base = 0., double step = 1.) 
{ + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -56,7 +67,7 @@ template <> class Vec256 { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_pd(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { double tmp_values[size]; _mm256_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(double)); @@ -154,6 +165,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powd4_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -186,9 +223,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_pd(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_pd(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_pd(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_pd(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_pd(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index c38fae11c24863..ebd0c10d2b62d1 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -16,20 +16,34 @@ template <> class Vec256 { private: __m256 values; public: - static constexpr int64_t size = 8; + static constexpr int size = 8; Vec256() {} Vec256(__m256 v) : values(v) {} Vec256(float val) { values = _mm256_set1_ps(val); } + Vec256(float val1, float val2, float val3, float val4, + float val5, float val6, float val7, float val8) { + values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); + } operator __m256() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_ps(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_ps(a.values, b.values, mask.values); + } + static Vec256 arange(float base = 0.f, float step = 1.f) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -61,7 +75,7 @@ template <> 
class Vec256 { void store(void* ptr, int64_t count = size) const { if (count == size) { _mm256_storeu_ps(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { float tmp_values[size]; _mm256_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(float)); @@ -159,6 +173,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powf8_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -191,9 +231,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_ps(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_ps(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_ps(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_ps(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_ps(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h index c9b643e7d4bb09..2ca4d614c21e7b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -26,6 +26,9 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int64_t v) { values = _mm256_set1_epi64x(v); } + Vec256(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { + values = _mm256_setr_epi64x(val1, val2, val3, val4); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int64_t tmp_values[size]; @@ -40,6 +43,13 @@ struct Vec256 : public Vec256i { tmp_values[3] = _mm256_extract_epi64(b.values, 3); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int64_t base = 0, int64_t step = 1) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { switch (count) { @@ -65,7 +75,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int64_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); @@ -79,6 +89,30 @@ struct Vec256 : public Vec256i { auto inverse = _mm256_xor_si256(values, is_larger); return _mm256_sub_epi64(inverse, is_larger); } + Vec256 operator==(const Vec256& other) 
const { + return _mm256_cmpeq_epi64(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi64(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi64(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi64(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi64(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi64(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -87,10 +121,23 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int32_t v) { values = _mm256_set1_epi32(v); } + Vec256(int32_t val1, int32_t val2, int32_t val3, int32_t val4, + int32_t val5, int32_t val6, int32_t val7, int32_t val8) { + values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); + } template static Vec256 blend(Vec256 a, Vec256 b) { return _mm256_blend_epi32(a, b, mask); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int32_t base = 0, int32_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } static Vec256 set(Vec256 a, Vec256 b, int32_t count = size) { switch (count) { @@ -124,7 +171,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int32_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); @@ -135,6 +182,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi32(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi32(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi32(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi32(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi32(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi32(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi32(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -143,6 +214,13 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int16_t v) { values = _mm256_set1_epi16(v); } + Vec256(int16_t val1, int16_t val2, int16_t val3, int16_t val4, + int16_t val5, int16_t val6, int16_t val7, int16_t val8, + int16_t val9, int16_t val10, int16_t val11, int16_t val12, + int16_t val13, int16_t val14, int16_t val15, int16_t val16) { + values = _mm256_setr_epi16(val1, 
val2, val3, val4, val5, val6, val7, val8, + val9, val10, val11, val12, val13, val14, val15, val16); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int16_t tmp_values[size]; @@ -181,6 +259,17 @@ struct Vec256 : public Vec256i { tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int16_t base = 0, int16_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step, + base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step, + base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step); + } static Vec256 set(Vec256 a, Vec256 b, int16_t count = size) { switch (count) { @@ -230,7 +319,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int16_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); @@ -241,6 +330,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi16(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi16(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi16(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi16(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi16(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi16(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi16(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -258,6 +371,21 @@ Vec256 inline operator+(const Vec256& a, const Vec256 return _mm256_add_epi16(a, b); } +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi64(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi32(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi16(a, b); +} + // AVX2 has no intrinsic for int64_t multiply so it needs to be emulated // This could be implemented more efficiently using epi32 instructions // This is also technically avx compatible, but then we'll need AVX @@ -293,7 +421,7 @@ Vec256 inline operator*(const Vec256& a, const Vec256 } template -Vec256 intdiv_256(const Vec256& a, const Vec256& b) { +Vec256 inline intdiv_256(const Vec256& a, const Vec256& b) { T values_a[Vec256::size]; T values_b[Vec256::size]; a.store(values_a); @@ -304,20 +432,26 @@ Vec256 intdiv_256(const Vec256& a, const Vec256& b) { return Vec256::loadu(values_a); } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); +#define DEFINE_INTEGER_BINARY_OP(op, func) \ +template <> \ +Vec256 inline operator op(const Vec256& a, 
const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +DEFINE_INTEGER_BINARY_OP(/, intdiv_256) +DEFINE_INTEGER_BINARY_OP(&, _mm256_and_si256) +DEFINE_INTEGER_BINARY_OP(|, _mm256_or_si256) +DEFINE_INTEGER_BINARY_OP(^, _mm256_xor_si256) -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +#undef DEFINE_INTEGER_BINARY_OP #endif diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index dad05dcf8b47a8..42ef6a4f6bb5f1 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -22,7 +22,10 @@ // DEFINE_DISPATCH(stub); // // In native/cpu/MyKernel.cpp: -// void kernel(const Tensor& x) { ... } +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } // REGISTER_DISPATCH(stub, &kernel); // // To call: @@ -46,19 +49,22 @@ enum class CPUCapability { CPUCapability get_cpu_capability(); template -struct AT_API DispatchStub { - static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); +struct AT_API DispatchStub; + +template +struct AT_API DispatchStub { + using FnPtr = rT (*) (Args...); template - void operator()(DeviceType device_type, ArgTypes&&... args) { + rT operator()(DeviceType device_type, ArgTypes&&... args) { if (device_type == DeviceType::CPU) { if (!cpu_dispatch_ptr) { cpu_dispatch_ptr = choose_cpu_impl(); } - (*cpu_dispatch_ptr)(std::forward(args)...); + return (*cpu_dispatch_ptr)(std::forward(args)...); } else if (device_type == DeviceType::CUDA) { AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(std::forward(args)...); + return (*cuda_dispatch_ptr)(std::forward(args)...); } else { AT_ERROR("DispatchStub: unsupported device type", device_type); } @@ -103,6 +109,11 @@ struct RegisterDispatch { }; } // anonymous namespace +// Compiler will complain if you put things like std::tuple in +// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., +// adding parentheses and using helper struct to get rid of the parentheses, do +// not work with MSVC. So do a `using`-declaration if you need to pass in such +// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. 
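// NOTE (editor's sketch, not part of this patch): the `using`-declaration
// workaround described above looks roughly like the following. The alias name
// `backward_2d_fn` is hypothetical; the pointer signature is inferred from the
// GridSampler.cpp call sites in this patch, with the leading DeviceType
// argument omitted because DispatchStub::operator() consumes it.
//
//   using backward_2d_fn = std::tuple<Tensor, Tensor> (*)(
//       const Tensor& grad_output, const Tensor& input, const Tensor& grid,
//       int64_t interpolation_mode, int64_t padding_mode);
//   DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel);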
#define DECLARE_DISPATCH(fn, name) \ struct name : DispatchStub {}; \ extern AT_API struct name name diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 1547ab2c934053..4d09307bd640e0 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -3,8 +3,9 @@ #include "ATen/Device.h" #include "ATen/Error.h" #include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" #include "ATen/native/GridSampler.h" +#include "ATen/native/cpu/GridSamplerKernel.h" +#include "ATen/cpu/vml.h" #ifdef _OPENMP #include @@ -16,6 +17,7 @@ using at::native::detail::GridSamplerInterpolation; using at::native::detail::GridSamplerPadding; namespace { + template static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); @@ -117,121 +119,10 @@ namespace { } } - template - Tensor grid_sampler_2d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - auto output = at::empty({N, C, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sH = output.stride(2); - int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates(ix, inp_W); - iy = clip_coordinates(iy, inp_H); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates(ix, inp_W); - iy = reflect_coordinates(iy, inp_H); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - scalar_t 
*out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - *out_ptr_NCHW = static_cast(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } - } - } - return output; - } - template Tensor grid_sampler_3d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { int64_t N = input.size(0); int64_t C = input.size(1); int64_t inp_D = input.size(2); @@ -395,167 +286,12 @@ namespace { return output; } - template - std::tuple - grid_sampler_2d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - // If interpolation mode is Nearest, then grad_grid is not filled in the - // loop below. 
- if (interpolation_mode == GridSamplerInterpolation::Nearest) { - grad_grid.zero_(); - } - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sH = grad_output.stride(2); - int64_t gOut_sW = grad_output.stride(3); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sH = grad_input.stride(2); - int64_t gInp_sW = grad_input.stride(3); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // multipliers for gradients on ix and iy - // E.g., 0 for out-of-bound indices when GridSamplerPadding::Border - scalar_t gix_mult, giy_mult; - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = clip_coordinates_set_grad(iy, inp_H, &giy_mult); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = reflect_coordinates_set_grad(iy, inp_H, &giy_mult); - } else { // padding_mode == GridSamplerPadding::Zeros - gix_mult = static_cast(1); - giy_mult = static_cast(1); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += 
gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest, gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW); - } - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - template std::tuple grid_sampler_3d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { auto grad_input = at::zeros_like(input); auto grad_grid = at::empty_like(grid); // If interpolation mode is Nearest, then grad_grid is not filled in the @@ -783,18 +519,18 @@ namespace { } return std::make_tuple(grad_input, grad_grid); } -} + +} // namespace // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { - return grid_sampler_2d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_cpu_kernel(kCPU, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel); + + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { @@ -809,14 +545,11 @@ Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, std::tuple grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { - return grid_sampler_2d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_backward_cpu_kernel(kCPU, grad_output, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel); + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. std::tuple grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index f39b4e996469fa..ac9c72002c66cc 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -1,3 +1,5 @@ +#pragma once + #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp new file mode 100644 index 00000000000000..648defd192e117 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -0,0 +1,890 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include + +namespace at { namespace native { namespace { + +/** NOTE [ Grid Sample CPU Kernels ] + * + * Implementation of vectorized grid sample CPU kernels is divided into three + * parts. More detailed description exist after this paragraph, but on a high + * level, they are + * 1. `ComputeLocation` struct + * + Computes the interpolation location basing on padding mode. + * 2. `ApplyGridSample` struct + * + Owns N (# spatial dims) `ComputeLocation` structs, and uses them to + * compute the interpolation locations. + * + Interpolates the values and writes to output. + * 3. `grid_sample_2d_grid_slice_iterator` function + * + Iterates over a slice of the grid tensor based on the geometry by the + * spatial ordering, i.e., the first iteration will process grid values + * grid[n, 0, 0, :], grid[n, 0, 1, :], grid[n, 0, 2, :], ... + * (Recall that, e.g., 2D grid has shape [N x H x W x 2], so grid[n, ...] + * is a slice, and grid[n, h, w, :] contains the values for a single + * output spatial location.) + * + Applies a given operator at each iteration, so we can use the same + * pattern for forward and backward. + * + * Putting everything together, we have, e.g., the forward kernel implemented + * as + * + * // `ApplyGridSample` struct that processes grid values, extracts and + * // interpolates input values, and write to output. + * ApplyGridSample grid_sample(input_accessor); + * + * // For each slice, we call `grid_sample_2d_grid_slice_iterator` with + * // 1. the grid slice, and + * // 2. a lambda that takes in + * // i. location vectors (x and y for 2D) extracted from grid + * // ii. `spatial_offset` as the spatial offset of these vectors + * // from the beginning of this slice. + * // iii. `len` as the number of valid locations in the vectors. + * // (There might not be enough near boundary.) 
+ * for (int n = 0; n < input_accessor.size(0); n++) { + * grid_sample_2d_grid_slice_iterator( + * grid_accessor[n], + * [&](const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len) { + * grid_sample.forward(out_accessor[n], input_accessor[n], + * spatial_offset, grid_x, grid_y, len); + * }); + * } + * + * Now we talk about details of each of these three parts: + * + * 1. `ComputeLocation` struct + * Transforms grid values into interpolation locations of the input tensor + * for a particular spatial dimension, based on the size of that dimension + * in input tensor, and the padding mode. + * + * template + * struct ComputeLocation { + * using Vec = Vec256; + * + * // ctor + * ComputeLocation(int64_t size); + * + * // Given grid values `in`, return the interpolation locations after + * // un-normalization and padding mechanism (elementwise). + * Vec apply(const Vec &in) const; + * + * // Similar to `apply`, but also returns `d apply(in) / d in` + * // (elementwise). + * // this is often used in gradient computation. + * std::pair apply_get_grad(const Vec &in) const; + * }; + * + * 2. `ApplyGridSample` struct + * Owns N `ComputeLocation` structs, where N is the number of spatial + * dimensions. Given N input grid vectors (one for each spatial dimension) + * and spatial offset, it gets the interpolation locations from + * `ComputeLocation`s, applies interpolation procedure, and then writes to + * the output (or grad_input & grad_grid in backward). + * + * template + * struct ApplyGridSample { + * + * // ctor + * ApplyGridSample(const TensorAccessor& input); + * + * // Applies grid sampling (forward) procedure: + * // 1. computes interpolation locations from grid values `grid_x` + * // and `grid_y`, + * // 2. interpolates output values using the locations and input + * // data in `inp_slice`, and + * // 3. writes the first `len` values in the interpolated vector to + * // `out_slice` with spatial offset being `offset`. + * // + * // This assimes that `grid_x` and `grid_y` all contain valid grid + * // values \in [-1, 1], even at indices greater than `len`. + * // + * // The `*_slice` argument namess mean samples within a batch (i.e., + * // with the batch dimension sliced out). + * void forward(TensorAccessor& out_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * + * // Applies grid sampling (backward) procedure. Arguments semantics + * // and strategy are similar to those of `forward`. + * void backward(TensorAccessor& gInp_slice, + * TensorAccessor& gGrid_slice, + * const TensorAccessor& gOut_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * }; + * + * 3. `grid_sample_2d_grid_slice_iterator` function + * Among the tensors we work with, we know that the output tensors are + * contiguous (i.e., `output` in forward, and `grad_input` & `grad_grid` in + * backward), we need to randomly read `input` anyways, and `grad_output` + * usually comes from autograd and is often contiguous. So we base our + * iterating strategy on the geometry of grid. + * `grid_sample_2d_grid_slice_iterator` function provides an abstraction to + * efficiently iterates through a `grid` slice (without batch dimension). + * See comments of that function on the specific cases and strategies used. 
+ * + * template + * void grid_sample_2d_grid_slice_iterator( + * const TensorAccessor& grid_slice, + * const ApplyFn &apply_fn); + * + * `apply_fn` is a function/lambda that takes in + * i. location vectors (x and y for 2D) extracted from grid + * ii. `spatial_offset` as the spatial offset of these vectors + * from the beginning of this slice. + * iii. `len` as the number of valid locations in the vectors. + * (There might not be enough near boundary.) + + * It should be callable as if it has declaration: + * void apply_fn(const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len); + * + * `apply_fn` will be called multiple times, and together cover the entire + * output spatial space. + * + * Now you should be able tp understand everything about the implementaion of + * 2D forward kernel shown at the beginning of this note. + * + **/ + + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; +using namespace at::vec256; + + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ComputeLocation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to compute interpolation location from grid values, and to apply +// padding mechanism (e.g., reflection). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +struct ComputeLocationBase { + using Vec = Vec256; + + const scalar_t half_max_val; + + ComputeLocationBase(int64_t size) + : half_max_val(static_cast(size - 1) / 2) {} + + inline Vec unnormalize(const Vec &in) const { + return (in + Vec(1)) * Vec(half_max_val); + } +}; + +template +struct ComputeLocation; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + using ComputeLocationBase::ComputeLocationBase; + + inline Vec apply(const Vec &in) const { + return unnormalize(in); + } + + inline std::pair apply_get_grad(const Vec &in) const { + return std::make_pair(unnormalize(in), Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + const scalar_t max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , max_val(static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + return min(Vec(max_val), max(unnormalize(in), Vec(0))); + } + inline std::pair apply_get_grad(const Vec &in) const { + using int_t = int_same_size_t; + Vec max_val_vec(max_val), zeros(0); + auto indices = unnormalize(in); + auto bounded_lo = max(indices, zeros); + // Integral type equality comparison is very very fast because it just looks + // at the bits. Casting is free too. So we use the following pattern instead + // of comparison + blendv. 
+ auto in_bound_lo = cast(cast(bounded_lo) == cast(indices)); + auto res = min(bounded_lo, max_val_vec); + auto in_bound_hi = cast(cast(res) == cast(indices)); + return std::make_pair(res, (in_bound_lo & in_bound_hi) & Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + bool unit_size; // whether size == 1, just return 0 in this case + const scalar_t double_max_val; + const scalar_t neg_half_max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , unit_size(size == 1) + , double_max_val(static_cast((size - 1) * 2)) + , neg_half_max_val(-0.5 * static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + if (unit_size) { + return Vec(0); + } + Vec double_max_val_vec(double_max_val); + auto abs_in = unnormalize(in).abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + auto extra = abs_in - double_flips * double_max_val_vec; + // Now we need to test if extra > max_val to find out if another flip is + // needed. The following comparison does that and returns the correct + // flipped value. + return min(extra, double_max_val_vec - extra); + } + + inline std::pair apply_get_grad(const Vec &in) const { + if (unit_size) { + return std::make_pair(Vec(0), Vec(0)); + } + Vec double_max_val_vec(double_max_val); + auto unnorm_in = unnormalize(in); + auto neg_in = unnorm_in < Vec(0); + auto abs_in = unnorm_in.abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + + auto extra = abs_in - double_flips * double_max_val_vec; + auto reflected_extra = double_max_val_vec - extra; + auto one_more_flip = extra > reflected_extra; + + return std::make_pair( + Vec::blendv(extra, reflected_extra, one_more_flip), + Vec::blendv(Vec(half_max_val), Vec(neg_half_max_val), one_more_flip ^ neg_in) + ); + } +}; + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ApplyGridSample ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to apply grid sample (reading from input, interpolate, and write to +// output). +// See NOTE [ Grid Sample CPU Kernels ] for details. 
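
// Before the ApplyGridSample machinery below, a scalar restatement of what the
// three ComputeLocation specializations above compute per element may help.
// This sketch is not part of the patch; the helper names (loc_zeros,
// loc_border, loc_reflection) are illustrative only, it assumes the same
// unnormalization (x + 1) * (size - 1) / 2 used in ComputeLocationBase, and it
// would need <algorithm> and <cmath> if compiled:
//
//   float loc_zeros(float x, int64_t size) {           // Zeros padding:
//     return (x + 1) * (size - 1) / 2;                 // unnormalize only;
//   }                                                  // OOB reads are masked later
//
//   float loc_border(float x, int64_t size) {          // Border padding:
//     float u = (x + 1) * (size - 1) / 2;              // clip to [0, size - 1]
//     return std::min(std::max(u, 0.0f), float(size - 1));
//   }
//
//   float loc_reflection(float x, int64_t size) {      // Reflection padding:
//     if (size == 1) return 0.0f;
//     float m = 2.0f * (size - 1);                     // reflection period
//     float u = std::fabs((x + 1) * (size - 1) / 2);
//     float extra = u - std::trunc(u / m) * m;         // strip whole flips
//     return std::min(extra, m - extra);               // one more flip if needed
//   }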
+ +template +static inline void +mask_scatter_add(const scalar_t *src, scalar_t* base_addr, + const int_same_size_t *offsets, + const int_same_size_t *mask, int64_t len) { + #pragma unroll + for (int64_t i = 0; i < len; i++) { + if (mask[i] & 0x01) { + base_addr[offsets[i]] += src[i]; + } + } +} + +template +struct ApplyGridSample; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline std::tuple< + Vec, Vec, Vec, Vec, // distances to 4 sides + Vec, Vec, Vec, Vec, // interpolation weights wrt 4 corners + Vec, Vec, Vec, Vec, // in_bound masks + iVec, iVec // y_n and x_w + > + compute_interp_params(const Vec& x, const Vec& y) const { + // get NE, NW, SE, SW pixel values from (x, y) + // assuming we get exact integer representation and just use scalar_t + // if we don't, the weights will be garbage anyways. + auto x_w = x.floor(); + auto y_n = y.floor(); + + // get distances to each side + auto w = x - x_w; + auto e = Vec(1) - w; + auto n = y - y_n; + auto s = Vec(1) - n; + + // get interpolation weights for each neighbor + // e.g., for the nw corder, the weight is `dist_to_south * dist_to_east`. + auto nw = s * e; + auto ne = s * w; + auto sw = n * e; + auto se = n * w; + + auto i_x_w = convert_to_int_of_same_size(x_w); + auto i_y_n = convert_to_int_of_same_size(y_n); + auto i_x_e = i_x_w + iVec(1); + auto i_y_s = i_y_n + iVec(1); + + // Use int comparison because it is much faster than float comp with AVX2 + // (latency 1 cyc vs. 4 cyc on skylake) + // Avoid using the le and ge because those are not implemented in AVX2 and + // are actually simulated using multiple instructions. + auto w_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_x_w > iVec(-1)) & (i_x_w < iVec(inp_W)); + auto n_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_y_n > iVec(-1)) & (i_y_n < iVec(inp_H)); + auto e_mask = must_in_bound ? (i_x_e < iVec(inp_W)) + : (i_x_e > iVec(-1)) & (i_x_e < iVec(inp_W)); + auto s_mask = must_in_bound ? (i_y_s < iVec(inp_H)) + : (i_y_s > iVec(-1)) & (i_y_s < iVec(inp_H)); + auto nw_mask = cast(must_in_bound ? 
iVec(-1) : (w_mask & n_mask)); + auto ne_mask = cast(e_mask & n_mask); + auto sw_mask = cast(w_mask & s_mask); + auto se_mask = cast(e_mask & s_mask); + + return std::make_tuple( + n, s, w, e, + nw, ne, sw, se, + nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w); + } + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto interp_params = compute_interp_params(x, y); + + auto nw = std::get<4>(interp_params); + auto ne = std::get<5>(interp_params); + auto sw = std::get<6>(interp_params); + auto se = std::get<7>(interp_params); + + auto nw_mask = std::get<8>(interp_params); + auto ne_mask = std::get<9>(interp_params); + auto sw_mask = std::get<10>(interp_params); + auto se_mask = std::get<11>(interp_params); + + auto i_y_n = std::get<12>(interp_params); + auto i_x_w = std::get<13>(interp_params); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + auto interpolated = (nw_val * nw) + (ne_val * ne) + (sw_val * sw) + (se_val * se); + interpolated.store(out_slice[c].data() + offset, len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + Vec x, y, gx_mult, gy_mult; + std::tie(x, gx_mult) = compute_W.apply_get_grad(grid_x); + std::tie(y, gy_mult) = compute_H.apply_get_grad(grid_y); + + Vec n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask; + iVec i_y_n, i_x_w; + + std::tie( + n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w) = compute_interp_params(x, y); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + auto i_gInp_nw_offset = i_y_n * iVec(inp_W) + i_x_w; + auto i_gInp_ne_offset = i_gInp_nw_offset + iVec(1); + auto i_gInp_sw_offset = i_gInp_nw_offset + iVec(inp_W); + auto i_gInp_se_offset = i_gInp_sw_offset + iVec(1); + + // When reading input values, we used mask_gather. Unfortunately, there is + // no mask_scatter_add (the backward of mask_gather) in Intel intrinsics. + // So we store the necessary vectors to temporary arrays and use the helper + // mask_scatter_add defined above. 
+ + integer_t i_gInp_nw_offset_arr[iVec::size]; + integer_t i_gInp_ne_offset_arr[iVec::size]; + integer_t i_gInp_sw_offset_arr[iVec::size]; + integer_t i_gInp_se_offset_arr[iVec::size]; + i_gInp_nw_offset.store(i_gInp_nw_offset_arr); + i_gInp_ne_offset.store(i_gInp_ne_offset_arr); + i_gInp_sw_offset.store(i_gInp_sw_offset_arr); + i_gInp_se_offset.store(i_gInp_se_offset_arr); + + integer_t i_nw_mask_arr[iVec::size]; + integer_t i_ne_mask_arr[iVec::size]; + integer_t i_sw_mask_arr[iVec::size]; + integer_t i_se_mask_arr[iVec::size]; + nw_mask.store(i_nw_mask_arr); + ne_mask.store(i_ne_mask_arr); + sw_mask.store(i_sw_mask_arr); + se_mask.store(i_se_mask_arr); + + scalar_t gInp_corner_arr[Vec::size]; + + auto gx = Vec(0), gy = Vec(0); + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + auto gInp_slice_C_ptr = gInp_slice[c].data(); + auto gOut = Vec::loadu(gOut_slice[c].data() + offset, len); + + (nw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_nw_offset_arr, i_nw_mask_arr, len); + (ne * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_ne_offset_arr, i_ne_mask_arr, len); + (sw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_sw_offset_arr, i_sw_mask_arr, len); + (se * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_se_offset_arr, i_se_mask_arr, len); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + gx = gx + ((ne_val - nw_val) * s + (se_val - sw_val) * n) * gOut; + gy = gy + ((sw_val - nw_val) * e + (se_val - ne_val) * w) * gOut; + } + + gx = gx * gx_mult; + gy = gy * gy_mult; + + constexpr int64_t step = Vec::size; + auto interleaved_gGrid = interleave2(gx, gy); + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::get<0>(interleaved_gGrid).store(gGrid_ptr, + std::min(len * 2, step)); + std::get<1>(interleaved_gGrid).store(gGrid_ptr + step, + std::max(static_cast(0), len * 2 - step)); + } +}; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto 
i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + auto mask = cast(i_mask); + + auto i_offset = i_y_nearest * iVec(inp_sH) + i_x_nearest * iVec(inp_sW); + + auto out_ptr = out_slice.data() + offset; + auto out_sC = out_slice.stride(0); + auto inp_slice_ptr = inp_slice.data(); + #pragma unroll + for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) { + // mask_gather zeros out the mask, so we need to make a copy + auto mask_copy = mask; + auto inp_val = mask_gather(Vec(0), inp_slice_ptr, i_offset, mask_copy); + inp_val.store(static_cast(out_ptr), len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + + auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest; // gInp is contiguous + + integer_t mask_arr[iVec::size]; + i_mask.store(mask_arr); + integer_t gInp_offset_arr[iVec::size]; + i_gInp_offset.store(gInp_offset_arr); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(), + gInp_offset_arr, mask_arr, len); + } + + // grid has zero 0 gradient in Nearest mode + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::memset(gGrid_ptr, 0, sizeof(scalar_t) * len * 2); + } +}; + +// ~~~~~~~~~~~~~~~~~~ grid_sample_2d_grid_slice_iterator ~~~~~~~~~~~~~~~~~~~~~~ +// Function to apply a vectorized function on a grid slice tensor (without batch +// dimension). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +static inline void grid_sample_2d_grid_slice_iterator( + const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { + int64_t out_H = grid_slice.size(0); + int64_t out_W = grid_slice.size(1); + int64_t grid_sH = grid_slice.stride(0); + int64_t grid_sW = grid_slice.stride(1); + int64_t grid_sCoor = grid_slice.stride(2); + auto grid_ptr = grid_slice.data(); + + using Vec = Vec256; + using iVec = Vec256>; + constexpr int64_t step = Vec::size; + + // Loop over each output pixel in grid. + // We consider the following three cases (after slicing out the batch + // dimension). + // See detailed discussions under each if-case. + + if (at::geometry_is_contiguous({out_H, out_W, 2}, {grid_sH, grid_sW, grid_sCoor})) { + // Case 1: + // Grid is contiguous. + // Strategy: Sequentially load two vectors at the same time, and get, + // e.g., {x0, y0, x1, y1}, {x2, y2, x3, y3}. Then we use + // at::vec256::deinterleave2 to get x and y vectors. 
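    // (Illustration of the strategy described above, using hypothetical
    // values: a contiguous grid slice stores interleaved coordinate pairs
    //   grid_ptr: x0 y0 x1 y1 | x2 y2 x3 y3 | ...
    // so each iteration loads two vectors covering `step` output locations,
    // and deinterleave2({x0,y0,x1,y1}, {x2,y2,x3,y3}) yields
    // x = {x0,x1,x2,x3} and y = {y0,y1,y2,y3} -- shown here for step == 4;
    // the actual step is Vec256<scalar_t>::size.)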
+ auto total_size = out_H * out_W; + for (int64_t spatial_offset = 0; spatial_offset < total_size; spatial_offset += step) { + auto grid_offset = spatial_offset * 2; + auto len = std::min(step, total_size - spatial_offset); + auto vec1 = Vec::loadu(grid_ptr + grid_offset, + std::min(step, len * 2)); + auto vec2 = Vec::loadu(grid_ptr + grid_offset + step, + std::max(static_cast(0), len * 2 - step)); + auto vec_xy_pair = deinterleave2(vec1, vec2); + + auto x = std::get<0>(vec_xy_pair); + auto y = std::get<1>(vec_xy_pair); + + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, spatial_offset, len); + } + } else if (grid_sW == 1 || out_W == 1) { + // Case 2: + // The W dimension is contiguous. + // This can be common, e.g., grid is from a conv net output of shape + // [N, 2, H, W]. + // Strategy: Divide into two contiguous slices each of shape [H, W], and + // each containing x and y vectors. So we sequentially load a + // vector from each of them to get x and y vector + + // Function to apply along a contiguous W dimension (or flattened H x W). + auto line_fn = [&](const scalar_t *grid_ptr_x, const scalar_t *grid_ptr_y, + int64_t out_base_offset, int64_t total_size) { + for (int64_t i = 0; i < total_size; i += step) { + auto len = std::min(step, total_size - i); + auto x = Vec::loadu(grid_ptr_x + i, len); + auto y = Vec::loadu(grid_ptr_y + i, len); + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, out_base_offset + i, len); + } + }; + + if (at::geometry_is_contiguous({out_H, out_W}, {grid_sH, grid_sW})) { + // If [H, W] is contiguous, apply line_fn once. + line_fn(grid_ptr, grid_ptr + grid_sCoor, 0, out_H * out_W); + } else { + // If only [W] is contiguous, apply line_fn once for each h slice. + auto grid_ptr_NH = grid_ptr; + for (int64_t h = 0; h < out_H; h++) { + line_fn(grid_ptr_NH, grid_ptr_NH + grid_sCoor, h * out_W, out_W); + grid_ptr_NH += grid_sH; + } + } + } else { + // Case 3: + // General case. + // Strategy: Do a for-loop over H, for each W slice, use + // at::vec256::gather to load the x and y vectors. + auto spatial_offset = 0; + auto i_offsets_delta = iVec(grid_sW * step); + + #pragma unroll + for (int64_t h = 0; h < out_H; h++) { + auto grid_ptr_x = grid_ptr + h * grid_sH; + auto grid_ptr_y = grid_ptr_x + grid_sCoor; + auto i_offsets = iVec::arange(0, grid_sW); + #pragma unroll + for (int64_t w = 0; w < out_W; w += step) { + auto len = std::min(step, out_W - w); + if (len < step) { + // prevents illegal memory access, sets the exceeding offsets to zero + i_offsets = iVec::set(iVec(0), i_offsets, len); + } + apply_fn(gather(grid_ptr_x, i_offsets), + gather(grid_ptr_y, i_offsets), + spatial_offset, len); + + i_offsets = i_offsets + i_offsets_delta; + spatial_offset += len; + } + } + } +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~ Grid Sample Kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Use the structs & functions defined above to calculate grid sample forward +// and backward. +// See NOTE [ Grid Sample CPU Kernels ] for details. 
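
// (How the kernels below are wired into at::native, condensed from the other
// files touched by this patch: GridSamplerKernel.h declares the dispatch
// stubs, this file registers the CPU implementations, and GridSampler.cpp
// defines and calls the stubs. For the backward kernel, for example:
//
//   // GridSamplerKernel.h
//   DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel);
//
//   // GridSampler.cpp
//   DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel);
//   return grid_sampler_2d_backward_cpu_kernel(kCPU, grad_output, input, grid,
//                                              interpolation_mode, padding_mode);
//
//   // this file, at the bottom
//   REGISTER_DISPATCH(grid_sampler_2d_backward_cpu_kernel,
//                     &grid_sampler_2d_backward_cpu_kernel_impl);
//
// The forward kernel is hooked up analogously.)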
+ +Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto output = at::empty({N, input.size(1), H, W}, input.options()); + auto spatial_size = H * W; + auto grain_size = spatial_size == 0 ? (N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto out_slice = out_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.forward(out_slice, inp_slice, spatial_offset, \ + grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_cpu_kernel_impl", [&] { + auto out_acc = output.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return output; +} + +std::tuple +grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + // grad_output should be contiguous most of time. Ensuring that it is + // contiguous can greatly simplify this code. + auto grad_output = grad_output_.contiguous(); + + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + auto N = input.size(0); + auto spatial_size = grid.size(1) * grid.size(2); + auto grain_size = spatial_size == 0 ? 
(N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 10 /* 2d * 5 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto gInp_slice = gInp_acc[n]; \ + auto gGrid_slice = gGrid_acc[n]; \ + auto gOut_slice = gOut_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.backward(gInp_slice, gGrid_slice, gOut_slice, inp_slice, \ + spatial_offset, grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] { + auto gInp_acc = grad_input.accessor(); + auto gGrid_acc = grad_grid.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + auto gOut_acc = grad_output.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return std::make_tuple(grad_input, grad_grid); +} + +} + +REGISTER_DISPATCH(grid_sampler_2d_cpu_kernel, &grid_sampler_2d_cpu_kernel_impl); +REGISTER_DISPATCH(grid_sampler_2d_backward_cpu_kernel, &grid_sampler_2d_backward_cpu_kernel_impl); + + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.h b/aten/src/ATen/native/cpu/GridSamplerKernel.h new file mode 100644 index 00000000000000..36ba3a91cc9bb8 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/native/DispatchStub.h" +#include "ATen/cpu/vml.h" + +#include + +namespace at { namespace native { + +using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t); +using backward_2d_fn = std::tuple(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t); +DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); +DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); + +}} // namespace at::native diff --git a/test/test_nn.py b/test/test_nn.py index 809b820c42202e..ac2496620c3977 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -5097,34 +5097,55 @@ def test_grid_sample_error_checking(self): def test_grid_sample(self): def test(N, C, H, W, mode, padding_mode): def test_shape(N, C, IH, IW, H, W, mode, padding_mode): - input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() - grid_cpu = torch.randn(H, N, W, 2).transpose(0, 1).requires_grad_() - out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) - self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) + for grid_dim_contig_order in [(0, 1, 2, 3), (0, 3, 1, 2), (3, 0, 1, 2), (0, 2, 1, 3)]: + # grid_dim_contig_order specifies the dimension order that can + # make grid to be contiguous. + # i.e., grid.permute(grid_dim_contig_order) is contiguous. 
+ # e.g., with grid_dim_contig_order=[0, 3, 1, 2], grid should be + # initialized with contiguous tensor of shape [N, 2, H, W] + # and permuted to [N, H, W, 2] afterwards. + grid_shape = [N, H, W, 2] + grid_init_shape = [grid_shape[d] for d in grid_dim_contig_order] + grid_fwd_permute = [None, None, None, None] + for i, d in enumerate(grid_dim_contig_order): + grid_fwd_permute[d] = i + + def get_grid(device='cpu', data=None): + if data is not None: + assert list(data.shape) == grid_shape + data = data.permute(grid_dim_contig_order).to(device) + else: + data = torch.randn(grid_init_shape, device=device) + grid = data.permute(grid_fwd_permute) + assert grid.permute(grid_dim_contig_order).is_contiguous() + return grid + + input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() + grid_cpu = get_grid().requires_grad_() + out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) + self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) - gradients = torch.randn_like(out_cpu) - out_cpu.backward(gradients) + gradients = torch.randn_like(out_cpu) + out_cpu.backward(gradients) - if TEST_CUDA: - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) - self.assertEqual(out_cpu, out_cuda) + if TEST_CUDA: + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + grid_cuda = get_grid('cuda', grid_cpu.detach()).requires_grad_() + out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) + self.assertEqual(out_cpu, out_cuda) - out_cuda.backward(gradients.cuda()) - self.assertEqual(input_cpu.grad, input_cuda.grad) - self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) + out_cuda.backward(gradients.cuda()) + self.assertEqual(input_cpu.grad, input_cuda.grad) + self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) - # check that zero-dimensional input strides don't error out - base_input = torch.randn(N, C, 1, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() - grid_cpu = torch.randn(N, H, W, 2, requires_grad=True) - out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) + # check that zero-dimensional input strides don't error out + base_input = torch.randn(N, C, 1, IW) + input_cpu = base_input.expand_as(input_cuda).requires_grad_() + out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() - grid_cuda = grid_cpu.detach().cuda().requires_grad_() - out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) - self.assertEqual(out_cpu, out_cuda) + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) + self.assertEqual(out_cpu, out_cuda) # test same size output test_shape(N, C, H, W, H, W, mode, padding_mode) @@ -5182,15 +5203,11 @@ def test_shape(N, C, IH, IW, H, W, mode, padding_mode): for mode in ('bilinear', 'nearest'): for padding_mode in ('zeros', 'border', 'reflection'): - # test known input on CPU input = torch.arange(1., 11).view(1, 1, 2, 5) grid = torch.tensor( - [[-0.9, -4.1, 0, 0.2, 1], - [-1, -0.333, 0, 0.5, 1], - [-1, -0.5, 0, 0.3333, 1], - [-1, -0.2, 0, 1.5, 0.5]]).view(1, 2, 5, 2) - output = 
F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode) + [[[-0.9, -4.1], [0, 0.2000], [1, -1], [-0.333, 1e-10], [0.5, 1.0]], + [[-1.0, -0.5], [0, 0.3333], [1, -1], [-0.200, 1e-10], [1.5, 0.5]]]).view(1, 2, 5, 2) if mode == 'bilinear': if padding_mode == 'zeros': groundtruth = torch.tensor( @@ -5223,7 +5240,10 @@ def test_shape(N, C, IH, IW, H, W, mode, padding_mode): assert False, "missing groundtruth test for padding mode '{}'".format(padding_mode) else: assert False, "missing groundtruth test for interpolation mode '{}'".format(mode) - self.assertEqual(output, groundtruth) + output = F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode) + self.assertEqual(output, groundtruth, + "groundtruth comparison failed for mode={}, " + "padding_mode={}".format(mode, padding_mode)) # do gradcheck N = random.randint(2, 8) From f5bc2aef0790c82f002be79db718203284de531e Mon Sep 17 00:00:00 2001 From: JerryShih Date: Mon, 17 Sep 2018 07:44:08 -0700 Subject: [PATCH 181/237] Update OpenMP cmake setting for xcode 9 compiler(AppleClang 9.0) (#11563) Summary: Fix the link OpenMP link error for AppleClang 9.0 compiler. Built with the following command: python setup.py build develop The error message: ``` Undefined symbols for architecture x86_64: "___kmpc_critical", referenced from: _THFloatTensor_addmm in THTensorMath.cpp.o _THDoubleTensor_addmm in THTensorMath.cpp.o _THByteTensor_addmm in THTensorMath.cpp.o _THCharTensor_addmm in THTensorMath.cpp.o _THShortTensor_addmm in THTensorMath.cpp.o _THIntTensor_addmm in THTensorMath.cpp.o _THLongTensor_addmm in THTensorMath.cpp.o ... "___kmpc_end_critical", referenced from: _THFloatTensor_addmm in THTensorMath.cpp.o _THDoubleTensor_addmm in THTensorMath.cpp.o _THByteTensor_addmm in THTensorMath.cpp.o _THCharTensor_addmm in THTensorMath.cpp.o _THShortTensor_addmm in THTensorMath.cpp.o _THIntTensor_addmm in THTensorMath.cpp.o _THLongTensor_addmm in THTensorMath.cpp.o ... "___kmpc_end_reduce_nowait", referenced from: _.omp_outlined..270 in THTensorMoreMath.cpp.o _.omp_outlined..271 in THTensorMoreMath.cpp.o _.omp_outlined..273 in THTensorMoreMath.cpp.o _.omp_outlined..275 in THTensorMoreMath.cpp.o _.omp_outlined..43 in THTensorEvenMoreMath.cpp.o _.omp_outlined..44 in THTensorEvenMoreMath.cpp.o _.omp_outlined..46 in THTensorEvenMoreMath.cpp.o ... "___kmpc_end_serialized_parallel", referenced from: at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::log_softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::native::log_softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::TensorIterator::for_each(std::__1::function const&) in TensorIterator.cpp.o ... "___kmpc_for_static_fini", referenced from: _.omp_outlined..9 in Embedding.cpp.o _.omp_outlined. in EmbeddingBag.cpp.o _.omp_outlined. in GridSampler.cpp.o _.omp_outlined..42 in GridSampler.cpp.o _.omp_outlined..44 in GridSampler.cpp.o _.omp_outlined..45 in GridSampler.cpp.o _.omp_outlined..47 in GridSampler.cpp.o ... "___kmpc_for_static_init_4", referenced from: _.omp_outlined. 
in init.cpp.o _.omp_outlined..35 in init.cpp.o _.omp_outlined..36 in init.cpp.o _.omp_outlined..37 in init.cpp.o _.omp_outlined..49 in init.cpp.o _.omp_outlined..52 in init.cpp.o _.omp_outlined..220 in init.cpp.o ... "___kmpc_for_static_init_8", referenced from: _.omp_outlined..9 in Embedding.cpp.o _.omp_outlined. in EmbeddingBag.cpp.o _.omp_outlined. in GridSampler.cpp.o _.omp_outlined..42 in GridSampler.cpp.o _.omp_outlined..44 in GridSampler.cpp.o _.omp_outlined..45 in GridSampler.cpp.o _.omp_outlined..47 in GridSampler.cpp.o ... "___kmpc_for_static_init_8u", referenced from: _.omp_outlined..203 in init.cpp.o _.omp_outlined..207 in init.cpp.o _.omp_outlined..209 in init.cpp.o _.omp_outlined..210 in init.cpp.o "___kmpc_fork_call", referenced from: at::native::embedding_dense_backward_cpu(at::Tensor const&, at::Tensor const&, long long, long long, bool) in Embedding.cpp.o at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::grid_sampler_2d_cpu(at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o at::native::grid_sampler_3d_cpu(at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o at::native::grid_sampler_2d_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o at::native::grid_sampler_3d_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o ... "___kmpc_global_thread_num", referenced from: at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::log_softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::native::log_softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::TensorIterator::for_each(std::__1::function const&) in TensorIterator.cpp.o ... 
"___kmpc_push_num_threads", referenced from: void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::OuterStride<-1> > const>, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::OuterStride<-1> > const>, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Transpose const, 0, 
Eigen::OuterStride<-1> > const>, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Transpose const, 0, Eigen::OuterStride<-1> > const>, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o ... "___kmpc_reduce_nowait", referenced from: _.omp_outlined..270 in THTensorMoreMath.cpp.o _.omp_outlined..271 in THTensorMoreMath.cpp.o _.omp_outlined..273 in THTensorMoreMath.cpp.o _.omp_outlined..275 in THTensorMoreMath.cpp.o _.omp_outlined..43 in THTensorEvenMoreMath.cpp.o _.omp_outlined..44 in THTensorEvenMoreMath.cpp.o _.omp_outlined..46 in THTensorEvenMoreMath.cpp.o ... "___kmpc_serialized_parallel", referenced from: at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::log_softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::native::log_softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::TensorIterator::for_each(std::__1::function const&) in TensorIterator.cpp.o ... "_omp_get_max_threads", referenced from: _THGetNumThreads in THGeneral.cpp.o caffe2::Caffe2SetOpenMPThreads(int*, char***) in init_omp.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, 
Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o ... "_omp_get_num_procs", referenced from: _THGetNumCores in THGeneral.cpp.o "_omp_get_num_threads", referenced from: _.omp_outlined. in Embedding.cpp.o _.omp_outlined. in SoftMax.cpp.o _.omp_outlined..35 in SoftMax.cpp.o _.omp_outlined..37 in SoftMax.cpp.o _.omp_outlined..38 in SoftMax.cpp.o _.omp_outlined..46 in SoftMax.cpp.o _.omp_outlined..47 in SoftMax.cpp.o ... "_omp_get_thread_num", referenced from: _.omp_outlined. in Embedding.cpp.o _.omp_outlined. in SoftMax.cpp.o _.omp_outlined..35 in SoftMax.cpp.o _.omp_outlined..37 in SoftMax.cpp.o _.omp_outlined..38 in SoftMax.cpp.o _.omp_outlined..46 in SoftMax.cpp.o _.omp_outlined..47 in SoftMax.cpp.o ... "_omp_in_parallel", referenced from: _THFloatTensor_copy in THTensorCopy.cpp.o _THDoubleTensor_copy in THTensorCopy.cpp.o _THByteTensor_copy in THTensorCopy.cpp.o _THCharTensor_copy in THTensorCopy.cpp.o _THShortTensor_copy in THTensorCopy.cpp.o _THIntTensor_copy in THTensorCopy.cpp.o _THLongTensor_copy in THTensorCopy.cpp.o ... 
"_omp_set_num_threads", referenced from: _THSetNumThreads in THGeneral.cpp.o caffe2::Caffe2SetOpenMPThreads(int*, char***) in init_omp.cc.o ld: symbol(s) not found for architecture x86_64 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11563 Differential Revision: D9831398 Pulled By: ezyang fbshipit-source-id: db119d3f9c26a71180335ad955f2f62c5369f9ed --- caffe2/CMakeLists.txt | 8 +++++ cmake/Dependencies.cmake | 65 +++++++++++++++++----------------------- torch/CMakeLists.txt | 13 ++++++-- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4e9b3eef8937f4..361ce228f3227a 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -206,6 +206,14 @@ if (${CAFFE2_LINK_LOCAL_PROTOBUF}) else() target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) endif() +#cmake only check for separate OpenMP library on AppleClang 7+ +#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 +if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") + if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") + target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY}) + endif() +endif() target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index dbec0d07ebf6e3..8fdfbd8ca8eb4a 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -456,12 +456,37 @@ endif() # ---[ OpenMP if(USE_OPENMP) - find_package(OpenMP) + set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") + if(APPLE AND CMAKE_COMPILER_IS_GNUCC) + exec_program(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) + string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) + message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") + if(DARWIN_VERSION GREATER 9) + set(APPLE_OPENMP_SUCKS 1) + endif(DARWIN_VERSION GREATER 9) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion + OUTPUT_VARIABLE GCC_VERSION) + if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) + message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") + message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") + add_compile_options(-Wno-unknown-pragmas) + set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) + endif() + endif() + + if(WITH_OPENMP AND NOT CHECKED_OPENMP) + find_package(OpenMP) + set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") + + # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) + # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake + set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") + endif() + if(OPENMP_FOUND) message(STATUS "Adding " ${OpenMP_CXX_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") else() message(WARNING "Not compiling with OpenMP. 
Suppress this warning with -DUSE_OPENMP=OFF") caffe2_update_option(USE_OPENMP OFF) @@ -956,42 +981,6 @@ if (NOT BUILD_ATEN_MOBILE) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG}) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE}) ENDIF() - - # OpenMP support? - SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") - IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) - EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) - STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) - MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") - IF (DARWIN_VERSION GREATER 9) - SET(APPLE_OPENMP_SUCKS 1) - ENDIF (DARWIN_VERSION GREATER 9) - EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion - OUTPUT_VARIABLE GCC_VERSION) - IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) - MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") - MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") - add_compile_options(-Wno-unknown-pragmas) - SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) - ENDIF() - ENDIF() - - IF (WITH_OPENMP AND NOT CHECKED_OPENMP) - FIND_PACKAGE(OpenMP) - SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") - - # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) - # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake - SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") - ENDIF() - - IF (OPENMP_FOUND) - MESSAGE(STATUS "Compiling with OpenMP support") - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - ENDIF() - - SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) FIND_PACKAGE(MAGMA) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 7679f3d6467ba0..706ff0c2d7091b 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -3,7 +3,7 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) return() endif() else() - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + cmake_minimum_required(VERSION 3.5 FATAL_ERROR) project(torch CXX C) find_package(Caffe2 REQUIRED) option(USE_CUDA "Use CUDA" ON) @@ -322,8 +322,15 @@ if(OPENMP_FOUND) if (VERBOSE) message(STATUS "Compiling with OpenMP") endif() - target_compile_options(torch INTERFACE -fopenmp) - target_link_libraries(torch -fopenmp) + target_compile_options(torch INTERFACE ${OpenMP_CXX_FLAGS}) + #cmake only check for separate OpenMP library on AppleClang 7+ + #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 + if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") + if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") + target_link_libraries(torch ${OpenMP_libomp_LIBRARY}) + endif() + endif() endif() if (NOT NO_API AND NOT USE_ROCM) From d63bb72d89723d0d913ba38b7a98e288236391b0 Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 17 Sep 2018 08:12:40 -0700 Subject: [PATCH 182/237] Remove symbol export annotations in THC/generic/*.cu (#11367) Summary: We use these annotations during function declarations, not definitions. See the description of compiler error [C2491](https://msdn.microsoft.com/en-us/library/62688esh.aspx) for more details. 
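
As a concrete illustration of the pattern this change enforces, here is a minimal sketch using a hypothetical MY_API macro standing in for THC's export macro (not code from the repository): the import/export annotation belongs on the declaration in the header, and when such a macro expands to __declspec(dllimport) on a function definition, MSVC raises C2491. That is why the definitions in these .cu files drop the annotation while the declarations keep it.

```
#include <cstdint>

// Hypothetical export macro, analogous in shape to THC_API.
#ifdef _WIN32
#  ifdef BUILDING_MYLIB
#    define MY_API __declspec(dllexport)
#  else
#    define MY_API __declspec(dllimport)  // on a definition this triggers C2491
#  endif
#else
#  define MY_API
#endif

// Header: annotate the declaration.
MY_API void my_fill(float* data, int64_t n, float value);

// Source file (e.g., a .cu/.cpp): define without the annotation.
void my_fill(float* data, int64_t n, float value) {
  for (int64_t i = 0; i < n; ++i) {
    data[i] = value;
  }
}
```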
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11367 Reviewed By: ezyang Differential Revision: D9697923 Pulled By: orionr fbshipit-source-id: 1e539c02957851386f887e6d0510ce83117a1695 --- aten/src/THC/THCTensorSort.cuh | 2 + aten/src/THC/generic/THCStorage.cu | 2 +- aten/src/THC/generic/THCTensor.cu | 2 +- aten/src/THC/generic/THCTensorCopy.cu | 13 ++- aten/src/THC/generic/THCTensorMasked.cu | 30 +++---- aten/src/THC/generic/THCTensorMath.cu | 12 +-- aten/src/THC/generic/THCTensorMathBlas.cu | 28 +++--- aten/src/THC/generic/THCTensorMathCompare.cu | 24 ++--- aten/src/THC/generic/THCTensorMathCompareT.cu | 36 +++----- aten/src/THC/generic/THCTensorMathMagma.cu | 28 +++--- aten/src/THC/generic/THCTensorMathPairwise.cu | 41 +++------ .../src/THC/generic/THCTensorMathPointwise.cu | 60 +++++-------- aten/src/THC/generic/THCTensorMathReduce.cu | 87 ++++++++----------- aten/src/THC/generic/THCTensorMode.cu | 42 ++++----- aten/src/THC/generic/THCTensorRandom.cu | 42 ++++----- aten/src/THC/generic/THCTensorSort.cu | 18 ++-- aten/src/THC/generic/THCTensorSort.h | 7 ++ aten/src/THC/generic/THCTensorTopK.cu | 10 +-- 18 files changed, 205 insertions(+), 279 deletions(-) diff --git a/aten/src/THC/THCTensorSort.cuh b/aten/src/THC/THCTensorSort.cuh index 9b75a7355a00f4..b8feedcc1f21c9 100644 --- a/aten/src/THC/THCTensorSort.cuh +++ b/aten/src/THC/THCTensorSort.cuh @@ -1,6 +1,8 @@ #ifndef THC_TENSORSORT_CUH #define THC_TENSORSORT_CUH +#include "THCTensorMath.h" +#include "THCGeneral.h" #include "THCReduceApplyUtils.cuh" #include "THCSortUtils.cuh" #include "THCTensorCopy.h" diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index 88ed2e5541820e..cd12d2bcae0fb6 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -18,7 +18,7 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) THCStorage_resize(state, self, size); } -THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { +int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { return THCStorage_getDevice(state, storage); } diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu index 98478341575c75..0c694f5e4e25c8 100644 --- a/aten/src/THC/generic/THCTensor.cu +++ b/aten/src/THC/generic/THCTensor.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensor.cu" #else -THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { +int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { return THCTensor_getDevice(state, tensor); } diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 25ded22747441f..2c05c74f247a48 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorCopy.cu" #else -THC_API void -THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { if (dst == src) return; THC_copyTensor(state, dst, src); } @@ -51,16 +50,14 @@ void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, T ReadOnly); } -THC_API void -THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { THCTensor_copyIgnoringOverlaps(state, dst, src); } #define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA, SCALARC) \ - THC_API 
void \ - THCTensor_(copyCuda##TYPEC)(THCState *state, \ - THCTensor *self, \ - THCuda##TYPECUDA##Tensor *src) { \ + void THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + THCuda##TYPECUDA##Tensor *src) { \ THC_copyTensor(state, self, src); \ } diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu index f7e3e3f32a9a18..684ce31b141f79 100644 --- a/aten/src/THC/generic/THCTensorMasked.cu +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -3,9 +3,8 @@ #else -THC_API void -THCTensor_(maskedFill)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) +void THCTensor_(maskedFill)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask)); THArgCheck(THCTensor_(nElement)(state, tensor) == @@ -20,9 +19,8 @@ THCTensor_(maskedFill)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedFillByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, scalar_t value) +void THCTensor_(maskedFillByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); @@ -31,9 +29,8 @@ THCTensor_(maskedFillByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedCopy)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) +void THCTensor_(maskedCopy)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); @@ -98,9 +95,8 @@ THCTensor_(maskedCopy)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedCopyByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, THCTensor *src) { +void THCTensor_(maskedCopyByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); THCudaByteTensor_copyByte(state, maskCuda, mask); @@ -108,9 +104,8 @@ THCTensor_(maskedCopyByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedSelect)(THCState* state, - THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { +void THCTensor_(maskedSelect)(THCState* state, + THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); THArgCheck(THCudaByteTensor_nElement(state, mask) == THCTensor_(nElement)(state, src), @@ -171,9 +166,8 @@ THCTensor_(maskedSelect)(THCState* state, } // FIXME: remove now that we have THCudaByteTensor? 
-THC_API void -THCTensor_(maskedSelectByte)(THCState* state, - THCTensor *tensor, THCTensor *src, THByteTensor *mask) +void THCTensor_(maskedSelectByte)(THCState* state, + THCTensor *tensor, THCTensor *src, THByteTensor *mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 9ffe626dd8425f..c4f7afb6a227b9 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMath.cu" #else -THC_API void -THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) +void THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -15,8 +14,7 @@ THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zero)(THCState *state, THCTensor *self_) +void THCTensor_(zero)(THCState *state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); if (THCTensor_(isContiguous)(state, self_)) { @@ -35,16 +33,14 @@ THCTensor_(zero)(THCState *state, THCTensor *self_) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); THCTensor_(zero)(state, r_); } -THC_API void -THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 45364c7574cad4..d83b8ff929b4f6 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -5,8 +5,7 @@ #define ERROR_ONLY_FP_TYPES(func) \ THError("%s for CUDA tensors only supports floating-point types. 
Try converting the tensors with .float()", func); -THC_API accreal -THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) +accreal THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -43,8 +42,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) #endif } -THC_API void -THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) +void THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); @@ -151,8 +149,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) +void THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); @@ -249,8 +246,7 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, sc #endif } -THC_API void -THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) +void THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) @@ -413,9 +409,8 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 2, 4, "expected 2D tensor"); @@ -478,9 +473,8 @@ __global__ void createBatchGemmBuffer3(const scalar_t** buffer1, const scalar_t } } -THC_API void -THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 3, 4, "expected 3D tensor"); @@ -745,7 +739,7 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor #endif } -THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) +void THCTensor_(btrifact)(THCState 
*state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); @@ -852,8 +846,8 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens } -THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, - THCTensor *atf, THCudaIntTensor *pivots) +void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, + THCTensor *atf, THCudaIntTensor *pivots) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b)); diff --git a/aten/src/THC/generic/THCTensorMathCompare.cu b/aten/src/THC/generic/THCTensorMathCompare.cu index 0a0041ab9e4784..3c8e8ce0c325a7 100644 --- a/aten/src/THC/generic/THCTensorMathCompare.cu +++ b/aten/src/THC/generic/THCTensorMathCompare.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu" #else -THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -10,7 +10,7 @@ THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -18,7 +18,7 @@ THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -26,7 +26,7 @@ THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -34,7 +34,7 @@ THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -42,7 +42,7 @@ THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -50,7 +50,7 @@ THC_API void THCTensor_(neValue)(THCState *state, 
THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -58,7 +58,7 @@ THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -66,7 +66,7 @@ THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -74,7 +74,7 @@ THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -82,7 +82,7 @@ THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -90,7 +90,7 @@ THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, diff --git a/aten/src/THC/generic/THCTensorMathCompareT.cu b/aten/src/THC/generic/THCTensorMathCompareT.cu index 6397a0b7caaa96..1bd4b9909fc26d 100644 --- a/aten/src/THC/generic/THCTensorMathCompareT.cu +++ b/aten/src/THC/generic/THCTensorMathCompareT.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu" #else -THC_API void -THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -11,8 +10,7 @@ THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ 
-20,8 +18,7 @@ THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -29,8 +26,7 @@ THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -38,8 +34,7 @@ THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -47,8 +42,7 @@ THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -56,8 +50,7 @@ THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -65,8 +58,7 @@ THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -74,8 +66,7 @@ THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -83,8 +74,7 @@ THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -92,8 +82,7 @@ 
THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -101,8 +90,7 @@ THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index ecf39d9a1bf0f2..29c7999f74a9b6 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -60,7 +60,7 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T } -THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -98,8 +98,8 @@ THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, - const char *uplo, const char *trans, const char *diag) +void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -135,7 +135,7 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, #endif } -THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -182,7 +182,7 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) +void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) { #ifdef USE_MAGMA int64_t n = THTensor_sizeLegacyNoScalars(a, 0); @@ -247,7 +247,7 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) +void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) { #ifdef USE_MAGMA THArgCheck(a_->dim() == 2, 3, "A should be 2 dimensional"); @@ -321,7 +321,7 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor 
*a, const char *jobu) +void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) { #ifdef USE_MAGMA THCTensor *ra_ = THCTensor_(new)(state); @@ -332,7 +332,7 @@ THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -396,7 +396,7 @@ THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) +void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) { THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -524,7 +524,7 @@ __global__ void THCTensor_(copyLowerSymmetric)(scalar_t *input, int n, int len) } } -THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -564,7 +564,7 @@ THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); @@ -600,7 +600,7 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) +void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -632,7 +632,7 @@ THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, TH #endif } -THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) +void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -669,7 +669,7 @@ THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_ #endif } -THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) +void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 06720b358df985..0f2d0067ff5546 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" #else -THC_API void -THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t 
value) +void THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -21,8 +20,7 @@ THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -40,20 +38,17 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { THCTensor_(add)(state, self_, src_, value * alpha); } -THC_API void -THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { THCTensor_(sub)(state, self_, src_, value * alpha); } -THC_API void -THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -71,8 +66,7 @@ THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); @@ -92,8 +86,7 @@ THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, value)); @@ -116,8 +109,7 @@ THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, -value)); @@ -140,8 +132,7 @@ THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -159,8 +150,7 @@ THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void 
THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -235,7 +225,7 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ THCudaCheck(cudaGetLastError()); } -THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (!THCTensor_(isSameSizeAs(state, self_, src_))) { @@ -259,8 +249,7 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_ return min != 0; } -THC_API void -THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitand only supported for integer type tensors"); @@ -281,8 +270,7 @@ THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitor only supported for integer type tensors"); @@ -303,8 +291,7 @@ THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t v #endif } -THC_API void -THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitxor only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index f5e1443b0eb5e7..4ff836fd53dda6 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -108,8 +108,7 @@ void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, scalar THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) +void THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); @@ -211,8 +210,7 @@ void THCTensor_(polygamma)(THCState* state, THCTensor* self_, int64_t n, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) +void THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b)); THArgCheck(THCTensor_(nElement)(state, a) == @@ -235,8 +233,7 @@ c10::intrusive_ptr retainTensorImpl(THC } } -THC_API void -THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +void THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF @@ -247,8 +244,7 @@ THCTensor_(cadd)(THCState *state, THCTensor *self_, 
THCTensor* src1, scalar_t va at::add_out(out, retainTensorImpl(src1), retainTensorImpl(src2), alpha); } -THC_API void -THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +void THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF @@ -259,15 +255,13 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t va at::sub_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2)), alpha); } -THC_API void -THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -374,15 +368,13 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, scalar_t value, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); at::div_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("clshift not supported for torch.CudaHalfTensor"); @@ -409,8 +401,7 @@ THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("crshift not supported for torch.CudaHalfTensor"); @@ -437,8 +428,7 @@ THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -456,8 +446,7 @@ THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -475,8 +464,7 @@ THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, 
THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -494,8 +482,7 @@ THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTen } } -THC_API void -THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -513,8 +500,7 @@ THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor * } } -THC_API void -THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -530,8 +516,7 @@ THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -547,8 +532,7 @@ THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -572,8 +556,7 @@ THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -596,8 +579,7 @@ THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitand is only supported for integer type tensors"); @@ -624,8 +606,7 @@ THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); @@ -652,8 +633,7 @@ THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor #endif } -THC_API void -THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || 
defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 91319745b6a1de..009ac39f346db9 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu" #else -THC_API void -THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -18,8 +17,7 @@ THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -34,8 +32,7 @@ THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) +void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); const accreal size = scalar_cast(THCTensor_(size)(state, src, dim)); @@ -54,8 +51,7 @@ THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void -THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t maxnorm) +void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t maxnorm) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); THCTensor *self_; @@ -88,8 +84,7 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t va THCTensor_(free)(state, data); } -THC_API void -THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -116,8 +111,7 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API void -THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -144,15 +138,13 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API accreal -THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) +accreal THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCNumerics::sqrt((THCTensor_(varall)(state, self, biased))); } -THC_API accreal -THCTensor_(varall)(THCState *state, THCTensor *self, int biased) 
+accreal THCTensor_(varall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal mean = THCTensor_(meanall)(state, self); @@ -175,8 +167,7 @@ THCTensor_(varall)(THCState *state, THCTensor *self, int biased) return val; } -THC_API void -THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) +void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -220,8 +211,7 @@ THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _val THCudaCheck(cudaGetLastError()); } -THC_API accreal -THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) +accreal THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); @@ -294,8 +284,7 @@ accreal THCTensor_(dist)(THCState *state, THCTensor *self, #endif -THC_API accreal -THCTensor_(sumall)(THCState *state, THCTensor *self) { +accreal THCTensor_(sumall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -310,8 +299,7 @@ THCTensor_(sumall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(prodall)(THCState *state, THCTensor *self) { +accreal THCTensor_(prodall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -326,15 +314,13 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(meanall)(THCState *state, THCTensor *self) +accreal THCTensor_(meanall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self); } -THC_API scalar_t -THCTensor_(minall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(minall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -348,8 +334,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(maxall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(maxall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -363,8 +348,7 @@ THCTensor_(maxall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(medianall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(medianall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); scalar_t val; @@ -391,13 +375,12 @@ THCTensor_(medianall)(THCState *state, THCTensor *self) { return val; } -THC_API void -THCTensor_(median)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *self, - int dimension, - int keepdim) { +void THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *self, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); int64_t t_size_dim, k; @@ -433,13 +416,12 @@ THCTensor_(median)(THCState *state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(max)(THCState 
*state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair @@ -452,13 +434,12 @@ THCTensor_(max)(THCState *state, MaxValuePair()); } -THC_API void -THCTensor_(min)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu index 274093ef105ae5..db72921bcfd1b0 100644 --- a/aten/src/THC/generic/THCTensorMode.cu +++ b/aten/src/THC/generic/THCTensorMode.cu @@ -2,13 +2,13 @@ #define THC_GENERIC_FILE "generic/THCTensorMode.cu" #else -THC_API void THCTensor_(calculateMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position) { +void THCTensor_(calculateMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position) { THAssert(THCTensor_(isContiguous)(state, input)); // Because the input is contiguous, we want to get a reference to the @@ -129,14 +129,14 @@ THC_API void THCTensor_(calculateMode)(THCState *state, } // this probably could be a loop, not a recursive algorithm -THC_API void THCTensor_(dimApplyMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position, - int curDim) { +void THCTensor_(dimApplyMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position, + int curDim) { int64_t ndim = THCTensor_(nDimensionLegacyAll)(state, input); // Because we have transposed the Tensor, the data for the dimension we are mode'ing along @@ -155,12 +155,12 @@ THC_API void THCTensor_(dimApplyMode)(THCState *state, #define MAX_GRID_SIZE 65535 #define MAX_BLOCK_SIZE 1024 -THC_API void THCTensor_(mode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - int dimension, - int keepdim) { +void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim) { THCTensor *transposed, *contiguous, *valuesTransposed; THLongStorage *position; THCudaLongStorage *sortBuffer; diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index 620c73e9af01d3..4cbc6dd1a29999 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -6,7 +6,7 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) +void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -21,7 +21,7 @@ THC_API void THCTensor_(uniform)(THCState* 
state, THCTensor *self_, double a, do THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -36,13 +36,13 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { +void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, stddev); THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) +void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, stddevs); THCTensor_(normal)(state, self, 0, 1); @@ -50,7 +50,7 @@ THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double THCTensor_(add)(state, self, self, ScalarConvert::to(mean)); } -THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) +void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, 1); @@ -58,7 +58,7 @@ THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -75,7 +75,7 @@ THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) +void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -91,7 +91,7 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) +void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -128,11 +128,11 @@ void THCTensor_(renormRows)(struct THCState* state, rows, cols); } -THC_API void THCTensor_(multinomial)(struct THCState *state, - THCudaLongTensor *self, - THCTensor *prob_dist, - int n_sample, - int with_replacement) +void THCTensor_(multinomial)(struct THCState *state, + THCudaLongTensor *self, + THCTensor *prob_dist, + int n_sample, + int with_replacement) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -299,7 +299,7 @@ THC_API void THCTensor_(multinomial)(struct THCState *state, } } -THC_API void 
THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THAssert(THCTensor_(isContiguous)(state, _probs)); @@ -354,7 +354,7 @@ THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_prob THCudaLongTensor_free(state, larger_short); } -THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -388,7 +388,7 @@ GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_do GENERATE_KERNEL1(generate_bernoulli, scalar_t, double p, float, curand_uniform, (ScalarConvert::to(x <= p))) #endif -THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) +void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -413,7 +413,7 @@ void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p } #define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE) \ -THC_API void THCTensor_(NAME)(THCState* state, \ +void THCTensor_(NAME)(THCState* state, \ THCTensor *self_, PROB_TYPE *probs_) \ { \ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \ @@ -458,7 +458,7 @@ GENERATE_KERNEL2(generate_random, scalar_t, int32_t base, uint32_t range, uint32 static_cast(static_cast(x % range + base))) #endif -THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) +void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -474,7 +474,7 @@ THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) +void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) { THArgCheck(min_val < max_val, 2, "max must be greater than min, but got: min = %lld, max = %lld", min_val, max_val); @@ -502,14 +502,14 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_ THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) +void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) { THCTensor_(clampedRandom)(state, self_, 0LL, max_val); }; #define HLF_MANT_DIG 11 -THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) +void THCTensor_(random)(THCState* state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu index 0dca009b80ab72..d60fa0c4da96fb 100644 --- a/aten/src/THC/generic/THCTensorSort.cu +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -5,10 +5,10 @@ // 
In alignment with default sort on a c++ map, this function // will permute key and value tensors identically, and // in such a way that the 'key' tensor is ordered numerically -THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, - THCTensor* key, - THCudaLongTensor* value, - int dim, bool dir) { +void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* key, + THCudaLongTensor* value, + int dim, bool dir) { THArgCheck(key->sizes().equals(value->sizes()), 2, "Key tensor must have same size as value tensor"); int dims = THCudaLongTensor_nDimensionLegacyNoScalars(state, value); @@ -274,11 +274,11 @@ void THCTensor_(sortViaThrust)(THCState* state, THCudaLongTensor_freeCopyTo(state, trContigIndices, indices); } -THC_API void THCTensor_(sort)(THCState* state, - THCTensor *sorted, - THCudaLongTensor *indices, - THCTensor *input, - int dim, int order) { +void THCTensor_(sort)(THCState* state, + THCTensor *sorted, + THCudaLongTensor *indices, + THCTensor *input, + int dim, int order) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input)); THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); int64_t dims = THCTensor_(nDimensionLegacyNoScalars)(state, sorted); diff --git a/aten/src/THC/generic/THCTensorSort.h b/aten/src/THC/generic/THCTensorSort.h index 009d825a223975..eba93fb75773c3 100644 --- a/aten/src/THC/generic/THCTensorSort.h +++ b/aten/src/THC/generic/THCTensorSort.h @@ -4,10 +4,17 @@ /* Performs an in-place sort of (keys, values). Only works for slice sizes <= 2048 at the moment (slice size == size of keys/values dim `dim`) */ +#ifdef __cplusplus +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* keys, + THCudaLongTensor* values, + int dim, bool dir); +#else THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, THCTensor* keys, THCudaLongTensor* values, int dim, int order); +#endif /* Performs an out-of-place sort of `input`, returning the per-slice indices in `indices` and the sorted values in `sorted` */ diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index 71ee008659b12a..a195dfbe5ca7a8 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -2,11 +2,11 @@ #define THC_GENERIC_FILE "generic/THCTensorTopK.cu" #else -THC_API void THCTensor_(topk)(THCState* state, - THCTensor *topK, - THCudaLongTensor *indices, - THCTensor *input_, - int64_t k, int dim, int dir, int sorted) { +void THCTensor_(topk)(THCState* state, + THCTensor *topK, + THCudaLongTensor *indices, + THCTensor *input_, + int64_t k, int dim, int dir, int sorted) { THAssert(topK != NULL && indices != NULL && input_ != NULL); THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); From a8b1755de6c967bd8e9d777b6a0dddf4e7a6eed1 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 08:15:19 -0700 Subject: [PATCH 183/237] Check device argument makes sense for legacy tensor constructors. (#11669) Summary: Fixes: https://github.com/pytorch/pytorch/issues/11427. 
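A minimal sketch (not part of the patch itself) of the behavior this change enforces, mirroring the new tests added to test_torch.py below: passing a device of the wrong type to a legacy constructor now raises instead of being silently ignored.

    import torch

    # CPU legacy constructor given a CUDA device: the new check on legacy
    # tensor construction rejects the mismatched device type with a RuntimeError.
    try:
        torch.FloatTensor(device='cuda')
    except RuntimeError as e:
        print(e)  # device type passed must match the constructor's device type
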
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11669 Differential Revision: D9817881 Pulled By: gchanan fbshipit-source-id: 77dc5b0e6bc9884d2616210b96c07e4734058bb6 --- aten/src/ATen/DeviceGuard.h | 7 ++++ test/test_sparse.py | 29 +++++++++++++++ test/test_torch.py | 32 +++++++++++++++++ torch/csrc/utils/tensor_new.cpp | 63 ++++++++++++++++++++++++++------- 4 files changed, 118 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index b51d80d22d350f..4da336aef5b7cd 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,12 @@ struct DeviceGuard { } } + explicit DeviceGuard(optional device_opt) { + if (device_opt.has_value() && device_opt.value().is_cuda()) { + set_index(device_opt.value().index()); + } + } + /// Calls `set_index` with the given index. explicit DeviceGuard(int32_t index) { set_index(index); diff --git a/test/test_sparse.py b/test/test_sparse.py index 0b3ff39ab48301..82de45d6d17384 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -1176,6 +1176,35 @@ def test_factory_copy(self): self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + @cpu_only # just run once, we test both cpu and cuda + def test_constructor_device_legacy(self): + i = torch.tensor([[0, 1, 1], [2, 0, 2]]) + v = torch.tensor([3., 4., 5.]) + size = torch.Size([2, 3]) + + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(i, v, device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(i, v, size, device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(torch.Size([2, 3, 4]), device='cuda')) + + x = torch.sparse_coo_tensor(i, v, size, device='cpu') + self.assertRaises(RuntimeError, lambda: x.new(device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, size, device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cuda')) + + if torch.cuda.is_available(): + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(i, v, device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(i, v, size, device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(torch.Size([2, 3, 4]), device='cpu')) + + x = torch.sparse_coo_tensor(i, v, size, device='cuda') + self.assertRaises(RuntimeError, lambda: x.new(device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, size, device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cpu')) + @cpu_only # not really, but we only really want to run this once def test_dtypes(self): all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.float16] diff --git a/test/test_torch.py b/test/test_torch.py index b90e0631d4f470..cf938d4d0f50d7 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2329,6 +2329,38 @@ def test_constructor_dtypes(self): torch.set_default_tensor_type(default_type) + def test_constructor_device_legacy(self): + 
self.assertRaises(RuntimeError, lambda: torch.FloatTensor(device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.FloatTensor(torch.Size([2, 3, 4]), device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.FloatTensor((2.0, 3.0), device='cuda')) + + self.assertRaises(RuntimeError, lambda: torch.Tensor(device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.Tensor(torch.Size([2, 3, 4]), device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.Tensor((2.0, 3.0), device='cuda')) + + x = torch.randn((3,), device='cpu') + self.assertRaises(RuntimeError, lambda: x.new(device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new((2.0, 3.0), device='cuda')) + + if torch.cuda.is_available(): + self.assertRaises(RuntimeError, lambda: torch.cuda.FloatTensor(device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.FloatTensor(torch.Size([2, 3, 4]), device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.FloatTensor((2.0, 3.0), device='cpu')) + + default_type = torch.Tensor().type() + torch.set_default_tensor_type(torch.cuda.FloatTensor) + self.assertRaises(RuntimeError, lambda: torch.Tensor(device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.Tensor(torch.Size([2, 3, 4]), device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.Tensor((2.0, 3.0), device='cpu')) + torch.set_default_tensor_type(torch.cuda.FloatTensor) + torch.set_default_tensor_type(default_type) + + x = torch.randn((3,), device='cuda') + self.assertRaises(RuntimeError, lambda: x.new(device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new((2.0, 3.0), device='cpu')) + def test_type(self): x = torch.randn(3, 3).double() self.assertEqual(x.type('torch.FloatTensor').dtype, torch.float32) diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 3c3ab4384dc85f..ebed9c07fad7a7 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -250,6 +250,15 @@ Tensor legacy_new_from_sequence(const Type & type, at::optional device, return legacy_new_from_data(type, device, data); } +void check_legacy_ctor_device(const Type& type, at::optional device) { + if (device.has_value()) { + AT_CHECK(type.device_type() == device.value().type(), + "legacy constructor for device type: ", type.device_type(), + " was passed device type: ", device.value().type(), + ", but device type must be: ", type.device_type()); + } +} + Tensor legacy_sparse_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new(*, Device? 
device=None)", @@ -261,22 +270,30 @@ Tensor legacy_sparse_tensor_ctor(const Type& type, PyObject* args, PyObject* kwa ParsedArgs<4> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); return at::empty({0}, type.options(r.device(0).index())); } else if (r.idx == 1) { auto cdata = reinterpret_cast(r.toInt64(0)); return type.unsafeTensorFromTH(cdata, true); } else if (r.idx == 2) { - at::DeviceGuard device_guard(r.device(2)); + auto deviceOptional = r.deviceOptional(2); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); } else if (r.idx == 3) { - at::DeviceGuard device_guard(r.device(3)); + auto deviceOptional = r.deviceOptional(3); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } @@ -294,27 +311,35 @@ Tensor legacy_sparse_tensor_new(const Type& type, PyObject* args, PyObject* kwar ParsedArgs<5> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - at::DeviceGuard device_guard(r.device(0)); + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.tensor(); } else if (r.idx == 1) { - auto cdata = reinterpret_cast(r.device(0).index()); + auto cdata = reinterpret_cast(r.toInt64(0)); return type.unsafeTensorFromTH(cdata, true); } else if (r.idx == 2) { // Note: this signature doesn't have a dtype, even though it has a device; it probably shouldn't // have a device (we should infer it). - at::DeviceGuard device_guard(r.device(2)); + auto deviceOptional = r.deviceOptional(2); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); } else if (r.idx == 3) { // Note: this signature doesn't have a dtype, even though it has a device; it probably shouldn't // have a device (we should infer it). 
- at::DeviceGuard device_guard(r.device(3)); + auto deviceOptional = r.deviceOptional(3); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } @@ -346,7 +371,9 @@ Tensor legacy_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { ParsedArgs<2> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - at::DeviceGuard device_guard(r.device(0)); + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.tensor(); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); @@ -357,14 +384,18 @@ Tensor legacy_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { return new_with_tensor(type, r.tensor(0)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } else if (r.idx == 5) { - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } throw std::runtime_error("new(): invalid arguments"); } @@ -386,7 +417,9 @@ Tensor legacy_tensor_new(const Type& type, PyObject* args, PyObject* kwargs) { ParsedArgs<3> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - at::DeviceGuard device_guard(r.device(0)); + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.tensor(); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); @@ -397,13 +430,17 @@ Tensor legacy_tensor_new(const Type& type, PyObject* args, PyObject* kwargs) { return new_with_tensor(type, r.tensor(0)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } 
else if (r.idx == 5) { + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); } throw std::runtime_error("new(): invalid arguments"); From 5bfd8f583c7edb9804a9988de53a93970f2ef055 Mon Sep 17 00:00:00 2001 From: Jesse Hellemn Date: Mon, 17 Sep 2018 08:42:03 -0700 Subject: [PATCH 184/237] Moving copy of Caffe2 protos back to build_pytorch_libs.sh (#11726) Summary: This way it shows up in all current and future setup.py commands, as otherwise we'd have to override every once to have them all call copy_protos. This is needed because the nightly packages still do not include caffe2_pb2, because setup.py bdist does not go through setup.py install or setup.py develop Pull Request resolved: https://github.com/pytorch/pytorch/pull/11726 Reviewed By: orionr Differential Revision: D9844075 Pulled By: pjh5 fbshipit-source-id: 57b469e48010aacd0c08c214ba8a7e5d757feefa --- setup.py | 7 ++++++- tools/build_pytorch_libs.sh | 32 +++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 533b6dee4fcc53..ccfced5f35bc83 100644 --- a/setup.py +++ b/setup.py @@ -413,6 +413,7 @@ def build_libs(libs): # protobuf python compiler) from the build folder to the root folder # cp root/build/caffe2/proto/proto.py root/caffe2/proto/proto.py def copy_protos(): + print('setup.py::copy_protos()') for src in glob.glob( os.path.join(caffe2_build_dir, 'caffe2', 'proto', '*.py')): dst = os.path.join( @@ -423,6 +424,7 @@ def copy_protos(): # Build all dependent libraries class build_deps(PytorchCommand): def run(self): + print('setup.py::build_deps::run()') # Check if you remembered to check out submodules def check_file(f): if not os.path.exists(f): @@ -507,6 +509,7 @@ def run(self): class build_module(PytorchCommand): def run(self): + print('setup.py::build_module::run()') self.run_command('build_py') self.run_command('build_ext') @@ -514,6 +517,7 @@ def run(self): class build_py(setuptools.command.build_py.build_py): def run(self): + print('setup.py::build_py::run()') self.run_command('create_version_file') setuptools.command.build_py.build_py.run(self) @@ -521,6 +525,7 @@ def run(self): class develop(setuptools.command.develop.develop): def run(self): + print('setup.py::develop::run()') self.run_command('create_version_file') setuptools.command.develop.develop.run(self) self.create_compile_commands() @@ -727,9 +732,9 @@ def run(self): class install(setuptools.command.install.install): def run(self): + print('setup.py::run()') if not self.skip_build: self.run_command('build_deps') - copy_protos() setuptools.command.install.install.run(self) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 79dfc948ea07b2..37d816775f75b2 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -323,15 +323,26 @@ function build_caffe2() { ${CMAKE_INSTALL} -j"$MAX_JOBS" + # Install Python proto files + if [[ "$BUILD_PYTHON" == 'ON' ]]; then + echo "Copying Caffe2 proto files from $(pwd)/caffe2/proto to $(cd .. 
&& pwd)/caffe2/proto" + echo "All the files in caffe2/proto are $(find caffe2/proto)" + for proto_file in $(pwd)/caffe2/proto/*.py; do + cp $proto_file "$(pwd)/../caffe2/proto/" + done + fi + + # Fix rpaths of shared libraries if [[ $(uname) == 'Darwin' ]]; then - # root/torch/lib/tmp_install/lib - pushd "$INSTALL_DIR/lib" - for lib in *.dylib; do - echo "Updating install_name for $lib" - install_name_tool -id @rpath/$lib $lib - done - popd + # root/torch/lib/tmp_install/lib + echo "Updating all install_names in $INSTALL_DIR/lib" + pushd "$INSTALL_DIR/lib" + for lib in *.dylib; do + echo "Updating install_name for $(pwd)/$lib" + install_name_tool -id @rpath/$lib $lib + done + popd fi } @@ -373,14 +384,21 @@ pushd $TORCH_LIB_DIR # If all the builds succeed we copy the libraries, headers, # binaries to torch/lib +echo "tools/build_pytorch_libs.sh succeeded at $(date)" +echo "removing $INSTALL_DIR/lib/cmake and $INSTALL_DIR/lib/python" rm -rf "$INSTALL_DIR/lib/cmake" rm -rf "$INSTALL_DIR/lib/python" + +echo "Copying $INSTALL_DIR/lib to $(pwd)" $SYNC_COMMAND -r "$INSTALL_DIR/lib"/* . if [ -d "$INSTALL_DIR/lib64/" ]; then $SYNC_COMMAND -r "$INSTALL_DIR/lib64"/* . fi +echo "Copying $(cd ../.. && pwd)/aten/src/generic/THNN.h to $(pwd)" $SYNC_COMMAND ../../aten/src/THNN/generic/THNN.h . $SYNC_COMMAND ../../aten/src/THCUNN/generic/THCUNN.h . + +echo "Copying $INSTALL_DIR/include to $(pwd)" $SYNC_COMMAND -r "$INSTALL_DIR/include" . if [ -d "$INSTALL_DIR/bin/" ]; then $SYNC_COMMAND -r "$INSTALL_DIR/bin/"/* . From 0d345cfa181970d255ac9e14825b7ebf6cf2ce13 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 08:42:49 -0700 Subject: [PATCH 185/237] Remove Type method defaults in ATen. (#11675) Summary: This will allow us to break the dependency cycle between Tensor and Type, because currently Type has defaulted Tensor (reference) arguments. 
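An illustrative sketch (not part of this patch) of why the defaulted Tensor reference arguments create that dependency: a default such as `= {}` on a `const Tensor &` parameter has to construct a Tensor at the point of declaration, so Type.h needs the complete Tensor definition; with the default removed, a forward declaration suffices. The names below mirror Type.h, but the snippet is a standalone toy, not the real header.

    #include <cstdint>

    struct Tensor;  // forward declaration is all Type needs once defaults are gone

    struct Type {
      // With a default, e.g.
      //   virtual Tensor bincount(const Tensor& self,
      //                           const Tensor& weights = {},   // must build a Tensor here
      //                           int64_t minlength = 0) const = 0;
      // the `{}` is checked at the declaration, so Tensor must be a complete type
      // wherever Type.h is parsed -- that is the Tensor <-> Type cycle.
      //
      // Without the default, the declaration compiles against the forward
      // declaration above:
      virtual Tensor bincount(const Tensor& self,
                              const Tensor& weights,
                              int64_t minlength) const = 0;

      virtual ~Type() = default;
    };

The defaults themselves are expected to remain on the user-facing declarations; only the virtual Type interface drops them, which is what the function_wrapper.py change below (removing `type_method_formals_with_defaults`) suggests.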
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11675 Reviewed By: ezyang Differential Revision: D9819720 Pulled By: gchanan fbshipit-source-id: a9577ac34a358120075129ab0654e7862d1dace6 --- aten/src/ATen/core/Type.h | 244 +++++++++++++++--------------- aten/src/ATen/function_wrapper.py | 15 +- 2 files changed, 128 insertions(+), 131 deletions(-) diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index dabd19c7837223..dea7844e93debb 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -167,7 +167,7 @@ struct AT_API Type { virtual int64_t storage_offset(const Tensor & self) const = 0; virtual Tensor & resize_(Tensor & self, IntList size) const = 0; virtual Tensor & set_(Tensor & self, Storage source) const = 0; - virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride={}) const = 0; + virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride) const = 0; virtual Tensor & set_(Tensor & self, const Tensor & source) const = 0; virtual Tensor & set_(Tensor & self) const = 0; virtual bool is_contiguous(const Tensor & self) const = 0; @@ -185,7 +185,7 @@ struct AT_API Type { virtual Tensor view(const Tensor & self, IntList size) const = 0; virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const = 0; virtual Tensor take(const Tensor & self, const Tensor & index) const = 0; - virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate=false) const = 0; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) const = 0; virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const = 0; @@ -269,8 +269,8 @@ struct AT_API Type { virtual Tensor max(const Tensor & self, const Tensor & other) const = 0; virtual Tensor max(const Tensor & self) const = 0; virtual Tensor median(const Tensor & self) const = 0; - virtual std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false) const = 0; - virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const = 0; + virtual std::tuple sort(const Tensor & self, int64_t dim, bool descending) const = 0; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) const = 0; virtual Tensor all(const Tensor & self) const = 0; virtual Tensor any(const Tensor & self) const = 0; virtual Tensor lgamma(const Tensor & self) const = 0; @@ -285,8 +285,8 @@ struct AT_API Type { virtual Tensor frac(const Tensor & self) const = 0; virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; - virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; - virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; + virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; virtual Tensor reciprocal(const Tensor & self) const = 0; virtual Tensor & reciprocal_(Tensor & self) const = 
0; virtual Tensor neg(const Tensor & self) const = 0; @@ -305,7 +305,7 @@ struct AT_API Type { virtual Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; virtual Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; - virtual Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const = 0; + virtual Tensor histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) const = 0; virtual Tensor sign(const Tensor & self) const = 0; virtual Tensor & sign_(Tensor & self) const = 0; virtual Tensor trace(const Tensor & self) const = 0; @@ -321,71 +321,71 @@ struct AT_API Type { virtual Tensor & remainder_(Tensor & self, Scalar other) const = 0; virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const = 0; virtual Tensor & remainder_(Tensor & self, const Tensor & other) const = 0; - virtual Tensor tril(const Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor & tril_(Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor triu(const Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor & triu_(Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1) const = 0; - virtual Tensor diag(const Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor tril(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & tril_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor triu(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & triu_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) const = 0; + virtual Tensor diag(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, 
const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; virtual std::tuple gels(const Tensor & self, const Tensor & A) const = 0; - virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const = 0; - virtual std::tuple symeig(const Tensor & self, bool eigenvectors=false, bool upper=true) const = 0; - virtual std::tuple eig(const Tensor & self, bool eigenvectors=false) const = 0; - virtual std::tuple svd(const Tensor & self, bool some=true) const = 0; - virtual Tensor potrf(const Tensor & self, bool upper=true) const = 0; - virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true) const = 0; - virtual Tensor potri(const Tensor & self, bool upper=true) const = 0; - virtual std::tuple pstrf(const Tensor & self, bool upper=true, Scalar tol=-1) const = 0; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) const = 0; + virtual std::tuple symeig(const Tensor & self, bool eigenvectors, bool upper) const = 0; + virtual std::tuple eig(const Tensor & self, bool eigenvectors) const = 0; + virtual std::tuple svd(const Tensor & self, bool some) const = 0; + virtual Tensor potrf(const Tensor & self, bool upper) const = 0; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper) const = 0; + virtual Tensor potri(const Tensor & self, bool upper) const = 0; + virtual std::tuple pstrf(const Tensor & self, bool upper, Scalar tol) const = 0; virtual std::tuple qr(const Tensor & self) const = 0; virtual std::tuple geqrf(const Tensor & self) const = 0; virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const = 0; - virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const = 0; - virtual std::tuple btrifact(const Tensor & self, bool pivot=true) const = 0; - virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot=true) const = 0; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) const = 0; + virtual std::tuple btrifact(const Tensor & self, bool pivot) const = 0; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot) const = 0; virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const = 0; - virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator=nullptr) const = 0; - virtual Tensor & random_(Tensor & self, int64_t to, Generator * 
generator=nullptr) const = 0; - virtual Tensor & random_(Tensor & self, Generator * generator=nullptr) const = 0; - virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const = 0; - virtual Tensor & uniform_(Tensor & self, double from=0, double to=1, Generator * generator=nullptr) const = 0; - virtual Tensor & normal_(Tensor & self, double mean=0, double std=1, Generator * generator=nullptr) const = 0; - virtual Tensor & cauchy_(Tensor & self, double median=0, double sigma=1, Generator * generator=nullptr) const = 0; - virtual Tensor & log_normal_(Tensor & self, double mean=1, double std=2, Generator * generator=nullptr) const = 0; - virtual Tensor & exponential_(Tensor & self, double lambd=1, Generator * generator=nullptr) const = 0; - virtual Tensor & geometric_(Tensor & self, double p, Generator * generator=nullptr) const = 0; - virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride={}) const = 0; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, Generator * generator) const = 0; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) const = 0; + virtual Tensor & uniform_(Tensor & self, double from, double to, Generator * generator) const = 0; + virtual Tensor & normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & cauchy_(Tensor & self, double median, double sigma, Generator * generator) const = 0; + virtual Tensor & log_normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & exponential_(Tensor & self, double lambd, Generator * generator) const = 0; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride) const = 0; virtual Tensor tensor(IntList size, IntList stride) const = 0; virtual Tensor abs(const Tensor & self) const = 0; virtual Tensor & abs_(Tensor & self) const = 0; virtual Tensor acos(const Tensor & self) const = 0; virtual Tensor & acos_(Tensor & self) const = 0; - virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - virtual bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; - virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - AT_DEPRECATED(virtual 
Tensor arange(Scalar start, Scalar end, Scalar step=1) const = 0); + virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; + virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim) const = 0; + AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step) const = 0); AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0); - virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor argmax(const Tensor & self) const = 0; - virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor argmin(const Tensor & self) const = 0; virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride) const = 0; virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride) const = 0; @@ -395,19 +395,19 @@ struct AT_API Type { virtual Tensor & asin_(Tensor & self) const = 0; virtual Tensor atan(const Tensor & self) const = 0; virtual Tensor & atan_(Tensor & self) const = 0; - virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; - virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0; virtual Tensor bernoulli(const Tensor & self) const = 0; - virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; - virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator) const = 0; virtual Tensor & bernoulli_(Tensor & self) const = 0; - 
virtual Tensor bincount(const Tensor & self, const Tensor & weights={}, int64_t minlength=0) const = 0; + virtual Tensor bincount(const Tensor & self, const Tensor & weights, int64_t minlength) const = 0; virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0; virtual Tensor ceil(const Tensor & self) const = 0; virtual Tensor & ceil_(Tensor & self) const = 0; - virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0) const = 0; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim) const = 0; virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const = 0; virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const = 0; virtual Tensor clamp_max(const Tensor & self, Scalar max) const = 0; @@ -423,8 +423,8 @@ struct AT_API Type { virtual Tensor cumprod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; virtual Tensor cumprod(const Tensor & self, int64_t dim) const = 0; virtual Tensor det(const Tensor & self) const = 0; - virtual Tensor diagflat(const Tensor & self, int64_t offset=0) const = 0; - virtual Tensor diagonal(const Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const = 0; + virtual Tensor diagflat(const Tensor & self, int64_t offset) const = 0; + virtual Tensor diagonal(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2) const = 0; virtual Tensor div(const Tensor & self, const Tensor & other) const = 0; virtual Tensor & div_(Tensor & self, const Tensor & other) const = 0; virtual Tensor div(const Tensor & self, Scalar other) const = 0; @@ -439,10 +439,10 @@ struct AT_API Type { virtual Tensor & exp_(Tensor & self) const = 0; virtual Tensor expm1(const Tensor & self) const = 0; virtual Tensor & expm1_(Tensor & self) const = 0; - virtual Tensor expand(const Tensor & self, IntList size, bool implicit=false) const = 0; + virtual Tensor expand(const Tensor & self, IntList size, bool implicit) const = 0; virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0; - AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m=-1) const = 0); - virtual Tensor flatten(const Tensor & self, int64_t start_dim=0, int64_t end_dim=-1) const = 0; + AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m) const = 0); + virtual Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim) const = 0; virtual Tensor & fill_(Tensor & self, Scalar value) const = 0; virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0; virtual Tensor floor(const Tensor & self) const = 0; @@ -450,16 +450,16 @@ struct AT_API Type { AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0); virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0; virtual std::tuple gesv(const Tensor & self, const Tensor & A) const = 0; - virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; - virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; - virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true) const = 0; - virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const = 0; + virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool 
onesided) const = 0; + virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const = 0; virtual Tensor index(const Tensor & self, TensorList indices) const = 0; virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values) const = 0; virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const = 0; virtual Tensor inverse(const Tensor & self) const = 0; - virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; + virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; virtual bool is_cuda(const Tensor & self) const = 0; virtual bool is_distributed(const Tensor & self) const = 0; virtual bool is_floating_point(const Tensor & self) const = 0; @@ -468,8 +468,8 @@ struct AT_API Type { virtual bool is_same_size(const Tensor & self, const Tensor & other) const = 0; virtual bool is_signed(const Tensor & self) const = 0; virtual bool is_sparse(const Tensor & self) const = 0; - virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const = 0; - AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps=100) const = 0); + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) const = 0; + AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps) const = 0); virtual Tensor log(const Tensor & self) const = 0; virtual Tensor & log_(Tensor & self) const = 0; virtual Tensor log10(const Tensor & self) const = 0; @@ -479,23 +479,23 @@ struct AT_API Type { virtual Tensor log2(const Tensor & self) const = 0; virtual Tensor & log2_(Tensor & self) const = 0; virtual Tensor logdet(const Tensor & self) const = 0; - AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps=100) const = 0); + AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps) const = 0); virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0; - virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0; virtual Tensor matrix_power(const Tensor & self, int64_t n) const = 0; - virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mean(const Tensor & self, ScalarType dtype) const = 0; virtual Tensor mean(const Tensor & self) const = 0; virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; - virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mean(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; - virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - virtual std::tuple min(const Tensor & self, 
int64_t dim, bool keepdim=false) const = 0; - virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mm(const Tensor & self, const Tensor & mat2) const = 0; - virtual std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false) const = 0; + virtual std::tuple mode(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mul(const Tensor & self, const Tensor & other) const = 0; virtual Tensor & mul_(Tensor & self, const Tensor & other) const = 0; virtual Tensor mul(const Tensor & self, Scalar other) const = 0; @@ -507,13 +507,13 @@ struct AT_API Type { AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0); virtual Tensor permute(const Tensor & self, IntList dims) const = 0; virtual Tensor pin_memory(const Tensor & self) const = 0; - virtual Tensor pinverse(const Tensor & self, double rcond=1e-15) const = 0; - AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step=1) const = 0); + virtual Tensor pinverse(const Tensor & self, double rcond) const = 0; + AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step) const = 0); virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0; virtual Tensor reshape(const Tensor & self, IntList shape) const = 0; virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0; @@ -521,7 +521,7 @@ struct AT_API Type { virtual Tensor & round_(Tensor & self) const = 0; virtual Tensor relu(const Tensor & self) const = 0; virtual Tensor & relu_(Tensor & self) const = 0; - virtual Tensor hardshrink(const Tensor & self, Scalar lambd=0.5) const = 0; + virtual Tensor hardshrink(const Tensor & self, Scalar lambd) const = 0; virtual Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, Scalar lambd) const = 0; virtual Tensor rsqrt(const Tensor & self) const = 0; virtual Tensor & rsqrt_(Tensor & self) const = 0; @@ -535,32 +535,32 @@ struct AT_API Type { virtual Tensor detach(const Tensor & self) const = 0; virtual Tensor & detach_(Tensor & self) const = 0; virtual int64_t size(const Tensor & self, int64_t dim) const = 0; - virtual Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const = 0; + virtual Tensor slice(const 
Tensor & self, int64_t dim, int64_t start, int64_t end, int64_t step) const = 0; virtual std::tuple slogdet(const Tensor & self) const = 0; virtual Tensor smm(const Tensor & self, const Tensor & mat2) const = 0; virtual Tensor softmax(const Tensor & self, int64_t dim) const = 0; - virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0) const = 0; - virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim=0) const = 0; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim) const = 0; + virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim) const = 0; virtual Tensor squeeze(const Tensor & self) const = 0; virtual Tensor squeeze(const Tensor & self, int64_t dim) const = 0; virtual Tensor & squeeze_(Tensor & self) const = 0; virtual Tensor & squeeze_(Tensor & self, int64_t dim) const = 0; - virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const = 0; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const = 0; virtual int64_t stride(const Tensor & self, int64_t dim) const = 0; virtual Tensor sum(const Tensor & self, ScalarType dtype) const = 0; virtual Tensor sum(const Tensor & self) const = 0; virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim, ScalarType dtype) const = 0; - virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim=false) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim) const = 0; virtual Tensor sum(const Tensor & self, IntList dim, ScalarType dtype) const = 0; virtual Tensor sqrt(const Tensor & self) const = 0; virtual Tensor & sqrt_(Tensor & self) const = 0; - virtual Tensor std(const Tensor & self, bool unbiased=true) const = 0; - virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor std(const Tensor & self, bool unbiased) const = 0; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; virtual Tensor prod(const Tensor & self, ScalarType dtype) const = 0; virtual Tensor prod(const Tensor & self) const = 0; virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; - virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor prod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; virtual Tensor t(const Tensor & self) const = 0; virtual Tensor & t_(Tensor & self) const = 0; @@ -571,29 +571,29 @@ struct AT_API Type { virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const = 0; virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const = 0; virtual Tensor flip(const Tensor & self, IntList dims) const = 0; - virtual Tensor rot90(const Tensor & self, int64_t k=1, IntList dims={0,1}) const = 0; + virtual Tensor rot90(const Tensor & self, int64_t k, IntList dims) const = 0; virtual Tensor trunc(const Tensor & self) const = 
0; virtual Tensor & trunc_(Tensor & self) const = 0; virtual Tensor type_as(const Tensor & self, const Tensor & other) const = 0; virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const = 0; virtual Tensor & unsqueeze_(Tensor & self, int64_t dim) const = 0; - virtual Tensor var(const Tensor & self, bool unbiased=true) const = 0; - virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor var(const Tensor & self, bool unbiased) const = 0; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0; virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0; AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0); - virtual Tensor norm(const Tensor & self, Scalar p=2) const = 0; - virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) const = 0; virtual Tensor clone(const Tensor & self) const = 0; virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const = 0; virtual Tensor pow(const Tensor & self, Scalar exponent) const = 0; virtual Tensor & zero_(Tensor & self) const = 0; - virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; virtual Tensor tensor() const = 0; virtual Tensor tensor(IntList size) const = 0; virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; @@ -615,12 +615,12 @@ struct AT_API Type { virtual Tensor _indices(const Tensor & self) const = 0; virtual Tensor _values(const Tensor & self) const = 0; virtual int64_t numel(const Tensor & self) const = 0; - virtual std::vector unbind(const Tensor & self, int64_t dim=0) const = 0; + virtual std::vector unbind(const Tensor & self, int64_t dim) const = 0; virtual int64_t get_device(const Tensor & self) const = 0; - virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking=false) const = 0; - virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking=false) const = 0; - virtual Tensor to(const Tensor & self, Device device, bool non_blocking=false) const = 0; - virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking=false) const = 0; + 
virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, Device device, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking) const = 0; virtual Scalar _local_scalar(const Tensor & self) const = 0; protected: TensorTypeId type_id_; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index e88dd8867a0e96..4ba677a473ef7a 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -39,7 +39,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # declaration under Type.h (right now, we call this template # BROADCAST but it also handles default arguments) TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) # 2. broadcasting functions are implemented in Type.cpp TYPE_METHOD_DEFINITION_BROADCAST = CodeTemplate("""\ @@ -60,18 +60,18 @@ def TypedDict(name, attrs, total=True): # type: ignore # for 'native' declarations (so the native dispatch is hardcoded into # the template here.) PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ -virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const = 0; """) DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ AT_DEPRECATED(virtual ${return_type} \ -${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0); +${method_prefix_derived}${api_name}(${type_method_formals}) const = 0); """) PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${api_name}(${type_method_formals}) const = 0; """) TYPE_METHOD_DECLARATION_ABSTRACT = CodeTemplate("""\ -${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_ABSTRACT = CodeTemplate("""\ ${return_type} TypeDefault::${method_prefix_derived}${api_name}(${type_method_formals}) const { @@ -79,7 +79,7 @@ def TypedDict(name, attrs, total=True): # type: ignore } """) TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ ${return_type} TypeDefault::${api_name}(${type_method_formals}) const { @@ -515,7 +515,6 @@ def __getitem__(self, x): 'type_definition_body': List[str], 'type_method_actuals': List[str], 'type_method_definition_dispatch': str, - 'type_method_formals_with_defaults': List[str], 'type_method_formals': List[str], 'variants': str, 'when_spares_dispatch': str, @@ -817,7 +816,6 @@ def process_option(option, output_options): # There are no cases where these differ, but they do in native_functions option['type_method_formals'] = option['formals'] - option['type_method_formals_with_defaults'] = option['formals_with_defaults'] option['type_method_actuals'] = option['actuals'] option['const_mark'] = '' if 
option['inplace'] else ' const' @@ -1051,7 +1049,6 @@ def find_formal(formal_name, formals): dispatch_type['is_type_dispatched'] = True option['type_method_formals'] = [format_formal(f) for f in formals if f != dispatch_type] - option['type_method_formals_with_defaults'] = [formal_with_default(f) for f in formals if f != dispatch_type] option['type_method_actuals'] = [f['name'] for f in formals if f != dispatch_type] option['native_actuals'] = [f['name'] if f != dispatch_type else '*this' for f in formals] From 35518b3dc79e081fca3da29cc0c6ef9451458b38 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 08:49:27 -0700 Subject: [PATCH 186/237] Back out "Back out "Refactor Tensor/TensorImpl constructors."" E2: Confirm problem with old patch (#11744) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11744 Original commit changeset: 093e4c47d557 Restores D9813742 Reviewed By: dzhulgakov Differential Revision: D9847835 fbshipit-source-id: f3f467891e01c923dd9d3352d892cf59e10402f1 --- caffe2/core/tensor.h | 76 ++++++++++++++++++++++++++++----------- caffe2/core/tensor_impl.h | 71 +----------------------------------- 2 files changed, 56 insertions(+), 91 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index c17667c006f54c..a167658060e7fc 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -50,45 +50,75 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(DeviceType type) - : impl_(c10::make_intrusive(type)) {} - + explicit Tensor(Storage storage) + : impl_(c10::make_intrusive(std::move(storage))) {} + + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? + Resize(dims); + } explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + Resize(dims); + } + /** + * context_for_copy is required to have the same DeviceType as src + */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - context_for_copy, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src, context_for_copy); + } + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ Tensor(const Tensor& src, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src); + } + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : impl_(c10::make_intrusive( - dims, - values, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); + context->CopyItemsFromCPU( + storage().dtype(), size(), values.data(), mutable_data()); + } + /** + * @brief Creates a scalar tensor, and fills its content with the given value. 
+ * The type of tensor will be decided by the context parameter + */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : impl_(c10::make_intrusive( - value, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage().dtype(), size(), &value, mutable_data()); + } Tensor Clone() const { Tensor x(GetDeviceType()); @@ -276,6 +306,10 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } + + const Storage& storage() { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 4cdf612a1fb091..2e01c55318d88f 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -88,77 +88,8 @@ inline int canonical_axis_index_(int axis_index, int ndims) { class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? - explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - at::BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const std::vector& dims, - const std::vector& values, - at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. 
- * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(std::vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; From 2baba7f835ec01e5b0776ef0817064d5d6b32116 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 08:49:29 -0700 Subject: [PATCH 187/237] Add storage_offset to Caffe2 (#11701) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11701 There's one extra multiply from TypeMeta::itemsize() which needs to be characterized. For all existing Caffe2 uses, storage_offset is zero. Reviewed By: li-roy Differential Revision: D9831230 fbshipit-source-id: 353678edf76d2ccc297a73475a34f6ab2a20d1e1 --- caffe2/core/tensor_impl.h | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 2e01c55318d88f..cec550132ab9c6 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -89,7 +89,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) {} TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; @@ -312,11 +312,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (reserved_) { // If tensor is reserved then don't claim its memeory unless capacity() // is smaller than new size - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); } else { - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - numel_ * storage_.itemsize() > + storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > FLAGS_caffe2_max_keep_on_shrink_memory; } @@ -371,6 +371,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { inline void FreeMemory() { // We'll detach from the old Storage and create a new one storage_ = at::Storage(storage_.device_type(), storage_.dtype()); + storage_offset_ = 0; } /** @@ -420,6 +421,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * this still keeps the original semantics */ storage_ = src.storage(); + storage_offset_ = src.storage_offset(); } /** @@ -472,10 +474,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { "To share data with a raw pointer, you need to set shape first."); storage_.UniqueStorageShareExternalPointer( std::move(data_ptr), data_type, capacity); + storage_offset_ = 0; } else { int64_t numel = capacity / data_type.itemsize(); // Create a new Storage storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + storage_offset_ = 0; } } @@ -485,7 +489,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline const void* raw_data() const { 
CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return storage_.data(); + return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); } /** @@ -508,7 +512,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { ", while tensor contains ", storage_.dtype().name(), ". "); - return static_cast(storage_.data()); + return static_cast(storage_.data()) + storage_offset_; } /** @@ -525,13 +529,14 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { inline void* raw_mutable_data(const TypeMeta& meta) { // For 0-size tensors it's fine to return any pointer (including nullptr) if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { - return storage_.data(); + return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); } else { CAFFE_ENFORCE_WITH_CALLER( numel_ >= 0, "Tensor is not initialized. You probably need to call Resize() " "before calling mutable_data()"); bool had_special_dtor = storage_.dtype().dtor() != nullptr; + storage_offset_ = 0; if (storage_.unique()) { storage_.set_dtype(meta); } else { @@ -546,6 +551,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (numel_ == 0 || (meta.ctor() == nullptr && !had_special_dtor && storage_.numel() >= numel_)) { + AT_ASSERT(storage_offset_ == 0); // because we just reallocated return storage_.data(); } const at::Allocator* allocator = storage_.allocator(); @@ -584,6 +590,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { at::Device(storage_.device_type()))); } storage_.set_numel(numel_); + AT_ASSERT(storage_offset_ == 0); // because we just reallocated return storage_.data(); } } @@ -614,7 +621,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template inline T* mutable_data() { if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()); + return static_cast(storage_.data()) + storage_offset_; } // Check it here statically - otherwise TypeMeta would throw the runtime // error in attempt to invoke TypeMeta::ctor() @@ -652,6 +659,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { ; } + // NB: This capacity may also include available space + // in the storage BEFORE the tensor data, if storage_offset != 0 inline size_t capacity_nbytes() const { return storage_.capacity(); } @@ -746,6 +755,10 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return storage_; } + int64_t storage_offset() const { + return storage_offset_; + } + protected: using DimVector = std::vector; DimVector dims_; // sizes_ @@ -756,7 +769,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // then subsequent Resize()s will not free up Storage. bool reserved_ = false; at::Storage storage_; - // int64_t storage_offset_; + int64_t storage_offset_ = 0; private: template < From 6660a128a5d2ccaf84a1552bcbeaf62d9743521c Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 08:49:31 -0700 Subject: [PATCH 188/237] Cache and use TypeMeta in TensorImpl (#11706) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11706 This is necessary to handle use-cases when Storage is not set (because the tensor in question doesn't have a notion of storage.) 
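The storage_offset and cached-dtype changes above boil down to one invariant: the data pointer is always derived as the storage base plus storage_offset_ * itemsize, the offset is reset to zero whenever the tensor frees or reallocates its own buffer, and the element size comes from metadata that stays valid even when no Storage is attached. A minimal, self-contained C++ sketch of that pointer arithmetic (MiniTensor is a hypothetical stand-in, not a type from the patch):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical miniature of (Storage, storage_offset_, cached itemsize).
struct MiniTensor {
  std::vector<char> storage;        // raw backing buffer, possibly shared
  std::size_t itemsize = 0;         // cached from the dtype, usable even if storage is empty
  std::int64_t storage_offset = 0;  // offset in elements; reset to 0 on reallocation

  const void* raw_data() const {
    // Mirrors: storage_.data() + storage_offset_ * storage_.itemsize()
    return static_cast<const void*>(storage.data() + storage_offset * itemsize);
  }
};

int main() {
  MiniTensor t;
  t.itemsize = sizeof(float);
  t.storage.resize(4 * sizeof(float));
  auto* base = reinterpret_cast<float*>(t.storage.data());
  for (int i = 0; i < 4; ++i) base[i] = static_cast<float>(i);

  t.storage_offset = 2;  // behave like a view starting at the third element
  std::cout << *static_cast<const float*>(t.raw_data()) << "\n";  // prints 2
  return 0;
}

The corresponding invariant in the patch is the AT_ASSERT(storage_offset_ == 0) after every reallocation, together with the storage_offset_ = 0 resets in FreeMemory() and ShareExternalPointer().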
Reviewed By: orionr Differential Revision: D9833361 fbshipit-source-id: e90a384019f44f57682b687d129b54e85b6fabb9 --- caffe2/core/tensor_impl.h | 49 +++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index cec550132ab9c6..3cd5bd23d446fb 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -89,7 +89,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) {} + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { + data_type_ = storage_ ? storage_.dtype() : TypeMeta{}; + } TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; @@ -130,25 +132,27 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if ((void*)&src == (void*)this) { return; } - if (storage_.dtype() != src.meta()) { + if (data_type_ != src.meta()) { storage_ = at::Storage(GetDeviceType(), src.meta()); + data_type_ = src.meta(); } if (src.size() == -1) { dims_.clear(); numel_ = -1; storage_.reset(); + data_type_ = TypeMeta(); return; } Resize(src.dims()); if (size() > 0) { - if (storage_.dtype().copy()) { + if (data_type_.copy()) { CAFFE_ENFORCE( GetDeviceType() == CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); CAFFE_ENFORCE( src.GetDeviceType() == CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); - storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); + data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); } else { // We'll need to use a non-CPU context to perform the copy if // one of the context is not CPU since only non-CPU context @@ -222,11 +226,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto oldSize = numel_; auto oldDims = dims_; Resize(newCapacity); - auto* newData = raw_mutable_data(storage_.dtype()); + auto* newData = raw_mutable_data(data_type_); CAFFE_ENFORCE( context != nullptr, "Context must be provided to Extend the tensor"); context->CopyItemsSameDevice( - storage_.dtype(), oldSize, oldData.get(), newData); + data_type_, oldSize, oldData.get(), newData); reserved_ = true; dims_ = newDims; numel_ = newNumel; @@ -282,7 +286,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto oldDims = dims_; Resize(newCapacity); // Allocate new memory but don't copy over the data - raw_mutable_data(storage_.dtype()); + raw_mutable_data(data_type_); dims_ = oldDims; numel_ = oldSize; reserved_ = true; @@ -370,7 +374,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void FreeMemory() { // We'll detach from the old Storage and create a new one - storage_ = at::Storage(storage_.device_type(), storage_.dtype()); + storage_ = at::Storage(storage_.device_type(), data_type_); storage_offset_ = 0; } @@ -382,7 +386,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { std::string DebugString() const { std::stringstream ss; ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << storage_.dtype().name() << " and dimension ("; + << data_type_.name() << " and dimension ("; for (int d : dims_) { ss << d << ","; } @@ -421,6 +425,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * this still keeps the original semantics */ storage_ = src.storage(); + data_type_ = src.dtype(); 
storage_offset_ = src.storage_offset(); } @@ -474,11 +479,13 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { "To share data with a raw pointer, you need to set shape first."); storage_.UniqueStorageShareExternalPointer( std::move(data_ptr), data_type, capacity); + data_type_ = data_type; storage_offset_ = 0; } else { int64_t numel = capacity / data_type.itemsize(); // Create a new Storage storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + data_type_ = data_type; storage_offset_ = 0; } } @@ -510,7 +517,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { "Tensor type mismatch, caller expects elements to be ", TypeMeta::TypeName(), ", while tensor contains ", - storage_.dtype().name(), + data_type_.name(), ". "); return static_cast(storage_.data()) + storage_offset_; } @@ -528,22 +535,23 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void* raw_mutable_data(const TypeMeta& meta) { // For 0-size tensors it's fine to return any pointer (including nullptr) - if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { + if (data_type_ == meta && (storage_.data() || numel_ == 0)) { return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); } else { CAFFE_ENFORCE_WITH_CALLER( numel_ >= 0, "Tensor is not initialized. You probably need to call Resize() " "before calling mutable_data()"); - bool had_special_dtor = storage_.dtype().dtor() != nullptr; + bool had_special_dtor = data_type_.dtor() != nullptr; storage_offset_ = 0; if (storage_.unique()) { storage_.set_dtype(meta); } else { - if (storage_.dtype() != meta) { + if (data_type_ != meta) { storage_ = at::Storage(storage_.device_type(), meta); } } + data_type_ = meta; // We can reuse the existing buffer if the current data does not have // a special destructor and the new data doesn't have a special @@ -564,7 +572,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // making sure that when the data is freed, it calls the right // destruction procedure. auto size = numel_; - auto dtor = storage_.dtype().dtor(); + auto dtor = data_type_.dtor(); void* ptr; at::DeleterFnPtr deleter; auto ptr_and_deleter = GetStaticContext()->New( @@ -579,7 +587,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { deleter(local_ptr); }, at::Device(storage_.device_type()))); - storage_.dtype().ctor()(storage_.data(), numel_); + data_type_.ctor()(storage_.data(), numel_); } else { // For fundamental type, new and delete is easier. auto ptr_and_deleter = @@ -606,10 +614,10 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void* raw_mutable_data() { CAFFE_ENFORCE_WITH_CALLER( - storage_.dtype().id() != TypeIdentifier::uninitialized(), + data_type_.id() != TypeIdentifier::uninitialized(), "Calling raw_mutable_data() without meta, but the current meta is " "of unknown type."); - return raw_mutable_data(storage_.dtype()); + return raw_mutable_data(data_type_); } /** @@ -709,7 +717,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * Returns the TypeMeta object associated with the current data type. 
*/ inline const TypeMeta& meta() const { - return storage_.dtype(); + return data_type_; + } + + inline const TypeMeta& dtype() const { + return data_type_; } /** @@ -770,6 +782,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { bool reserved_ = false; at::Storage storage_; int64_t storage_offset_ = 0; + TypeMeta data_type_; private: template < From f6a6d7fae17a1003e2e40735abe0d94b6a59dbac Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 09:08:34 -0700 Subject: [PATCH 189/237] Switch at::TensorImpl to store TypeMeta rather than ScalarType Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11702 Reviewed By: cpuhrsch Differential Revision: D9831384 fbshipit-source-id: 1b1233a70ed70b47a3dab4a5797b6cfcb7a2c265 --- aten/src/ATen/Context.cpp | 2 +- aten/src/ATen/SparseTensorImpl.cpp | 6 ++--- aten/src/ATen/SparseTensorImpl.h | 2 +- aten/src/ATen/core/Tensor.h | 2 +- aten/src/ATen/core/TensorImpl.cpp | 14 +++++------ aten/src/ATen/core/TensorImpl.h | 24 +++++++------------ aten/src/ATen/core/UndefinedTensorImpl.cpp | 2 +- aten/src/ATen/core/typeid.h | 10 ++++---- aten/src/ATen/function_wrapper.py | 15 ++++++------ aten/src/ATen/native/sparse/SparseTensor.cpp | 2 +- aten/src/ATen/templates/SparseTypeDerived.cpp | 2 +- aten/src/ATen/templates/Tensor.h | 2 +- torch/csrc/autograd/variable.cpp | 4 ++-- torch/csrc/jit/interpreter.cpp | 2 +- 14 files changed, 41 insertions(+), 48 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index cb96814927130c..c73d2efd8ea813 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -116,7 +116,7 @@ TypeExtendedInterface& getType(TensorOptions options) { TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( - backend, impl->scalar_type(), impl->is_variable()); + backend, dataTypeToScalarType(impl->dtype().id()), impl->is_variable()); } TypeExtendedInterface& getType(const Tensor& t) { diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 3f13d59b4467e5..66b71dd7b8a650 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -28,13 +28,13 @@ namespace { // // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. -SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, at::ScalarType scalar_type) - : TensorImpl(type_id, scalar_type, nullptr, false) +SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type) + : TensorImpl(type_id, data_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) , indices_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), ScalarType::Long)->tensor({1, 0})) - , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), scalar_type)->tensor()) {} + , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), dataTypeToScalarType(data_type.id()))->tensor()) {} IntList SparseTensorImpl::sizes() const { return size_; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 77ad805ac99e6f..42b670bea08541 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -36,7 +36,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... 
- explicit SparseTensorImpl(at::TensorTypeId, at::ScalarType); + explicit SparseTensorImpl(at::TensorTypeId, const caffe2::TypeMeta&); int64_t nnz() const { return values_.size(0); } int64_t sparseDims() const { return sparseDims_; } diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index 4b1f18655547d3..ca23584f870ce7 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -142,7 +142,7 @@ struct AT_API Tensor { return tensor_impl_->type_id(); } ScalarType scalar_type() const { - return tensor_impl_->scalar_type(); + return dataTypeToScalarType(tensor_impl_->dtype().id()); } const Storage& storage() const { return tensor_impl_->storage(); diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 1bf6802c7d4d8e..5b568482d8dfe2 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -17,19 +17,19 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl({}, type_id, scalar_type, is_variable) { +TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable) + : TensorImpl({}, type_id, data_type, is_variable) { // UndefinedTensors and SparseTensors don't have storages. - if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined + if (type_id != UndefinedTensorId() && data_type.id() != caffe2::TypeIdentifier::uninitialized() && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); + storage_ = Storage(data_type, 0, allocator, true); } } TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} + : TensorImpl(std::move(storage), type_id, storage.dtype(), is_variable) {} -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable) : storage_(std::move(storage)), storage_offset_(0), sizes_{0}, @@ -37,7 +37,7 @@ TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scala is_contiguous_(true), numel_(0), type_id_(type_id), - scalar_type_(scalar_type), + data_type_(data_type), is_variable_(is_variable) {} IntList TensorImpl::sizes() const { diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 831c53e23b6d49..d2f98ff52780f8 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -22,7 +22,7 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; - TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); + TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); virtual void release_resources() override; @@ -32,7 +32,7 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { // could not have been created without initializing the Type first. // TODO: This is not actually true via the Caffe2 codepath! Make // it so. 
- return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); + return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), dataTypeToScalarType(dtype().id()), is_variable()); } TensorTypeId type_id() const { return type_id_; } @@ -103,7 +103,7 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { inline void* data() const { return static_cast( static_cast(storage_.data()) + - at::elementSize(scalar_type_) * storage_offset_); + data_type_.itemsize() * storage_offset_); } template @@ -111,14 +111,8 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { return storage_.unsafe_data() + storage_offset_; } - // TODO: Remove this once we get rid of scalar_type and use dmeta or dtype - // instead. - inline at::ScalarType scalar_type() const { - return scalar_type_; - } - - inline caffe2::TypeMeta dtype() const { - return storage_.dtype(); + const caffe2::TypeMeta& dtype() const { + return data_type_; } virtual int64_t storage_offset() const { @@ -204,13 +198,13 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { is_contiguous_ = compute_contiguous(); } TensorTypeId type_id_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; + // INVARIANT: When storage is non-null, this type meta must + // agree with the type meta in storage + caffe2::TypeMeta data_type_; bool is_variable_ = false; bool is_wrapped_number_ = false; private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable); }; } // namespace at diff --git a/aten/src/ATen/core/UndefinedTensorImpl.cpp b/aten/src/ATen/core/UndefinedTensorImpl.cpp index d270e379a3415e..e26b61a03c87e0 100644 --- a/aten/src/ATen/core/UndefinedTensorImpl.cpp +++ b/aten/src/ATen/core/UndefinedTensorImpl.cpp @@ -5,7 +5,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? 
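The UndefinedTensorImpl constructor just below now follows the same convention as the guard added to the TensorImpl constructor earlier in this patch: an undefined dtype is represented by a default-constructed caffe2::TypeMeta whose id() compares equal to the uninitialized sentinel, instead of by ScalarType::Undefined. A toy C++ sketch of that convention (every name and id value here is made up purely for illustration):

#include <cstddef>
#include <cstdio>

// Toy metadata type: a default-constructed instance plays the role of
// caffe2::TypeMeta() with an uninitialized TypeIdentifier.
struct ToyTypeMeta {
  int id;                 // 0 stands in for "uninitialized"
  std::size_t itemsize;
};

// Mirrors the constructor guard above: only defined, dtype-initialized,
// non-sparse tensors get a Storage allocated for them.
bool should_allocate_storage(bool is_undefined, bool is_sparse, const ToyTypeMeta& dtype) {
  return !is_undefined && !is_sparse && dtype.id != 0;
}

int main() {
  ToyTypeMeta undefined{0, 0};        // like caffe2::TypeMeta()
  ToyTypeMeta flt{1, sizeof(float)};  // id 1 is an arbitrary placeholder
  std::printf("%d %d\n",
              should_allocate_storage(true, false, undefined),  // 0
              should_allocate_storage(false, false, flt));      // 1
  return 0;
}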
UndefinedTensorImpl::UndefinedTensorImpl() -: TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { +: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is variable */ false) { } IntList UndefinedTensorImpl::sizes() const { diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index e258d3be6a7fa3..9741de03dbaedb 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -49,9 +49,9 @@ class AT_CORE_API TypeIdentifier final : public at::IdWrapper) CAFFE_DECLARE_KNOWN_TYPE(10, std::complex) -// 10 = undefined type id +// 11 = undefined type id CAFFE_DECLARE_KNOWN_TYPE(12, Tensor) CAFFE_DECLARE_KNOWN_TYPE(13, std::string) diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 4ba677a473ef7a..5df218a89cc06d 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -192,7 +192,7 @@ def TypedDict(name, attrs, total=True): # type: ignore BUFFER_DEFINITION = CodeTemplate("""\ auto ${name}_ = c10::make_intrusive( - ${Backend}TensorId(), ScalarType::${ScalarName}, ${THTensor}_new(), false).release(); + ${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), ${THTensor}_new(), false).release(); auto ${name} = Tensor(${name}_, false);""") CONDITIONAL_INITIALIZER = CodeTemplate("""\ @@ -333,17 +333,18 @@ def __init__(self, reason): ALLOC_NOARGS_WRAP = { 'THTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', 'THBoolTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Byte, allocator(), false).release()', + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Byte), allocator(), false).release()', 'THIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()', + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), allocator(), false).release()', 'THIntegerTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Int, allocator(), false).release()', + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Int), allocator(), false).release()', 'THDenseTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', 'THDenseIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), ' + 'allocator(), false).release()' } ALLOC_WRAP = { diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index b09feb0dc1e7dd..25fd4fc5df4326 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -68,7 +68,7 @@ SparseTensor new_sparse(const SparseType& dtype) { } else { type_id = SparseCPUTensorId(); } - return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType())); + return SparseTensor(c10::make_intrusive(type_id, scalarTypeToTypeMeta(dtype.scalarType()))); } /*** Helper methods ***/ diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index dc419b9ad52c01..61035f2c3d38f7 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ 
b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -33,7 +33,7 @@ ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } caffe2::TypeMeta ${Type}::typeMeta() const { - AT_ERROR("typeMeta not defined for SparseType"); + return caffe2::TypeMeta::Make<${ScalarType}>(); } Backend ${Type}::backend() const { return Backend::${Backend}; diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 50f6bb83bb7edb..02bef7f89a6057 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -142,7 +142,7 @@ struct AT_API Tensor { return tensor_impl_->type_id(); } ScalarType scalar_type() const { - return tensor_impl_->scalar_type(); + return dataTypeToScalarType(tensor_impl_->dtype().id()); } const Storage& storage() const { return tensor_impl_->storage(); diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 3808aef75c4d57..5f7631512c715e 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge) - : TensorImpl(data.type().type_id(), data.type().scalarType(), data.type().allocator(), /* is variable */ true), + : TensorImpl(data.type().type_id(), data.type().typeMeta(), data.type().allocator(), /* is variable */ true), data_(std::move(data)), grad_fn_(std::move(gradient_edge.function)), requires_grad_(false), @@ -159,7 +159,7 @@ void Variable::Impl::set_data(Tensor new_data) { } // Updates metadata - scalar_type_ = new_data.type().scalarType(); + data_type_ = new_data.type().typeMeta(); type_id_ = new_data.type().type_id(); is_variable_ = true; data_ = std::move(new_data); diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index d7e8c31f8cb5cf..415820a3086164 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -336,7 +336,7 @@ struct PreprocessGraph { struct ContainerTensor : public at::TensorImpl { public: ContainerTensor() - : TensorImpl(at::UndefinedTensorId(), at::ScalarType::Undefined, nullptr, /* is_variable */ false) {} + : TensorImpl(at::UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is_variable */ false) {} virtual ~ContainerTensor() = default; virtual at::IntList sizes() const override { From 07fd4450ab8bcaf4c726e2e4afb738bf2b98365d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 09:34:52 -0700 Subject: [PATCH 190/237] Revert D9831398: [pytorch][PR] Update OpenMP cmake setting for xcode 9 compiler(AppleClang 9.0) Differential Revision: D9831398 Original commit changeset: db119d3f9c26 fbshipit-source-id: 4f183c9c178c159473bdaaa6299d4d5eb8afe549 --- caffe2/CMakeLists.txt | 8 ----- cmake/Dependencies.cmake | 65 +++++++++++++++++++++++----------------- torch/CMakeLists.txt | 13 ++------ 3 files changed, 41 insertions(+), 45 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 361ce228f3227a..4e9b3eef8937f4 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -206,14 +206,6 @@ if (${CAFFE2_LINK_LOCAL_PROTOBUF}) else() target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) endif() -#cmake only check for separate OpenMP library on AppleClang 7+ -#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 -if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR - CMAKE_CXX_COMPILER_VERSION 
VERSION_GREATER "7.0") - target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY}) - endif() -endif() target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 8fdfbd8ca8eb4a..dbec0d07ebf6e3 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -456,37 +456,12 @@ endif() # ---[ OpenMP if(USE_OPENMP) - set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") - if(APPLE AND CMAKE_COMPILER_IS_GNUCC) - exec_program(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) - string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) - message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") - if(DARWIN_VERSION GREATER 9) - set(APPLE_OPENMP_SUCKS 1) - endif(DARWIN_VERSION GREATER 9) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion - OUTPUT_VARIABLE GCC_VERSION) - if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) - message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") - message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") - add_compile_options(-Wno-unknown-pragmas) - set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) - endif() - endif() - - if(WITH_OPENMP AND NOT CHECKED_OPENMP) - find_package(OpenMP) - set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") - - # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) - # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake - set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") - endif() - + find_package(OpenMP) if(OPENMP_FOUND) message(STATUS "Adding " ${OpenMP_CXX_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") else() message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF") caffe2_update_option(USE_OPENMP OFF) @@ -981,6 +956,42 @@ if (NOT BUILD_ATEN_MOBILE) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG}) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE}) ENDIF() + + # OpenMP support? + SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") + IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) + EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) + STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) + MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") + IF (DARWIN_VERSION GREATER 9) + SET(APPLE_OPENMP_SUCKS 1) + ENDIF (DARWIN_VERSION GREATER 9) + EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion + OUTPUT_VARIABLE GCC_VERSION) + IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) + MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") + MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") + add_compile_options(-Wno-unknown-pragmas) + SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" 
FORCE) + ENDIF() + ENDIF() + + IF (WITH_OPENMP AND NOT CHECKED_OPENMP) + FIND_PACKAGE(OpenMP) + SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") + + # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) + # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake + SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") + ENDIF() + + IF (OPENMP_FOUND) + MESSAGE(STATUS "Compiling with OpenMP support") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + ENDIF() + + SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) FIND_PACKAGE(MAGMA) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 706ff0c2d7091b..7679f3d6467ba0 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -3,7 +3,7 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) return() endif() else() - cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + cmake_minimum_required(VERSION 3.0 FATAL_ERROR) project(torch CXX C) find_package(Caffe2 REQUIRED) option(USE_CUDA "Use CUDA" ON) @@ -322,15 +322,8 @@ if(OPENMP_FOUND) if (VERBOSE) message(STATUS "Compiling with OpenMP") endif() - target_compile_options(torch INTERFACE ${OpenMP_CXX_FLAGS}) - #cmake only check for separate OpenMP library on AppleClang 7+ - #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 - if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR - CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") - target_link_libraries(torch ${OpenMP_libomp_LIBRARY}) - endif() - endif() + target_compile_options(torch INTERFACE -fopenmp) + target_link_libraries(torch -fopenmp) endif() if (NOT NO_API AND NOT USE_ROCM) From a7e3cd09e023685a96f65d8447490455a16a4ebb Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Mon, 17 Sep 2018 09:44:51 -0700 Subject: [PATCH 191/237] Fix ctc gradient handling (#11753) Summary: Fixes: #11750 Also fix cuda ctc with double to enable gradient check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11753 Differential Revision: D9861318 Pulled By: ezyang fbshipit-source-id: 2e7afea2b60dbbd891bb5d0bda61ee75fe01d933 --- aten/src/ATen/native/LossCTC.cpp | 2 +- aten/src/ATen/native/cuda/LossCTC.cu | 15 ++++++++++---- test/test_autograd.py | 29 ++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index ccae5fb75f5b01..b7a9c52c64bd4e 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -287,7 +287,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ for (int64_t c = 0; c < num_labels; c++) { scalar_t& res = grad_a[t][c]; scalar_t lp = log_probs_a[t][c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } } // zero the remainder diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 16d7935f3d49fa..dc0b5af8d4e264 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -227,7 +227,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... 
- constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -247,6 +247,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors return std::make_tuple(neg_log_likelihood, log_alpha); } @@ -452,7 +453,7 @@ __global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ grad scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; if (t < input_length) { scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } else { res = 0.; @@ -505,7 +506,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) // As above, there may be better configurations to use. - constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -526,6 +527,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on @@ -550,7 +552,10 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ .sub_(log_probs.narrow(2, BLANK, 1)) .exp_() ); - // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // scale by output gradient (blanks and first summand of non-blanks) + grad *= grad_out.view({1, batch_size, 1}); + + // For the non-blank characters, we use a kernel to compute the subtrahend. // Again we might configure block and grid in a better way. int threads_target = max_threads; while (threads_target / 2 >= max_target_length) { @@ -572,6 +577,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } else { // small problem, use naive algorithm // Still no block/grid configuration guru... 
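To make the gradient fix in the hunks above concrete: the old code only multiplied the subtrahend by the incoming gradient gr, while the chain rule requires scaling the entire d(nll)/d(log_prob) expression. A standalone check with made-up numbers (not part of the patch) shows the two parenthesizations disagree whenever gr != 1:

#include <cmath>
#include <cstdio>

int main() {
  // Made-up values for a single (t, c) cell of one batch element.
  const double lp  = -1.2;  // log_probs[t][c]
  const double res = -0.7;  // accumulated log(alpha * beta) for this cell
  const double nll =  2.3;  // negative log likelihood of this sample
  const double gr  =  0.5;  // incoming grad_out for this sample

  const double before = std::exp(lp) - std::exp(res + nll - lp) * gr;    // old: gr only scaled the subtrahend
  const double after  = (std::exp(lp) - std::exp(res + nll - lp)) * gr;  // new: gr scales the whole derivative

  std::printf("before fix: %f\nafter fix:  %f\n", before, after);
  return 0;
}

The same reasoning explains the new grad *= grad_out.view({1, batch_size, 1}) line in the CUDA path, which applies the output gradient to the blank/first-summand part that is computed without a kernel.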
int threads_input = max_threads; @@ -595,6 +601,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } return grad; } diff --git a/test/test_autograd.py b/test/test_autograd.py index 47ff4c66be0d33..4c4140e81114a7 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1375,6 +1375,35 @@ def test_unused_output(self): expected_grad[:2] = grad_output self.assertEqual(x.grad.data, expected_grad) + def test_ctc_loss(self): + batch_size = 64 + num_labels = 101 + target_length = 15 + gradcheck_input_size = 10 + + # device, input_length + tests = [('cpu', 150)] + if torch.cuda.is_available(): + tests += [('cuda', 50), + ('cuda', 150)] + + for device, input_length in tests: + targets = torch.randint(1, num_labels, (batch_size, target_length), + device=device, dtype=torch.long) + x = torch.randn(gradcheck_input_size, device=device, requires_grad=True) + tile_factors = torch.randn(input_length * batch_size * num_labels // gradcheck_input_size + 1, + device=device) + input_lengths = [input_length for _ in range(batch_size)] + target_lengths = [target_length for _ in range(batch_size)] + + def ctc_after_softmax(x): + x_full = ((x[:, None] * tile_factors[None, :]).view(-1)[:input_length * batch_size * num_labels] + .view(input_length, batch_size, num_labels)) + log_probs = torch.log_softmax(x_full, 2) + return torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) + + gradcheck(ctc_after_softmax, [x]) + def test_gc_in_destructor(self): """ Previously, if a Function destructor triggered a garbage collection, From 7949250295d17bf6ebb953d268438c3a7df3a6c1 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 17 Sep 2018 09:47:28 -0700 Subject: [PATCH 192/237] Fixes for Torch Script C++ API (#11682) Summary: A couple fixes I deem necessary to the TorchScript C++ API after writing the tutorial: 1. When I was creating the custom op API, I created `torch/op.h` as the one-stop header for creating custom ops. I now notice that there is no good header for the TorchScript C++ story altogether, i.e. when you just want to load a script module in C++ without any custom ops necessarily. The `torch/op.h` header suits that purpose just as well of course, but I think we should rename it to `torch/script.h`, which seems like a great name for this feature. 2. The current API for the CMake we provided was that we defined a bunch of variables like `TORCH_LIBRARY_DIRS` and `TORCH_INCLUDES` and then expected users to add those variables to their targets. We also had a CMake function that did that for you automatically. I now realized a much smarter way of doing this is to create an `IMPORTED` target for the libtorch library in CMake, and then add all this stuff to the link interface of that target. Then all downstream users have to do is `target_link_libraries(my_target torch)` and they get all the proper includes, libraries and compiler flags added to their target. This means we can get rid of the CMake function and all that stuff. orionr AFAIK this is a much, much better way of doing all of this, no? 3. Since we distribute libtorch with `D_GLIBCXX_USE_CXX11_ABI=0`, dependent libraries must set this flag too. I now add this to the interface compile options of this imported target. 4. Fixes to JIT docs. 
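The first point above is easiest to appreciate from the consumer side: after the rename, a standalone C++ program that only loads and runs a serialized script module includes nothing but torch/script.h and links against the imported torch target configured by the TorchConfig.cmake change below. The snippet is a sketch rather than a verbatim excerpt; the model path and input shape are placeholders, and the exact handle type returned by torch::jit::load has changed across releases, hence the auto:

#include <torch/script.h>

#include <iostream>
#include <vector>

int main() {
  // "model.pt" is a placeholder for a module saved from Python via
  // torch.jit.save / ScriptModule.save.
  auto module = torch::jit::load("model.pt");

  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::ones({1, 3, 224, 224}));  // placeholder input

  // libtorch of this era hands back a pointer-like handle, so forward() is
  // reached through ->; newer releases return a value type instead.
  auto output = module->forward(inputs).toTensor();
  std::cout << output.sum() << std::endl;
  return 0;
}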
These could likely be 4 different PRs but given the release I wouldn't mind landing them all asap. zdevito dzhulgakov soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11682 Differential Revision: D9839431 Pulled By: goldsborough fbshipit-source-id: fdc47b95f83f22d53e1995aa683e09613b4bfe65 --- cmake/TorchConfig.cmake.in | 38 +++++++++++++----------- docs/source/jit.rst | 11 +++---- test/custom_operator/CMakeLists.txt | 15 +++------- test/custom_operator/op.cpp | 2 +- test/custom_operator/op.h | 2 +- test/custom_operator/test_custom_ops.cpp | 2 +- torch/CMakeLists.txt | 24 ++++++++++++++- torch/abi-check.cpp | 9 ++++++ torch/{op.h => script.h} | 0 torch/utils/cpp_extension.py | 2 +- 10 files changed, 66 insertions(+), 39 deletions(-) create mode 100644 torch/abi-check.cpp rename torch/{op.h => script.h} (100%) diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index f177a9c2c3504d..066a7e63f9c57a 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -7,15 +7,12 @@ # # TORCH_FOUND -- True if the system has the Torch library # TORCH_INCLUDE_DIRS -- The include directories for torch -# TORCH_LIBRARIES -- Libraries to link to +# TORCH_LIBRARIES -- Libraries to link against +# TORCH_CXX_FLAGS -- Additional (required) compiler flags # # and the following imported targets: # -# Torch -# -# and the following functions: -# -# torch_add_custom_op_library( ) +# torch if ($ENV{TORCH_INSTALL_PREFIX}) set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) @@ -26,13 +23,19 @@ else() endif() # Include directories. -set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") +if (EXISTS "${TORCH_INSTALL_PREFIX}/lib/include") + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") +else() + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/include") +endif() # Library dependencies. find_package(Caffe2 REQUIRED) + find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +add_library(torch SHARED IMPORTED) +set(TORCH_LIBRARIES torch ${Caffe2_MAIN_LIBS}) -set(TORCH_LIBRARIES ${TORCH_LIBRARY} ${Caffe2_MAIN_LIBS}) if (@USE_CUDA@) if(MSVC) set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") @@ -59,13 +62,12 @@ if (@USE_CUDA@) list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() -# Creates a shared library with the correct include directories -# and linker flags set to include Torch header files and link with Torch -# libraries. Also sets the C++ standard version to C++11. All options -# can be override by specifying further options on the `` CMake target. -function(torch_add_custom_op_library name source_files) - add_library(${name} SHARED ${source_files}) - target_include_directories(${name} PUBLIC "${TORCH_INCLUDE_DIRS}") - target_link_libraries(${name} "${TORCH_LIBRARIES}") - set_property(TARGET ${name} PROPERTY CXX_STANDARD 11) -endfunction(torch_add_custom_op_library) +# When we build libtorch with the old GCC ABI, dependent libraries must too. +set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") + +set_target_properties(torch PROPERTIES + IMPORTED_LOCATION ${TORCH_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${TORCH_INCLUDE_DIRS} + INTERFACE_COMPILE_OPTIONS ${TORCH_CXX_FLAGS} + CXX_STANDARD 11 +) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 005c09fa693624..f7a4411956a2c8 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -7,13 +7,13 @@ Torch Script .. currentmodule:: torch.jit Torch Script is a way to create serializable and optimizable models from PyTorch code. 
-Anything code written in Torch Script can be saved from your Python -process and loaded/run a process where there is no python dependency. +Any code written in Torch Script can be saved from your Python +process and loaded in a process where there is no Python dependency. We provide tools to incrementally transition a model from being a pure Python program -to a Torch Script program that can be run independently from python, for instance, in a standalone C++ process. +to a Torch Script program that can be run independently from Python, for instance, in a standalone C++ program. This makes it possible to train models in PyTorch using familiar tools and then export -the model to a production environment where it is not a good idea to run models as python programs +the model to a production environment where it is not a good idea to run models as Python programs for performance and multi-threading reasons. Creating Torch Script Code @@ -47,7 +47,7 @@ Mixing Tracing and Scripting ---------------------------- In many cases either tracing or script is an easier approach for converting a model. -We allow you to compose tracing and scripting to suite the particular requirements +We allow you to compose tracing and scripting to suit the particular requirements of a part of a model. Scripted functions can call traced ones. This is particularly useful when you need @@ -79,6 +79,7 @@ Example: :: import torch + @torch.jit.script def foo(x, y): if x.max() > y.max(): diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index 14f605d4f5b3f7..f692bdfae123b9 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -1,19 +1,12 @@ # Basic CMake setup -cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(custom_ops) find_package(Torch REQUIRED) -# This convenience function will create a shared library target, configure -# the right include directories and link against the right libraries. 
It is -# exactly equivalent to the following lines: -# -# add_library(custom_ops SHARED op.cpp) -# target_include_directories(custom_ops PUBLIC "${TORCH_INCLUDE_DIRS}") -# target_link_libraries(custom_ops "${TORCH_LIBRARIES}") -# set_property(TARGET custom_ops PROPERTY CXX_STANDARD 11) -# -torch_add_custom_op_library(custom_ops op.cpp) +add_library(custom_ops SHARED op.cpp) +target_compile_features(custom_ops PUBLIC cxx_range_for) +target_link_libraries(custom_ops ${TORCH_LIBRARIES}) add_executable(test_custom_ops test_custom_ops.cpp) target_link_libraries(test_custom_ops custom_ops) diff --git a/test/custom_operator/op.cpp b/test/custom_operator/op.cpp index 113476f581c52d..f48189728ea293 100644 --- a/test/custom_operator/op.cpp +++ b/test/custom_operator/op.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/test/custom_operator/op.h b/test/custom_operator/op.h index b1c4b5b1647f0b..daa38b6965c5d8 100644 --- a/test/custom_operator/op.h +++ b/test/custom_operator/op.h @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index 47e6f3618aad7c..918eadd1e869fd 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -1,4 +1,4 @@ -#include +#include #include "op.h" diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 7679f3d6467ba0..68eee299e65334 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -413,7 +413,7 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch FILES_MATCHING PATTERN "*.h") -install(FILES "${TORCH_SRC_DIR}/op.h" +install(FILES "${TORCH_SRC_DIR}/script.h" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) install(TARGETS torch @@ -488,6 +488,28 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) endif() endif() +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + message(STATUS "${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") + execute_process( + COMMAND + "${CMAKE_CXX_COMPILER}" + "${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp" + "-o" + "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) + if (ABI_CHECK_COMPILE_RESULT) + message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") + endif() + execute_process( + COMMAND "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_RESULT + OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) + if (ABI_CHECK_RESULT) + message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") + endif() + message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") +endif() + # CMake config for external projects. 
configure_file( ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in diff --git a/torch/abi-check.cpp b/torch/abi-check.cpp new file mode 100644 index 00000000000000..e506eb7831cad4 --- /dev/null +++ b/torch/abi-check.cpp @@ -0,0 +1,9 @@ +#include + +int main() { +#ifdef _GLIBCXX_USE_CXX11_ABI + std::cout << _GLIBCXX_USE_CXX11_ABI; +#else + std::cout << 0; +#endif +} diff --git a/torch/op.h b/torch/script.h similarity index 100% rename from torch/op.h rename to torch/script.h diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 22940aaa5f7ea5..01961cf994ec73 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -288,7 +288,7 @@ def _add_gnu_abi_flag_if_binary(self, extension): # if the extension is compiled with gcc >= 5.1, # then we have to define _GLIBCXX_USE_CXX11_ABI=0 # so that the std::string in the API is resolved to - # non-C++11 symbols + # non-C++11 symbols. define = '-D_GLIBCXX_USE_CXX11_ABI=0' if is_binary_build(): if isinstance(extension.extra_compile_args, dict): From cdefc277954e4edfcaded72808bd5196e0d118a8 Mon Sep 17 00:00:00 2001 From: Chenguang Xi Date: Mon, 17 Sep 2018 10:14:08 -0700 Subject: [PATCH 193/237] Support lr adaption for SparseAdam and RowWiseSparseAdam (#11162) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11162 as title, fix pr test failure Reviewed By: chocjy Differential Revision: D9619308 fbshipit-source-id: 0a2228841ed8fadb15f07e94d3575aa701b10146 --- caffe2/python/operator_test/adam_test.py | 164 +++++++++++- caffe2/python/optimizer.py | 55 ++-- caffe2/sgd/adam_op.cc | 8 +- caffe2/sgd/adam_op.h | 323 ++++++++++++++++------- caffe2/sgd/adam_op_gpu.cu | 146 ++++++++-- 5 files changed, 541 insertions(+), 155 deletions(-) diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 9cf3f7c06b4385..8209b1c0493095 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -34,15 +34,18 @@ def ref_adam(param, mom1, mom2, grad, LR, ITER, @staticmethod def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, - beta1, beta2, epsilon): + beta1, beta2, epsilon, output_grad=False): t = ITER + 1 - corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \ + corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ (1 - np.power(beta1, t)) mom1_out = (beta1 * mom1) + (1 - beta1) * grad mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad)) - param_out = param + corrected_local_rate * mom1_out / \ - (np.sqrt(mom2_out) + epsilon) - return (param_out, mom1_out, mom2_out) + grad_out = corrected_local_rate * mom1_out / (np.sqrt(mom2_out) + epsilon) + param_out = param + LR * grad_out + if output_grad: + return param_out, mom1_out, mom2_out, grad_out + else: + return param_out, mom1_out, mom2_out @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), @@ -176,6 +179,76 @@ def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_sparse, input_device_options=input_device_options) + @given(inputs=hu.tensors(n=4), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs) + 
def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, + data_strategy, gc, dc): + param, mom1, mom2, grad = inputs + mom2 = np.absolute(mom2) + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Verify that the generated indices are unique + hypothesis.assume( + np.array_equal( + np.unique(indices.flatten()), + np.sort(indices.flatten()))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "SparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + @given(inputs=hu.tensors(n=3), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, @@ -252,6 +325,87 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_row_wise_sparse, input_device_options=input_device_options) + @given(inputs=hu.tensors(n=3), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs_cpu_only) + def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, + epsilon, data_strategy, gc, dc): + param, mom1, grad = inputs + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create a 1D row-wise average 2nd moment tensor. + mom2 = data_strategy.draw( + hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], + elements=hu.elements_of_type(dtype=np.float32)) + ) + mom2 = np.absolute(mom2) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment + # tensor that is strictly 1-dimensional and equal in length to the + # first dimension of the parameters, so indices must also be + # 1-dimensional. 
+ indices = indices.flatten() + + hypothesis.note('indices.shape: %s' % str(indices.shape)) + + # Verify that the generated indices are unique + hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "RowWiseSparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_row_wise_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_row_wise_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + if __name__ == "__main__": import unittest diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index a48502c68a3cd6..482d16a0dfa6a6 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -930,19 +930,6 @@ def _run(self, net, param_init_net, param_info): **(self.init_kwargs) ) - if self.use_lr_adaption: - effective_grad = param_init_net.ConstantFill( - [param], - param + "_effgrad", - value=0.0 - ) - self._aux_params.local.append(effective_grad) - net.LearningRateAdaption( - [lr, grad, effective_grad], - [lr], - lr_alpha=self.lr_alpha, - normalized_lr_adaption=self.normalized_lr_adaption) - m1 = param_init_net.ConstantFill( [param], param + "_first_moment", @@ -973,35 +960,45 @@ def _run(self, net, param_init_net, param_info): 'If SparseAdam with rowWise=True, gradient must be '\ 'a gradientslice. PLease ensure that rowWise is not enabled '\ 'for the dense Adam optimizer, as it is not supported.' 
+ + output_blobs = [param, m1, m2] + if self.use_lr_adaption: + effective_grad = str(param) + '_effective_grad' + output_blobs.append(effective_grad) + if isinstance(grad, core.GradientSlice): grad = self.dedup(net, self.sparse_dedup_aggregator, grad) if self.rowWise: op = 'RowWiseSparseAdam' else: op = 'SparseAdam' + net.__getattr__(op)( [param, m1, m2, grad.indices, grad.values, lr, iteration], - [param, m1, m2], + output_blobs, beta1=self.beta1, beta2=self.beta2, - epsilon=self.epsilon - ) + epsilon=self.epsilon) + if self.use_lr_adaption: + net.LearningRateAdaption( + [lr, grad.values, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) else: + net.Adam( + [param, m1, m2, grad, lr, iteration], + output_blobs, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon) if self.use_lr_adaption: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2, effective_grad], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) - else: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) + net.LearningRateAdaption( + [lr, grad, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) def scale_learning_rate(self, scale): self.alpha *= scale diff --git a/caffe2/sgd/adam_op.cc b/caffe2/sgd/adam_op.cc index 25414622bad754..623e93a07e3251 100644 --- a/caffe2/sgd/adam_op.cc +++ b/caffe2/sgd/adam_op.cc @@ -34,7 +34,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") - .Output(3, "output_grad", "Effective grad") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -42,7 +42,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output REGISTER_CPU_OPERATOR(SparseAdam, SparseAdamOp); OPERATOR_SCHEMA(SparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -62,6 +62,7 @@ OPERATOR_SCHEMA(SparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -71,7 +72,7 @@ REGISTER_CPU_OPERATOR( RowWiseSparseAdamOp); OPERATOR_SCHEMA(RowWiseSparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -95,6 +96,7 @@ OPERATOR_SCHEMA(RowWiseSparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index 3cd301a8223009..699ba7aa5d23b1 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -195,58 +195,118 @@ class SparseAdamOp final : public Operator { auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for 
(auto i = 0; i < n; ++i) { - auto idx = indices[i]; - - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); - - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; + +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif + + adam_compute( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + beta1_, + beta2_, + epsilon_, + correction, + lr, + &context_); + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - adam_compute( - block_size, - paramIn + offsetIdx, - gradIn + offsetI, - moment1In + offsetIdx, - moment2In + offsetIdx, - paramOut + offsetIdx, - moment1Out + offsetIdx, - moment2Out + offsetIdx, - beta1_, - beta2_, - epsilon_, - correction, - lr, - &context_); + adam_compute_output_grad( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + gradOut + offsetI, + beta1_, + beta2_, + epsilon_, + correction, + lr, + &context_); + } } } return true; @@ -257,7 +317,7 @@ class SparseAdamOp final : public Operator { T 
beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; template @@ -305,61 +365,126 @@ class RowWiseSparseAdamOp final : public Operator { auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for (auto i = 0; i < n; ++i) { - auto idx = indices[i]; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + } + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + 
CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - const float* w = paramIn + offsetIdx; - const float* g = gradIn + offsetI; - const float* m1 = moment1In + offsetIdx; - const float* m2 = moment2In + idx; - float* nw = paramOut + offsetIdx; - float* nm1 = moment1Out + offsetIdx; - float* nm2 = moment2Out + idx; - - float m2_sum = 0.; - for (auto j = 0; j < block_size; ++j) { - float gj = g[j]; - m2_sum += gj * gj; - } - float vi = nm2[0] = - m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); - for (auto j = 0; j < block_size; ++j) { - float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); - nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + float* ng = gradOut + offsetI; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + float ngi = ng[j] = correction * mi / (std::sqrt(vi) + epsilon_); + nw[j] = w[j] + lr[0] * ngi; + } } } } @@ -371,7 +496,7 @@ class RowWiseSparseAdamOp final : public Operator { T beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; } // namespace caffe2 diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index 41f70ca51d577b..2e142af682f795 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -1,6 +1,6 @@ -#include "caffe2/sgd/adam_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" +#include "caffe2/sgd/adam_op.h" namespace caffe2 { @@ -95,6 +95,55 @@ void adam_compute( N, w, g, m, v, nw, nm, nv, beta1, beta2, eps_hat, correction, lr); } +__global__ void AdamComputeOutputGrad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr) { + CUDA_1D_KERNEL_LOOP(i, N) { + float gi = g[i]; + float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); + float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); + float ngi = ng[i] = correction * mi / (sqrtf(vi) + eps_hat); + nw[i] = w[i] + lr[0] * ngi; + } +} + +template <> +void adam_compute_output_grad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr, + CUDAContext* context) { + AdamComputeOutputGrad<<< + CAFFE_GET_BLOCKS(N), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>( + N, w, g, m, v, nw, nm, nv, ng, beta1, beta2, eps_hat, correction, lr); +} + template __global__ void SparseAdamKernel( const size_t N, @@ -123,9 +172,44 @@ __global__ void SparseAdamKernel( } } +template +__global__ void SparseAdamOutputGradKernel( + const size_t N, + const size_t grad_slice_sz, + const float beta1, + const float beta2, + const float epsilon, 
+ float* param, + float* mom1, + float* mom2, + float* output_grad, + const SIndex* indices, + const float* grad, + const float correction, + const float* lr, + const float iter) { + CUDA_1D_KERNEL_LOOP(i, N) { + const size_t gradIdx = i; + const SIndex index = indices[i / grad_slice_sz]; + const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz); + + float m1n = mom1[paramIdx] = + mom1[paramIdx] * beta1 + grad[gradIdx] * (1.0f - beta1); + float m2n = mom2[paramIdx] = + mom2[paramIdx] * beta2 + grad[gradIdx] * grad[gradIdx] * (1.0f - beta2); + float gradOut = output_grad[gradIdx] = + correction * m1n / (sqrt(m2n) + epsilon); + param[paramIdx] += lr[0] * gradOut; + } +} + template <> template bool SparseAdamOp::DoRunWithType() { + Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM)); + Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1)); + Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2)); + auto N = Input(GRAD).size(); auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim()); const auto iter = @@ -133,24 +217,48 @@ bool SparseAdamOp::DoRunWithType() { const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) / (1.0f - std::pow(beta1_, iter + 1)); - SparseAdamKernel - <<>>( - N, - grad_slice_sz, - beta1_, - beta2_, - epsilon_, - Output(OUTPUT_PARAM)->template mutable_data(), - Output(OUTPUT_MOMENT_1)->template mutable_data(), - Output(OUTPUT_MOMENT_2)->template mutable_data(), - Input(INDICES).template data(), - Input(GRAD).template data(), - correction, - Input(LR).template data(), - iter); + if (OutputSize() == 3) { + SparseAdamKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + SparseAdamOutputGradKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Output(OUTPUT_GRAD)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } + return true; } From e125e61824a5287d602fd6317d6e13581b7682b3 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 11:02:48 -0700 Subject: [PATCH 194/237] Fix flake8 Summary: Fix flake8 Reviewed By: ezyang Differential Revision: D9873872 fbshipit-source-id: 26e81238f22caaeccd2c8b4f39cedb6cfb5520dd --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index ccfced5f35bc83..4364b19ffdc52d 100644 --- a/setup.py +++ b/setup.py @@ -426,6 +426,7 @@ class build_deps(PytorchCommand): def run(self): print('setup.py::build_deps::run()') # Check if you remembered to check out submodules + def check_file(f): if not os.path.exists(f): print("Could not find {}".format(f)) From 39520ffec15ab7e97691fed048de1832e83785e8 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 11:03:56 -0700 Subject: [PATCH 195/237] remove Type/Tensor/TensorMethods include order dependencies. (#11720) Summary: Previously, it was a necessity to include TensorMethods.h after Tensor.h in order to get the tensor method definitions. 
We abstracted this away from users by making sure ATen.h did this correctly; but we don't have any equivalent for ATen/core. In order to solve this dependency issue, we now forward declare Tensor in the Type declaration, which breaks the dependency cycle. Type.h now includes Tensor.h (for backwards compatibility) and Tensor.h now includes TensorMethods.h, so there is no longer include dependency restrictions. We could get rid of TensorMethods.h completely now, but that would involve coordinating a code generation change. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11720 Reviewed By: ezyang Differential Revision: D9841488 Pulled By: gchanan fbshipit-source-id: 1668199095e096c1790e646b5dc9f61ec1b33c0a --- aten/src/ATen/core/Tensor.h | 2 ++ aten/src/ATen/core/Type.h | 4 +++- aten/src/ATen/templates/Tensor.h | 2 ++ aten/src/ATen/templates/Type.h | 4 +++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index ca23584f870ce7..1b2c0f0e288264 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -684,3 +684,5 @@ struct AT_API WeakTensor { c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index dea7844e93debb..1366f899c30b84 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -8,7 +8,6 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/core/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" @@ -34,6 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; +struct Tensor; static inline void noop_deleter(void*) {} @@ -630,3 +630,5 @@ struct AT_API Type { }; } // namespace at + +#include "ATen/core/Tensor.h" diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 02bef7f89a6057..85e7c84961d6ee 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -298,3 +298,5 @@ struct AT_API WeakTensor { c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 8dda48749b0c97..0e00a5d3499fcd 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -8,7 +8,6 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/core/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" @@ -34,6 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; +struct Tensor; static inline void noop_deleter(void*) {} @@ -144,3 +144,5 @@ struct AT_API Type { }; } // namespace at + +#include "ATen/core/Tensor.h" From 47d65ed34fa6bec4c3038edf1ee5119347607664 Mon Sep 17 00:00:00 2001 From: vishwakftw Date: Mon, 17 Sep 2018 12:05:13 -0700 Subject: [PATCH 196/237] Fix issue 10492 (#11634) Summary: - pass infos vector by reference - checkErrors takes infos vector by reference - modified gesv tests to not cause infs or nans sporadically - also clean up error messages Reviewed By: ezyang Differential Revision: D9818550 Pulled By: soumith fbshipit-source-id: 00215205ff88767d6a5e921322394c5fd915d6d8 --- aten/src/ATen/native/Gesv.cpp | 5 +- aten/src/ATen/native/Gesv.h | 15 +++--- 
aten/src/ATen/native/cuda/Gesv.cu | 2 +- test/common.py | 65 ++++++++++++++++++++++++++ test/test_autograd.py | 76 ++++++------------------------- test/test_torch.py | 23 +++++----- 6 files changed, 101 insertions(+), 85 deletions(-) diff --git a/aten/src/ATen/native/Gesv.cpp b/aten/src/ATen/native/Gesv.cpp index b45e2a4f98860e..dcb8a0964d2f90 100644 --- a/aten/src/ATen/native/Gesv.cpp +++ b/aten/src/ATen/native/Gesv.cpp @@ -44,7 +44,7 @@ template<> void lapackGesv( #endif template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_LAPACK AT_ERROR("gesv: LAPACK library not found in compilation"); #endif @@ -117,8 +117,7 @@ std::tuple gesv_out( Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) { if (self.dim() > 2 || A.dim() > 2) { AT_ERROR("torch.gesv() with the `out` keyword does not support batching. " - "b.dim() (%lld) and A.dim() (%lld) must both be 2.", - (long long)self.dim(), (long long)A.dim()); + "b.dim() (", self.dim(), ") and A.dim() (", A.dim(), ") must both be 2."); } return at::_gesv_single_out(solution, lu, self, A); } diff --git a/aten/src/ATen/native/Gesv.h b/aten/src/ATen/native/Gesv.h index 2d265520f3d21c..a3ba6ec1a8f127 100644 --- a/aten/src/ATen/native/Gesv.h +++ b/aten/src/ATen/native/Gesv.h @@ -5,26 +5,23 @@ namespace at { namespace native { static inline void checkInputs(const Tensor& self, const Tensor& A) { if (A.size(-1) != A.size(-2)) { AT_ERROR("A must be batches of square matrices, " - "but they are %lld by %lld matrices", + "but they are ", A.size(-1), " by ", A.size(-2), " matrices", (long long)A.size(-1), (long long)A.size(-2)); } if (A.size(-1) != self.size(-2)) { AT_ERROR("Incompatible matrix sizes for matmul: each A " - "matrix is %llu by %lld but each b matrix is %lld by %lld.", - (long long)A.size(-1), (long long)A.size(-1), - (long long)self.size(-2), (long long)self.size(-1)); + "matrix is ", A.size(-1), " by ", A.size(-1), + " but each b matrix is ", self.size(-2), " by ", self.size(-1)); } } -static inline void checkErrors(std::vector infos) { +static inline void checkErrors(std::vector& infos) { for (size_t i = 0; i < infos.size(); i++) { auto info = infos[i]; if (info < 0) { - AT_ERROR("gesv: For batch %lld: Argument %lld has illegal value", - (long long)i, -info); + AT_ERROR("gesv: For batch ", i, ": Argument ", -info, " has illegal value."); } else if (info > 0) { - AT_ERROR("gesv: For batch %lld: U(%lld,%lld) is zero, singular U.", - (long long)i, info, info); + AT_ERROR("gesv: For batch ", i, ": U(", info, ",", info, ") is zero, singular U."); } } } diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index bc37e83990e192..80c7aaeb74f6a8 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -84,7 +84,7 @@ static inline Storage pin_memory(int64_t size, Tensor dummy) { name = static_cast(storage_##name.data()); template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("gesv: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); diff --git a/test/common.py b/test/common.py index c1cd7999d82901..c415504e94656d 100644 --- a/test/common.py +++ b/test/common.py @@ -593,3 +593,68 @@ def find_free_port(): sockname = sock.getsockname() sock.close() return sockname[1] + + +# Methods for matrix generation +# Used in test_autograd.py and test_torch.py +def prod_single_zero(dim_size): + result = torch.randn(dim_size, dim_size) + result[0, 1] = 0 + return result + + +def random_square_matrix_of_rank(l, rank): + assert rank <= l + A = torch.randn(l, l) + u, s, v = A.svd() + for i in range(l): + if i >= rank: + s[i] = 0 + elif s[i] == 0: + s[i] = 1 + return u.mm(torch.diag(s)).mm(v.transpose(0, 1)) + + +def random_symmetric_matrix(l): + A = torch.randn(l, l) + for i in range(l): + for j in range(i): + A[i, j] = A[j, i] + return A + + +def random_symmetric_psd_matrix(l): + A = torch.randn(l, l) + return A.mm(A.transpose(0, 1)) + + +def random_symmetric_pd_matrix(l, eps=1e-5): + A = torch.randn(l, l) + return A.mm(A.transpose(0, 1)) + torch.eye(l) * eps + + +def make_nonzero_det(A, sign=None, min_singular_value=0.1): + u, s, v = A.svd() + s[s < min_singular_value] = min_singular_value + A = u.mm(torch.diag(s)).mm(v.t()) + det = A.det().item() + if sign is not None: + if (det < 0) ^ (sign < 0): + A[0, :].neg_() + return A + + +def random_fullrank_matrix_distinct_singular_value(l, *batches): + if len(batches) == 0: + A = torch.randn(l, l) + u, _, v = A.svd() + s = torch.arange(1., l + 1).mul_(1.0 / (l + 1)) + return u.mm(torch.diag(s)).mm(v.t()) + else: + all_matrices = [] + for _ in range(0, torch.prod(torch.as_tensor(batches)).item()): + A = torch.randn(l, l) + u, _, v = A.svd() + s = torch.arange(1., l + 1).mul_(1.0 / (l + 1)) + all_matrices.append(u.mm(torch.diag(s)).mm(v.t())) + return torch.stack(all_matrices).reshape(*(batches + (l, l))) diff --git a/test/test_autograd.py b/test/test_autograd.py index 4c4140e81114a7..296eead471e558 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -14,8 +14,12 @@ from torch.autograd.gradcheck import gradgradcheck, gradcheck from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile -from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \ - suppress_warnings, skipIfRocm +from common import (TEST_MKL, TestCase, run_tests, skipIfNoLapack, + suppress_warnings, skipIfRocm, + prod_single_zero, random_square_matrix_of_rank, + random_symmetric_matrix, random_symmetric_psd_matrix, + random_symmetric_pd_matrix, make_nonzero_det, + random_fullrank_matrix_distinct_singular_value) from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction from torch.testing import make_non_contiguous, randn_like @@ -2595,60 +2599,6 @@ def prod_zeros(dim_size, dim_select): return result -def prod_single_zero(dim_size): - result = torch.randn(dim_size, dim_size) - result[0, 1] = 0 - return result - - -def random_square_matrix_of_rank(l, rank): - assert rank <= l - A = torch.randn(l, l) - u, s, v = A.svd() - for i in range(l): - if i >= rank: - s[i] = 0 - elif s[i] == 0: - s[i] = 1 - return u.mm(torch.diag(s)).mm(v.transpose(0, 1)) - - -def random_symmetric_matrix(l): - A = torch.randn(l, l) - for i in range(l): - for j in range(i): - A[i, j] = A[j, i] - return A - - -def random_symmetric_psd_matrix(l): - A = torch.randn(l, l) - return A.mm(A.transpose(0, 1)) - - -def random_symmetric_pd_matrix(l, eps=1e-5): - A = torch.randn(l, l) - return A.mm(A.transpose(0, 1)) + 
torch.eye(l) * eps - - -def make_nonzero_det(A, sign=None, min_singular_value=0.1): - u, s, v = A.svd() - s[s < min_singular_value] = min_singular_value - A = u.mm(torch.diag(s)).mm(v.t()) - det = A.det().item() - if sign is not None: - if (det < 0) ^ (sign < 0): - A[0, :].neg_() - return A - - -def random_fullrank_matrix_distinct_singular_value(l): - A = torch.randn(l, l) - u, _, v = A.svd() - s = torch.arange(1., l + 1).mul_(1.0 / (l + 1)) - return u.mm(torch.diag(s)).mm(v.t()) - - def uniform_scalar(offset=0, requires_grad=False): v = torch.rand(()) + offset v.requires_grad = requires_grad @@ -3180,11 +3130,15 @@ class dont_convert(tuple): 'tall_all', NO_ARGS, [skipIfNoLapack], lambda usv: (usv[0][:, :(S - 2)], usv[1], usv[2])), ('svd', lambda: random_fullrank_matrix_distinct_singular_value(M), NO_ARGS, 'large', NO_ARGS, [skipIfNoLapack]), - ('gesv', (S, S), ((S, S),), '', NO_ARGS, [skipIfNoLapack]), - ('gesv', (S, S, S), ((S, S, S),), 'batched', NO_ARGS, [skipIfNoLapack, skipIfRocm]), - ('gesv', (2, 3, S, S), ((2, 3, S, S),), 'batched_dims', NO_ARGS, [skipIfNoLapack, skipIfRocm]), - ('gesv', (2, 2, S, S), ((1, S, S),), 'batched_broadcast_A', NO_ARGS, [skipIfNoLapack, skipIfRocm]), - ('gesv', (1, S, S), ((2, 2, S, S),), 'batched_broadcast_b', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (S, S), (random_fullrank_matrix_distinct_singular_value(S),), '', NO_ARGS, [skipIfNoLapack]), + ('gesv', (S, S, S), (random_fullrank_matrix_distinct_singular_value(S, S),), + 'batched', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (2, 3, S, S), (random_fullrank_matrix_distinct_singular_value(S, 2, 3),), + 'batched_dims', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (2, 2, S, S), (random_fullrank_matrix_distinct_singular_value(S, 1),), + 'batched_broadcast_A', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (1, S, S), (random_fullrank_matrix_distinct_singular_value(S, 2, 2),), + 'batched_broadcast_b', NO_ARGS, [skipIfNoLapack, skipIfRocm]), ('fill_', (S, S, S), (1,), 'number'), ('fill_', (), (1,), 'number_scalar'), # FIXME: we should compute the derivative w.r.t torch.tensor(1) diff --git a/test/test_torch.py b/test/test_torch.py index cf938d4d0f50d7..fbabc12d32e051 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3997,8 +3997,9 @@ def test_gesv(self): @staticmethod def _test_gesv_batched(self, cast): + from common import random_fullrank_matrix_distinct_singular_value as fullrank # test against gesv: one batch - A = cast(torch.randn(1, 5, 5)) + A = cast(fullrank(5, 1)) b = cast(torch.randn(1, 5, 10)) x_exp, LU_exp = torch.gesv(b.squeeze(0), A.squeeze(0)) x, LU = torch.gesv(b, A) @@ -4006,7 +4007,7 @@ def _test_gesv_batched(self, cast): self.assertEqual(LU, LU_exp.unsqueeze(0)) # test against gesv in a loop: four batches - A = cast(torch.randn(4, 5, 5)) + A = cast(fullrank(5, 4)) b = cast(torch.randn(4, 5, 10)) x_exp_list = list() @@ -4023,7 +4024,7 @@ def _test_gesv_batched(self, cast): self.assertEqual(LU, LU_exp) # basic correctness test - A = cast(torch.randn(3, 5, 5)) + A = cast(fullrank(5, 3)) b = cast(torch.randn(3, 5, 10)) x, LU = torch.gesv(b, A) self.assertEqual(torch.matmul(A, x), b) @@ -4033,7 +4034,7 @@ def _test_gesv_batched(self, cast): return import numpy from numpy.linalg import solve - A = cast(torch.randn(2, 2, 2)).permute(1, 0, 2) + A = cast(fullrank(2, 2)).permute(1, 0, 2) b = cast(torch.randn(2, 2, 2)).permute(2, 1, 0) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) @@ -4048,18 +4049,18 @@ def 
_test_gesv_batched_dims(self, cast): if not TEST_NUMPY: return - import numpy from numpy.linalg import solve + from common import random_fullrank_matrix_distinct_singular_value as fullrank # test against numpy.linalg.solve - A = cast(torch.randn(2, 1, 3, 4, 4)) + A = cast(fullrank(4, 2, 1, 3)) b = cast(torch.randn(2, 1, 3, 4, 6)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) self.assertEqual(x.data, cast(x_exp)) # test column major format - A = cast(torch.randn(2, 1, 3, 4, 4)).transpose(-2, -1) + A = cast(fullrank(4, 2, 1, 3)).transpose(-2, -1) b = cast(torch.randn(2, 1, 3, 6, 4)).transpose(-2, -1) assert not A.is_contiguous() assert not b.is_contiguous() @@ -4068,21 +4069,21 @@ def _test_gesv_batched_dims(self, cast): self.assertEqual(x.data, cast(x_exp)) # broadcasting b - A = cast(torch.randn(2, 1, 3, 4, 4)) + A = cast(fullrank(4, 2, 1, 3)) b = cast(torch.randn(4, 6)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) self.assertEqual(x.data, cast(x_exp)) # broadcasting A - A = cast(torch.randn(4, 4)) + A = cast(fullrank(4)) b = cast(torch.randn(2, 1, 3, 4, 2)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) self.assertEqual(x.data, cast(x_exp)) # broadcasting both A & b - A = cast(torch.randn(1, 3, 1, 4, 4)) + A = cast(fullrank(4, 1, 3, 1)) b = cast(torch.randn(2, 1, 3, 4, 5)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) @@ -4663,7 +4664,7 @@ def run_test(M, sign=1): # Single matrix, but full rank # This is for negative powers - from test_autograd import random_fullrank_matrix_distinct_singular_value + from common import random_fullrank_matrix_distinct_singular_value M = conv_fn(random_fullrank_matrix_distinct_singular_value(5)) run_test(M) run_test(M, sign=-1) From 73738ec570fa907045e89dd573d2fd945aa2585c Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Mon, 17 Sep 2018 12:09:16 -0700 Subject: [PATCH 197/237] bump version to 1.0 (#11717) Summary: I'm just doing the honors and bumping the version to 1.0.0. 1.0 preview and RC releases will have the 1.0.0.dev{date} tag Pull Request resolved: https://github.com/pytorch/pytorch/pull/11717 Reviewed By: SsnL Differential Revision: D9840857 Pulled By: soumith fbshipit-source-id: 4c9c2e01dccb3c521dab26c49e1569d970a87ace --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4364b19ffdc52d..b61251c46c34fa 100644 --- a/setup.py +++ b/setup.py @@ -283,7 +283,7 @@ def patched_link(self, *args, **kwargs): # Version, create_version_file, and package_name ################################################################################ package_name = os.getenv('TORCH_PACKAGE_NAME', 'torch') -version = '0.5.0a0' +version = '1.0.0a0' if os.getenv('PYTORCH_BUILD_VERSION'): assert os.getenv('PYTORCH_BUILD_NUMBER') is not None build_number = int(os.getenv('PYTORCH_BUILD_NUMBER')) From 336323f53c21fb68c884fd071825ac31914300a0 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Mon, 17 Sep 2018 13:28:27 -0700 Subject: [PATCH 198/237] return aten::gt to the list of fusable operations, add expected graphs (#11150) Summary: Fixes one of #11118 issues. 
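With aten::gt back in the fusable set, a comparison used to build a mask no
longer splits the fusion group. As a minimal sketch (mirroring the pattern of
the gt/lt and eq/ne tests touched by this patch, not a new API), something like

    import torch

    def f(x, y):
        mask = (x > 0).type_as(x)
        return x * mask + y

    x = torch.randn(4, 4, device='cuda')
    y = torch.randn(4, 4, device='cuda')
    traced = torch.jit.trace(f, (x, y))
    traced(x, y)
    # traced.graph_for(x, y) should now show a single prim::FusionGroup

should compile down to one fusion group on CUDA, which the new assertAllFused
helper verifies by checking that the optimized graph contains exactly one
prim::FusionGroup (plus constants).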
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11150 Differential Revision: D9861372 Pulled By: apaszke fbshipit-source-id: 98b196b89e991d3936360b30568360367fd32e8b --- test/test_jit.py | 23 +++++++++++++++++++++ torch/csrc/jit/fusers/cuda/fused_kernel.cpp | 8 ++++++- torch/csrc/jit/passes/graph_fuser.cpp | 3 +++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/test/test_jit.py b/test/test_jit.py index b715045b30a27c..6612a8fbebf6fb 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -365,6 +365,10 @@ def allSum(vs): return ge + def assertAllFused(self, graph): + self.assertTrue(all(node.kind() in {'prim::Constant', 'prim::FusionGroup'} for node in graph.nodes())) + self.assertTrue([node.kind() for node in graph.nodes()].count('prim::FusionGroup') == 1) + def assertExportImport(self, trace, inputs): graph = trace if isinstance(trace, torch._C.Graph) else trace.graph() m = torch.jit.ScriptModule() @@ -766,6 +770,7 @@ def test_comparison_gt_lt_cuda(self): y = torch.randn(4, 4, dtype=torch.float, device='cuda') ge = self.checkTrace(self.fn_test_comparison_gt_lt, (x, y)) + self.assertAllFused(ge.graph_for(x, y)) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") @@ -782,6 +787,24 @@ def f(x, y): y = torch.randn(4, 4, dtype=torch.float, device='cuda') ge = self.checkTrace(f, (x, y)) + self.assertAllFused(ge.graph_for(x, y)) + + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm + def test_comparison_eq_ne(self): + def f(x, y): + mask = (x == 0).type_as(x) + z = x * mask + y + mask = (x != 0).type_as(x) + z = z * mask + y + return z + + x = torch.randn(4, 4, dtype=torch.float, device='cuda') + y = torch.randn(4, 4, dtype=torch.float, device='cuda') + + ge = self.checkTrace(f, (x, y)) + self.assertAllFused(ge.graph_for(x, y)) @staticmethod def fn_test_relu(x, y): diff --git a/torch/csrc/jit/fusers/cuda/fused_kernel.cpp b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp index 90624307123b56..4067e6cc7bba7e 100644 --- a/torch/csrc/jit/fusers/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp @@ -65,7 +65,13 @@ CUDAFusedKernel::CUDAFusedKernel( TORCH_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size)); ptx.resize(ptx_size); TORCH_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data())); - + CUcontext pctx = 0; + TORCH_CU_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + std::unique_lock cudaFreeMutexLock( + *(THCCachingAllocator_getCudaFreeMutex())); + cudaFree(0); + } TORCH_CU_CHECK(cuModuleLoadData(&module, ptx.data())); TORCH_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str())); diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index e8123690789b24..4d69ed57e63786 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -228,6 +228,9 @@ struct GraphFuser { node->matches("aten::le(Tensor self, Tensor other) -> Tensor") || node->matches("aten::le(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || node->matches("aten::le(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || + node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::gt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || + node->matches("aten::gt(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || 
node->matches("aten::ge(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || node->matches("aten::ge(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || From 29610621ec511cd543bc809b40620852b46c56ff Mon Sep 17 00:00:00 2001 From: Jongsoo Park Date: Mon, 17 Sep 2018 13:40:52 -0700 Subject: [PATCH 199/237] 64B align for avx512 (#11748) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11748 For avx512, we need to align at a multiple of 64B not 32B Regardless of avx512, it's in general a good idea to be cache line aligned. Reviewed By: ilia-cher Differential Revision: D9845056 fbshipit-source-id: b1d3ed67749c0c1a64acd5cc230a1279e8023512 --- caffe2/core/allocator.h | 4 ++-- caffe2/python/hypothesis_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index 1e9be9f50b39d8..96bc720ccd59d1 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -11,8 +11,8 @@ CAFFE2_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill); namespace caffe2 { -// Use 32-byte alignment should be enough for computation up to AVX512. -constexpr size_t gCaffe2Alignment = 32; +// Use 64-byte alignment should be enough for computation up to AVX512. +constexpr size_t gCaffe2Alignment = 64; using MemoryDeleter = void (*)(void*); diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 64174f6e71c676..5d41fdfb18262b 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -2229,7 +2229,7 @@ def ref_nhwc(x, scale, bias): in_place=st.booleans(), **hu.gcs) def test_unsafe_coalesce(self, sizes, in_place, gc, dc): - gAlignment = 32 + gAlignment = 64 Xs = [np.random.randn(size) .astype(np.random.choice([np.float32, np.float64, np.uint8])) for size in sizes] From 7671f4ab1c921888554bef92d1cee55785bcf232 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Mon, 17 Sep 2018 14:03:14 -0700 Subject: [PATCH 200/237] Add `math` to scope when using inf in tests (#11302) Summary: This fixes #8515 which was mostly issues in the test themselves. As long as `math` is imported in the scope in which the script runs it resolves to a `prim::Constant` with value `inf` correctly. This PR adds this to the `test_jit.py` tests involving `inf` and adds a test to demonstrate `inf` in a non-generated test. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11302 Differential Revision: D9684336 Pulled By: driazati fbshipit-source-id: 73df2848dfdb45ab50690a7c88df8fda269a64eb --- test/test_jit.py | 23 +++++++++++++++++++---- torch/csrc/jit/export.cpp | 2 ++ torch/csrc/jit/import.cpp | 2 ++ torch/csrc/jit/interned_strings.h | 1 + torch/csrc/jit/ir.h | 6 ++++++ torch/csrc/jit/register_prim_ops.cpp | 17 +++++++++++++++++ torch/csrc/jit/script/compiler.cpp | 2 ++ 7 files changed, 49 insertions(+), 4 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 6612a8fbebf6fb..d91b5b4653d57c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -10,6 +10,7 @@ from torch.autograd.function import traceable from torch.testing import assert_allclose from torch.onnx import OperatorExportTypes +from torch._six import inf, PY2 from common import TestCase, run_tests, IS_WINDOWS, TEST_WITH_UBSAN, skipIfRocm, suppress_warnings from textwrap import dedent import os @@ -56,7 +57,6 @@ RUN_CUDA_MULTI_GPU = RUN_CUDA and torch.cuda.device_count() > 1 -PY2 = sys.version_info[0] == 2 PY35 = sys.version_info >= (3, 5) WINDOWS = sys.platform == 'win32' @@ -2621,6 +2621,13 @@ def foo(a): s = Variable(torch.rand(2)) self.assertEqual(s + s + s, foo(s)) + def test_inf(self): + @torch.jit.script + def foo(a): + return a < float('inf') + s = torch.rand(1) + self.assertTrue(foo(s)) + def test_add(self): def func(a, b): c = a + b @@ -7549,8 +7556,6 @@ def forward(self, x, y): 'test_var_dim_1d', 'test_var_dim_1d_neg0', 'test_var_dim_neg0', - 'test_norm_inf', - 'test_renorm_norm_inf', 'test_matrix_power_n=-1', # involves inverse 'test_matrix_power_n=-3', # involves inverse # skipped nn functional tests @@ -7638,6 +7643,12 @@ def the_method({}): ''' +def get_constant(x): + if x == inf or x == -inf: + return 'float(\'inf\')' if PY2 else 'math.inf' + return x + + # create a script function from (name, func_type, output_process_fn), # returns a function takes in (args, kwargs) and runs the compiled function and # then applies the post process fn to the outputs @@ -7653,7 +7664,7 @@ def script_fn(*args, **kwargs): actuals.append(name) tensors.append(arg) else: - actuals.append(str(arg)) + actuals.append(str(get_constant(arg))) kwargs_str = '' for k, v in kwargs.items(): kwargs_str += ', ' + k + '=' + str(v) @@ -7667,6 +7678,10 @@ def script_fn(*args, **kwargs): raise 'Unsupported function type' script = script_template.format(', '.join(formals), call) + + # for math.inf + import math + CU = torch.jit.CompilationUnit(script) self.assertExportImport(CU.the_method.graph, tensors) output = output_process_fn(CU.the_method(*tensors)) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 0740d2e88a2c8a..3a0500b765dead 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -559,6 +559,8 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("NoneType"); } else if (kind == TypeKind::GeneratorType) { type_proto->set_denotation("GeneratorType"); + } else if (kind == TypeKind::StringType) { + type_proto->set_denotation("StringType"); } else { throw std::runtime_error("unexpected type kind"); } diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 8e6a07da0a00ca..84ffdd1c265466 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -260,6 +260,8 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return NoneType::get(); } else if (kind == "GeneratorType") { return GeneratorType::get(); + }else if (kind == "StringType") { 
+ return StringType::get(); } else { throw std::runtime_error("unexpected string for type kind"); } diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 60e086db187472..e1d76dde56c59d 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -51,6 +51,7 @@ namespace torch { namespace jit { _(prim, ImplicitTensorToNum) \ _(prim, IntToFloat) \ _(prim, FloatToInt) \ + _(prim, StringToFloat) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 58e702b771acaf..fd730adb69570a 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -1066,6 +1066,12 @@ friend struct Block; result->output()->setType(IntType::get()); return result; } + Node* createStringToFloat(Value* value) { + JIT_ASSERT(*value->type() == *StringType::get()); + auto* result = create(prim::StringToFloat, {value}); + result->output()->setType(FloatType::get()); + return result; + } Node* createPythonOp( THPObjectPtr&& pyobj, const std::string& cconv, diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 42184ac318ad58..1f8618121f1e28 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -13,10 +13,12 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -134,6 +136,21 @@ RegisterOperators reg({ return 0; }; }), + Operator( + prim::StringToFloat, + [](Node* node) -> Operation { + return [](Stack& stack) { + auto s = pop(stack).toString(); + if (s->string() != "inf") { + AT_ERROR( + "Only 'inf' can be cast to a float, but got '", + s->string(), + "'"); + } + push(stack, std::numeric_limits::infinity()); + return 0; + }; + }), Operator( prim::Undefined, [](Node* node) { diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index fc7bf068f20998..b66b96dd5eb6fb 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -76,6 +76,8 @@ static Value* typeCast(const SourceRange& loc, Value* value, TypePtr dst) { n = graph.createFloatToInt(value); } else if(dst->isSubtypeOf(FloatType::get()) && orig->isSubtypeOf(IntType::get())) { n = graph.createIntToFloat(value); + } else if(dst->isSubtypeOf(FloatType::get()) && orig->isSubtypeOf(StringType::get())) { + n = graph.createStringToFloat(value); } else { throw ErrorReport(loc) << "Cannot cast type '" << orig->str() << "' to type '" << dst->str() << "'."; From 7df6650e9c23c507137b9af7ffb94053b8a86a2d Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Mon, 17 Sep 2018 14:36:18 -0700 Subject: [PATCH 201/237] Fix empty embedding bag on cuda (#11740) Summary: Fixes https://github.com/pytorch/pytorch/issues/11739 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11740 Differential Revision: D9881392 Pulled By: SsnL fbshipit-source-id: 2964d314f199dd9b4bb69e36592b67efdf5e0760 --- aten/src/ATen/native/cuda/EmbeddingBag.cu | 18 ++++++++++++++---- test/test_nn.py | 21 +++++++++------------ torch/csrc/autograd/engine.cpp | 11 ++++++----- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 853c04deb2215c..afa4c8e1916604 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -73,14 +73,22 @@ __global__ void EmbeddingBag_updateOutputKernel( } } if (mode == MODE_MEAN) { - weightFeatSum = weightFeatSum / 
static_cast(bag_size_); - bag_size[bag] = bag_size_; + if (end == begin) { + bag_size[bag] = 0; + } else { + weightFeatSum = weightFeatSum / static_cast(bag_size_); + bag_size[bag] = bag_size_; + } } if (mode == MODE_MEAN || mode == MODE_SUM) { output[bag * featureSize + featureDim] = static_cast(weightFeatSum); } else if (mode == MODE_MAX) { + if (end == begin) { + // If bag is empty, set output to 0. + weightFeatMax = 0; + } max_indices[bag * featureSize + featureDim] = maxWord; output[bag * featureSize + featureDim] = weightFeatMax; } @@ -268,8 +276,10 @@ __global__ void EmbeddingBag_accGradParametersKernel_max( int64_t bag = chunk / chunksPerBag; int64_t word_idx = max_indices[bag * stride + featureDim]; - - atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + if (word_idx >= 0) { + // If bag is empty, we have max_indices[idx] set to -1 in forward. + atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + } } } } diff --git a/test/test_nn.py b/test/test_nn.py index ac2496620c3977..29e7c6ba8ebcc9 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1956,8 +1956,7 @@ def _test_EmbeddingBag(self, cuda, mode, sparse, dtype=torch.double): input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=torch.long) # Empty list is only handled in CPU for now - offsets = torch.tensor([0, 3], device=device, dtype=torch.long) if cuda \ - else torch.tensor([0, 0, 3, 3, 6], device=device, dtype=torch.long) + offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=torch.long) grad_output = torch.tensor( [1, 2, @@ -2008,33 +2007,31 @@ def _test_EmbeddingBag(self, cuda, mode, sparse, dtype=torch.double): [3, 4]], device=device, dtype=dtype) output = es(input, offsets) - output.backward(grad_output if cuda else grad_output_with_empty) + output.backward(grad_output_with_empty) es_weight_grad = es.weight.grad.data if sparse: - es_weight_grad = es.weight.grad.data.to_dense() - self.assertEqual( - output.data, - expected_output if cuda else expected_output_with_empty) + es_weight_grad = es.weight.grad.to_dense() + self.assertEqual(output, expected_output_with_empty) self.assertEqual(es_weight_grad, expected_grad_weight, dtype2prec[dtype]) # check same example except as 2D (2 x 3) - input = input.data.view(2, -1) + input = input.view(2, -1) es.zero_grad() output = es(input) output.backward(grad_output) - es_weight_grad = es.weight.grad.data + es_weight_grad = es.weight.grad if sparse: - es_weight_grad = es.weight.grad.data.to_dense() - self.assertEqual(output.data, expected_output) + es_weight_grad = es.weight.grad.to_dense() + self.assertEqual(output, expected_output) self.assertEqual(es_weight_grad, expected_grad_weight, dtype2prec[dtype]) # now compare EmbeddingBag vs Embedding + Sum/Mean, for constant bag length def _test_vs_Embedding(N, D, B, L, max_norm=None): es = nn.EmbeddingBag(N, D, mode=mode, sparse=sparse, max_norm=max_norm).to(device, dtype) e = nn.Embedding(N, D, max_norm=max_norm).to(device, dtype) - e.weight.data.copy_(es.weight.data) + e.weight.data.copy_(es.weight) input = torch.randint(N, (B, L), device=device, dtype=torch.long) offsets = torch.arange(0, B, device=device, dtype=torch.long).mul_(L) grad_output = torch.rand(B, D, device=device, dtype=dtype) diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index a8dc01fa5ec3a8..d0ecc017b42b50 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -8,6 +8,7 @@ #include #include +#include 
#include #include @@ -302,7 +303,7 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const std::stringstream ss; ss << "invalid number of gradients - expected "; ss << edges.size() << ", but got " << grads.size(); - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } for (size_t i = 0; i < grads.size(); i++) { const auto& edge = edges[i]; @@ -314,7 +315,7 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const // FIXME: TestJit.test_ge_optimized fails this assertion. // std::stringstream ss; // ss << "undefined gradient at index " << i; - // throw std::runtime_error(format_error(ss.str())); + // AT_ERROR(format_error(ss.str())); continue; } if (!grads[i].sizes().equals(metadata.shape())) { @@ -323,7 +324,7 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const ss << "invalid gradient at index " << i << " - got "; ss << grads[i].sizes() << " but expected shape compatible with "; ss << metadata.shape(); - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } grads[i] = at::sum_to(grads[i], metadata.shape()); } @@ -331,14 +332,14 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const std::stringstream ss; ss << "invalid gradient at index " << i << " - expected type "; ss << metadata.type() << " but got " << grads[i].type(); - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } const auto output_device = output.is_cuda() ? output.get_device() : -1; if (output_device != metadata.device()) { std::stringstream ss; ss << "invalid gradient at index " << i << " - expected device "; ss << metadata.device() << " but got " << output_device; - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } } } From 3ce17bf8f6a2c4239085191ea60d6ee51cd620a5 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 15:22:13 -0700 Subject: [PATCH 202/237] Generate ATen/core to source if env GEN_TO_SOURCE is set. (#11759) Summary: It is currently tedious to change code generation because it takes two steps: change the code gen, then gen.py fails because of file mismatch. Just add an environment option of generating directly to source. 
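For reference, a hypothetical invocation (only the GEN_TO_SOURCE variable comes from this change; the rest of the gen.py command line is whatever the build already passes and is elided here):

```
# Any non-empty value of GEN_TO_SOURCE makes gen.py write the generated
# ATen/core files directly into the source tree (source_path/core) instead
# of the core_tmp staging directory, so codegen changes can be iterated on
# without the file-mismatch failure described above.
GEN_TO_SOURCE=1 python aten/src/ATen/gen.py <usual gen.py arguments>
```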
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11759 Differential Revision: D9867259 Pulled By: gchanan fbshipit-source-id: 3cf8024d9e302f382cf8b8a44cb843fb086f8597 --- aten/src/ATen/gen.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 66444e1837c540..3112e5ff0424ab 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -41,7 +41,11 @@ parser.add_argument( '-d', '--install_dir', help='output directory', default='ATen') options = parser.parse_args() -core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None +gen_to_source = os.environ.get('GEN_TO_SOURCE') # update source directly as part of gen +if not gen_to_source: + core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None +else: + core_install_dir = os.path.join(options.source_path, 'core') if options.install_dir is not None and not os.path.exists(options.install_dir): os.makedirs(options.install_dir) @@ -50,8 +54,8 @@ class FileManager(object): - def __init__(self, prefix=None): - self.install_dir = '{}/{}'.format(options.install_dir, prefix) if prefix else options.install_dir + def __init__(self, install_dir=None): + self.install_dir = install_dir if install_dir else options.install_dir self.filenames = set() self.outputs_written = False self.undeclared_files = [] @@ -133,7 +137,7 @@ def check_all_files_written(self): context->registerType(Backend::${backend}, ScalarType::${scalar_type}, new ${type_name}()); """) -core_file_manager = FileManager('core_tmp') +core_file_manager = FileManager(core_install_dir) file_manager = FileManager() cuda_file_manager = FileManager() From ca5def1b8fc4c5e23bd32dce4cb247df6d4bb7c7 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 17 Sep 2018 16:28:29 -0700 Subject: [PATCH 203/237] Expose annotations (#11649) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11649 Putting annotations in python interface Reviewed By: duc0 Differential Revision: D9784750 fbshipit-source-id: d877c886ac52559ca3f009a1fd848dd1779b7d04 --- .../nomnigraph/Representations/NeuralNet.h | 3 --- caffe2/opt/converter.cc | 15 +++++++++++++++ caffe2/opt/converter.h | 17 +++++++++++++++-- caffe2/python/nomnigraph.py | 1 + caffe2/python/nomnigraph_test.py | 16 ++++++++++++++++ caffe2/python/pybind_state_nomni.cc | 17 ++++++++++++++++- 6 files changed, 63 insertions(+), 6 deletions(-) diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 09273a5fe15981..3abcb2d1057ae9 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -53,9 +53,6 @@ class CAFFE2_API Annotation { return kind_; } - Annotation(const Annotation&) = delete; - Annotation& operator=(Annotation&) = delete; - private: const AnnotationKind kind_; }; diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 80e2308eabf3cd..3d0f2ce2aa643a 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -379,6 +379,21 @@ caffe2::OperatorDef convertToOperatorDef( return op; } +Caffe2Annotation getOrAddCaffe2Annotation( + nom::repr::NNGraph::NodeRef& instrNode) { + auto* nnOp = repr::nn::get(instrNode); + auto* annotation = nnOp->getAnnotation(); + if (!annotation) { + auto new_annot = util::make_unique(); + 
new_annot->setOperatorDef(convertToOperatorDef(instrNode)); + nnOp->setAnnotation(std::move(new_annot)); + annotation = nnOp->getAnnotation(); + } + CAFFE_ENFORCE(isa(annotation)); + auto c2_annotation = dyn_cast(annotation); + return *c2_annotation; +} + caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m) { auto predictNet = caffe2::NetDef(); return convertToCaffe2Proto(m, predictNet); diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index 31281ab90572ec..fdfa7f9ab37583 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -19,8 +19,12 @@ class Caffe2Annotation : public nom::repr::Annotation { : Annotation(AnnotationKind::Caffe2), Device(device) {} virtual ~Caffe2Annotation() {} - void setDevice(std::string device) { Device = device; } - const std::string getDevice() const { return Device; } + void setDevice(std::string device) { + Device = device; + } + const std::string getDevice() const { + return Device; + } void setDeviceType(int device) { DeviceType = device; @@ -33,6 +37,11 @@ class Caffe2Annotation : public nom::repr::Annotation { OpDef = opDef; OpDefExists = true; } + + bool hasOperatorDef() const { + return OpDefExists; + } + const caffe2::OperatorDef& getOperatorDef() const { CAFFE_ENFORCE( OpDefExists, @@ -73,6 +82,10 @@ CAFFE2_API std::unique_ptr convertToNeuralNetOpera CAFFE2_API caffe2::OperatorDef convertToOperatorDef( const nom::repr::NNGraph::NodeRef& instrNode); +// If the annotation doesn't exist, attempt to add it +CAFFE2_API Caffe2Annotation +getOrAddCaffe2Annotation(nom::repr::NNGraph::NodeRef& instrNode); + class CAFFE2_API Converter { public: explicit Converter() = default; diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index 708eae6b2a071c..417ad64ccc2bb2 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -72,3 +72,4 @@ def render(s): NNSubgraph = C.NNSubgraph NNMatchGraph = C.NNMatchGraph Graph = C.Graph +Annotation = C.Annotation diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 7739ac05f2979f..1956d2e9f01b68 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -174,3 +174,19 @@ def test_convertToProto(self): assert a == b for a, b in zip(new_netdef.external_output, net.Proto().external_output): assert a == b + + def test_annotation_basic(self): + annot = ng.Annotation() + annot.setDevice("woot") + assert annot.getDevice() == "woot" + annot.setDeviceType(7) + assert annot.getDeviceType() == 7 + + def test_annotation_from_graph(self): + nn = ng.NNModule() + node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) + annot = node.getAnnotation() + annot.setDeviceType(7) + node.setAnnotation(annot) + new_annot = node.getAnnotation() + assert new_annot.getDeviceType() == 7 diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index a8cbb661d69c21..fa7baf60c92541 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -198,7 +198,14 @@ void addNomnigraphMethods(pybind11::module& m) { CAFFE_ENFORCE(nn::is(n)); return nn::get(n); }, - py::return_value_policy::reference_internal); + py::return_value_policy::reference_internal) + .def( + "getAnnotation", + [](NNGraph::NodeRef n) { return getOrAddCaffe2Annotation(n); }) + .def("setAnnotation", [](NNGraph::NodeRef n, Caffe2Annotation annot) { + auto* nnOp = nn::get(n); + nnOp->setAnnotation(nom::util::make_unique(annot)); + }); py::class_ nnop(m, "NeuralNetOperator"); py::class_ nndata(m, 
"NeuralNetData"); @@ -282,6 +289,14 @@ void addNomnigraphMethods(pybind11::module& m) { } return NNSubgraph(); }); + + // Annotation API + py::class_ annotation(m, "Annotation"); + annotation.def(py::init<>()) + .def("setDevice", &Caffe2Annotation::setDevice) + .def("getDevice", &Caffe2Annotation::getDevice) + .def("setDeviceType", &Caffe2Annotation::setDeviceType) + .def("getDeviceType", &Caffe2Annotation::getDeviceType); } REGISTER_PYBIND_ADDITION(addNomnigraphMethods); From 3819d25418b30955880cbd666e63ae69274cd77d Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 17 Sep 2018 17:25:22 -0700 Subject: [PATCH 204/237] Clean up converter and accept less-valid networks Summary: Cleaning up converter.cc and allowing networks that have "pass through" inputs (that are also outputs but aren't actually consumed by the network) Reviewed By: duc0 Differential Revision: D9759435 fbshipit-source-id: 1ddfcc60a1b865a06682e4022230dfecc4b89ec3 --- caffe2/opt/converter.cc | 36 +++++++++++++++------------ caffe2/opt/converter.h | 2 +- caffe2/python/transformations_test.py | 4 +-- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 3d0f2ce2aa643a..f9956060b75cfd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -264,8 +264,7 @@ std::unique_ptr convertToNeuralNetOperator( /// \brief Ingest a caffe2 protobuf model and output an NNModule. /// \param net The caffe2 protobuf NetDef -/// \param blobMap [optional][output] A pointer to a blobMap to be populated with all the output blobs of the NetDef by name->NodeRef -repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut) { +repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict) { repr::NNModule module; repr::NNGraph& dfg = module.dataFlow; repr::NNCFGraph& cfg = module.controlFlow; @@ -285,7 +284,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map> currentBasicBlock = auto bbNode = cfg.createNode(util::make_unique>()); @@ -323,17 +321,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map(input)); + } + } } for (const auto& outputName : net.external_output()) { @@ -345,9 +352,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 1a579b519fe09c..383b8410ea6ae2 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -396,8 +396,8 @@ def test_converterEnforceUnusedInputs(self): net = core.Net("net") net.Relu(["X"], ["Y"]) net.Proto().external_input.extend(["fake"]) - with self.assertRaises(Exception): - transformer.AddNNPACK(net) # just testing the converter + # This should now work + transformer.AddNNPACK(net) # just testing the converter def test_converterEnforceUnusedOutputs(self): net = core.Net("net") From 7d0657f13cc285a3e0dec2a6f35420af77adafa4 Mon Sep 17 00:00:00 2001 From: zrphercule Date: Mon, 17 Sep 2018 17:26:32 -0700 Subject: [PATCH 205/237] Migrate test in cpp/api/ to use gtest (#11556) Summary: The second part of T32009899 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11556 Differential Revision: D9888224 Pulled By: zrphercule fbshipit-source-id: cb0d0ba5d9c7ad601ee3bce0d932ce9cbbc40908 --- 
test/cpp/api/static.cpp | 71 +++++++++++++++++++---------------------- torch/CMakeLists.txt | 24 ++++++++++++-- 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/test/cpp/api/static.cpp b/test/cpp/api/static.cpp index 121478c928ac1b..827ff251a7c483 100644 --- a/test/cpp/api/static.cpp +++ b/test/cpp/api/static.cpp @@ -1,4 +1,5 @@ -#include + +#include "gtest/gtest.h" #include #include @@ -22,43 +23,35 @@ torch::detail::enable_if_module_t f(T&& m) { return true; } -TEST_CASE("static") { - SECTION("all_of") { - REQUIRE(torch::all_of<>::value == true); - REQUIRE(torch::all_of::value == true); - REQUIRE(torch::all_of::value == true); - REQUIRE(torch::all_of::value == false); - REQUIRE(torch::all_of::value == false); - REQUIRE(torch::all_of::value == false); - } - SECTION("any_of") { - REQUIRE(torch::any_of<>::value == false); - REQUIRE(torch::any_of::value == true); - REQUIRE(torch::any_of::value == true); - REQUIRE(torch::any_of::value == false); - REQUIRE(torch::any_of::value == true); - } - SECTION("enable_if_module_t") { - REQUIRE(f(torch::nn::LinearImpl(1, 2)) == true); - REQUIRE(f(5) == false); - } - SECTION("check_not_lvalue_references") { - REQUIRE(torch::detail::check_not_lvalue_references() == true); - REQUIRE( - torch::detail::check_not_lvalue_references() == true); - REQUIRE( - torch::detail::check_not_lvalue_references() == - false); - REQUIRE(torch::detail::check_not_lvalue_references() == true); - REQUIRE( - torch::detail::check_not_lvalue_references() == false); - } - SECTION("apply") { - std::vector v; - torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5); - REQUIRE(v.size() == 5); - for (size_t i = 0; i < v.size(); ++i) { - REQUIRE(v.at(i) == 1 + i); - } +TEST(TestStatic, All_Of){ + EXPECT_TRUE(torch::all_of<>::value); + EXPECT_TRUE(torch::all_of::value); + EXPECT_TRUE((torch::all_of::value)); + EXPECT_FALSE(torch::all_of::value); + EXPECT_FALSE((torch::all_of::value)); + EXPECT_FALSE((torch::all_of::value)); +} +TEST(TestStatic, Any_Of){ + EXPECT_FALSE(torch::any_of<>::value); + EXPECT_TRUE(bool((torch::any_of::value))); + EXPECT_TRUE(bool((torch::any_of::value))); + EXPECT_FALSE(bool((torch::any_of::value))); +} +TEST(TestStatic, Enable_If_Module){ + EXPECT_TRUE(f(torch::nn::LinearImpl(1, 2))); + EXPECT_FALSE(f(5)); + EXPECT_TRUE(torch::detail::check_not_lvalue_references()); + EXPECT_TRUE((torch::detail::check_not_lvalue_references())); + EXPECT_FALSE( + (torch::detail::check_not_lvalue_references())); + EXPECT_TRUE(torch::detail::check_not_lvalue_references()); + EXPECT_FALSE(torch::detail::check_not_lvalue_references()); +} +TEST(TestStatic, Apply){ + std::vector v; + torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5); + EXPECT_EQ(v.size(), 5); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v.at(i), i + 1); } } diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 68eee299e65334..69b4963277a69c 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -436,8 +436,10 @@ if (BUILD_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) - set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") + #Catch test of api. + #TODO: Change all these tests to Google test. 
+ set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") set(TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/any.cpp ${TORCH_API_TEST_DIR}/cursor.cpp @@ -450,7 +452,6 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) ${TORCH_API_TEST_DIR}/parallel.cpp ${TORCH_API_TEST_DIR}/rnn.cpp ${TORCH_API_TEST_DIR}/sequential.cpp - ${TORCH_API_TEST_DIR}/static.cpp ${TORCH_API_TEST_DIR}/tensor_cuda.cpp ${TORCH_API_TEST_DIR}/tensor.cpp ${TORCH_API_TEST_DIR}/jit.cpp @@ -471,19 +472,36 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB}) + #Google test of api. + set(TORCH_API_GTEST_DIR "${TORCH_ROOT}/test/cpp/api/") + add_executable(gtest_api + ${TORCH_API_GTEST_DIR}/static.cpp + ) + target_include_directories(gtest_api PRIVATE ${ATen_CPU_INCLUDE}) + target_link_libraries(gtest_api torch gtest_main) + if (USE_CUDA) + target_link_libraries(gtest_api ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) + endif() + + #Adding compile options for both tests. if (NOT MSVC) if (APPLE) target_compile_options(test_api PRIVATE # Clang has an unfixed bug leading to spurious missing braces # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 -Wno-missing-braces) - else() + target_compile_options(gtest_api PRIVATE + -Wno-missing-braces) + else() target_compile_options(test_api PRIVATE # Considered to be flaky. See the discussion at # https://github.com/pytorch/pytorch/pull/9608 -Wno-maybe-uninitialized # gcc gives nonsensical warnings about variadic.h -Wno-unused-but-set-parameter) + target_compile_options(gtest_api PRIVATE + -Wno-maybe-uninitialized + -Wno-unused-but-set-parameter) endif() endif() endif() From 24a8c13f3620b6f2cdef16e8749b7e49b50de3ed Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Mon, 17 Sep 2018 17:29:34 -0700 Subject: [PATCH 206/237] Add barrier to fix distributed test flakiness (#11775) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11775 This should fix #11582. 
Reviewed By: ezyang Differential Revision: D9885546 fbshipit-source-id: 3544f42ebe8b595cdf6941859c67484d3ea9b3f8 --- test/test_distributed.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index f3250644bd621a..d311f74e3fe0f1 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -428,7 +428,6 @@ def test_broadcast(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -663,7 +662,6 @@ def test_all_reduce_sum(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_all_reduce_sum_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -952,7 +950,6 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1213,7 +1210,6 @@ def test_DistributedDataParallelCPU(self): "Only Nccl & Gloo backend support DistributedDataParallel") @skip_if_no_cuda_distributed @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_DistributedDataParallel(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1296,6 +1292,12 @@ def _run(self, rank): sys.exit(SKIP_IF_BACKEND_UNAVAILABLE) # sys.exit(0) raise + + # Execute barrier prior to running test to ensure that every process + # has finished initialization and that the following test + # immediately exiting due to a skip doesn't cause flakiness. + self._barrier() + # self.id() == e.g. '__main__.TestDistributed.test_get_rank' # We're retreiving a corresponding test and executing it. getattr(self, self.id().split(".")[2])() From d4dde0bcaf044802e881d73e5ed03cdb357177b1 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Mon, 17 Sep 2018 18:05:58 -0700 Subject: [PATCH 207/237] Detect number of amd gpus in ROCM CI (#11771) Summary: We now have CI machines with different number of amd gpus. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11771 Differential Revision: D9889837 Pulled By: bddppq fbshipit-source-id: dacf728a282f209e3f2419da186e59528a08ca6a --- .jenkins/caffe2/test.sh | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index ac29672c8e2ced..585f994367f339 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -49,6 +49,20 @@ fi mkdir -p $TEST_DIR/{cpp,python} +if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + + # Pin individual runs to specific gpu so that we can schedule + # multiple jobs on machines that have multi-gpu. + NUM_AMD_GPUS=$(/opt/rocm/bin/rocminfo | grep 'Device Type.*GPU' | wc -l) + if (( $NUM_AMD_GPUS == 0 )); then + echo >&2 "No AMD GPU detected!" 
+ exit 1 + fi + export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % $NUM_AMD_GPUS)) +fi + cd "${WORKSPACE}" # C++ tests @@ -106,9 +120,6 @@ fi rocm_ignore_test=() if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then - export LANG=C.UTF-8 - export LC_ALL=C.UTF-8 - # Currently these tests are failing on ROCM platform: # Unknown reasons, need to debug @@ -123,10 +134,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # Our AMD CI boxes have 4 gpus on each - # Remove this once we have added multi-gpu support - export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests From e8ecbcdf010d1e65384ba2d1f8760cc557c02883 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 17 Sep 2018 18:11:11 -0700 Subject: [PATCH 208/237] Move IValue to ATen/core (#11610) Summary: unblocks D9202320 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11610 Differential Revision: D9774853 Pulled By: bwasti fbshipit-source-id: 4798223f6de680a7152283e8cad8814da7f90209 --- aten/src/ATen/Formatting.h | 25 +- aten/src/ATen/{ => core}/Formatting.cpp | 4 +- aten/src/ATen/core/Formatting.h | 26 ++ aten/src/ATen/{ => core}/Tensor.cpp | 6 +- .../jit => aten/src/ATen/core}/ivalue.cpp | 9 +- aten/src/ATen/core/ivalue.h | 425 +++++++++++++++++ setup.py | 1 - torch/CMakeLists.txt | 1 - torch/csrc/jit/ivalue.h | 426 +----------------- torch/csrc/jit/source_range.h | 1 + 10 files changed, 464 insertions(+), 460 deletions(-) rename aten/src/ATen/{ => core}/Formatting.cpp (99%) create mode 100644 aten/src/ATen/core/Formatting.h rename aten/src/ATen/{ => core}/Tensor.cpp (77%) rename {torch/csrc/jit => aten/src/ATen/core}/ivalue.cpp (86%) create mode 100644 aten/src/ATen/core/ivalue.h diff --git a/aten/src/ATen/Formatting.h b/aten/src/ATen/Formatting.h index 6a8b502cc978a8..392e2a27b0130c 100644 --- a/aten/src/ATen/Formatting.h +++ b/aten/src/ATen/Formatting.h @@ -1,24 +1 @@ -#pragma once - -#include -#include "ATen/Type.h" -#include "ATen/core/Scalar.h" - -namespace at { - -AT_API std::ostream& operator<<(std::ostream & out, IntList list); -AT_API std::ostream& operator<<(std::ostream & out, Backend b); -AT_API std::ostream& operator<<(std::ostream & out, const Type & t); -AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); -static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { - return print(out,t,80); -} -static inline void print(const Tensor & t, int64_t linesize=80) { - print(std::cout,t,linesize); -} - -static inline std::ostream& operator<<(std::ostream & out, Scalar s) { - return out << (s.isFloatingPoint() ? 
s.toDouble() : s.toLong()); -} - -} +#include diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp similarity index 99% rename from aten/src/ATen/Formatting.cpp rename to aten/src/ATen/core/Formatting.cpp index 390230316bd0dc..f13b0082d90d10 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,6 +1,4 @@ -#include "ATen/Formatting.h" - -#include +#include "ATen/core/Formatting.h" #include #include diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h new file mode 100644 index 00000000000000..c6ac26b8a9e0e3 --- /dev/null +++ b/aten/src/ATen/core/Formatting.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +AT_API std::ostream& operator<<(std::ostream & out, IntList list); +AT_API std::ostream& operator<<(std::ostream & out, Backend b); +AT_API std::ostream& operator<<(std::ostream & out, const Type & t); +AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { + return print(out,t,80); +} +static inline void print(const Tensor & t, int64_t linesize=80) { + print(std::cout,t,linesize); +} + +static inline std::ostream& operator<<(std::ostream & out, Scalar s) { + return out << (s.isFloatingPoint() ? s.toDouble() : s.toLong()); +} + +} diff --git a/aten/src/ATen/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp similarity index 77% rename from aten/src/ATen/Tensor.cpp rename to aten/src/ATen/core/Tensor.cpp index 860a5d2ab0afe1..924688d40b9551 100644 --- a/aten/src/ATen/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include #include diff --git a/torch/csrc/jit/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp similarity index 86% rename from torch/csrc/jit/ivalue.cpp rename to aten/src/ATen/core/ivalue.cpp index 315da36deb196f..3d2b56893e7188 100644 --- a/torch/csrc/jit/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,12 +1,15 @@ -#include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/ivalue.h" -#include +#include +#include #define TORCH_FORALL_TAGS(_) \ _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) namespace torch { namespace jit { +AT_API c10::intrusive_ptr ConstantString::create(std::string str_) { + return c10::make_intrusive(std::move(str_)); +} + namespace { template diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h new file mode 100644 index 00000000000000..914598f6ceb426 --- /dev/null +++ b/aten/src/ATen/core/ivalue.h @@ -0,0 +1,425 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace torch { namespace jit { + +template +using Shared = c10::intrusive_ptr; + +// string +struct AT_API ConstantString final : c10::intrusive_ptr_target { + private: + const std::string str_; + public: + ConstantString(std::string str) + : str_(std::move(str)) {} + static c10::intrusive_ptr create(std::string str_); + const std::string & string() const { + return str_; + } + operator const std::string & () const { + return string(); + } + AT_API friend std::ostream& operator<<( + std::ostream& out, + const ConstantString& v); +}; + +// non-mutable list +template +struct AT_CORE_EXPORT ConstantList final : c10::intrusive_ptr_target { + private: + const std::vector elements_; + public: + ConstantList(std::vector elements_) + : elements_(std::move(elements_)) {} + static 
c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); + } + const std::vector& elements() const { + return elements_; + } + operator const std::vector&() const { + return elements(); + } +}; + +struct IValue; +using Tuple = ConstantList; +using IntList = ConstantList; +using TensorList = ConstantList; +using DoubleList = ConstantList; + +// IValue is the generic tagged union used by the interpreter to hold +// all value types. +// It is a 16-byte object with an 8-byte payload and an 8-byte tag. +// The tag is currently 4 bytes to determine the type, and 1 byte +// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs +// retain/release calls. + +#define TORCH_FORALL_TAGS(_) \ + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) + +struct AT_API IValue final { + IValue() + : payload{0} + , tag(Tag::None) + , is_intrusive_ptr(false) {} + IValue(const IValue& rhs) + : payload(rhs.payload), + tag(rhs.tag), + is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + } + } + IValue(IValue&& rhs) noexcept : IValue() { + swap(rhs); + } + ~IValue() { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); + } + } + IValue & operator=(IValue && rhs) & noexcept { + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + return *this; + } + IValue & operator=(IValue const & rhs) & { + IValue(rhs).swap(*this); + return *this; + } + void swap(IValue & rhs) noexcept { + std::swap(payload, rhs.payload); + std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); + std::swap(tag, rhs.tag); + } + // Accessors for subtypes are arranged together below + // While some of these accessors could be generated through templates, + // we prefer to write them manually for clarity + + // Tensor + IValue(at::Tensor t) + : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { + // Note: the undefined tensor is not refcounted, so while it + // is tagged as a tensor, is_intrusive_ptr is set to false. + // This is not an optional optimization: our incref call + // *will not* do the right thing when called on an + // undefined tensor. 
+ payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + } + bool isTensor() const { return Tag::Tensor == tag; } + at::Tensor toTensor() && { + AT_ASSERT(isTensor()); + return at::Tensor(moveToIntrusivePtr()); + } + at::Tensor toTensor() const & { + AT_ASSERT(isTensor()); + return at::Tensor(toIntrusivePtr()); + } + + // Tuple + IValue(c10::intrusive_ptr v); + bool isTuple() const { return Tag::Tuple == tag; } + c10::intrusive_ptr toTuple() && { + AT_ASSERT(isTuple()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTuple() const & { + AT_ASSERT(isTuple()); + return toIntrusivePtr(); + } + + // Double + IValue(double d) + : tag(Tag::Double), is_intrusive_ptr(false) { + payload.as_double = d; + } + bool isDouble() const { return Tag::Double == tag; } + double toDouble() const { + AT_ASSERT(isDouble()); + return payload.as_double; + } + + // Int + IValue(int64_t i) + : tag(Tag::Int), is_intrusive_ptr(false) { + payload.as_int = i; + } + + // allow you to pass literals (3, 4) without ambiguity + IValue(int32_t i) + : IValue(static_cast(i)) {} + IValue(bool b) + : IValue(static_cast(b)) {} + + bool isInt() const { return Tag::Int == tag; } + + int64_t toInt() const { + AT_ASSERT(isInt()); + return payload.as_int; + } + + // IntList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + IValue(at::ArrayRef v) + : IValue(v.vec()) {} + bool isIntList() const { return Tag::IntList == tag; } + c10::intrusive_ptr toIntList() && { + AT_ASSERT(isIntList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toIntList() const & { + AT_ASSERT(isIntList()); + return toIntrusivePtr(); + } + + const std::vector& toIntListRef() const; + const std::vector& toDoubleListRef() const; + const std::vector& toTensorListRef() const; + + // ConstantString + IValue(c10::intrusive_ptr v); + IValue(std::string v); + bool isString() const { return Tag::String == tag; } + c10::intrusive_ptr toString() && { + AT_ASSERT(isString()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toString() const & { + AT_ASSERT(isString()); + return toIntrusivePtr(); + } + + // DoubleList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isDoubleList() const { return Tag::DoubleList == tag; } + c10::intrusive_ptr toDoubleList() && { + AT_ASSERT(isDoubleList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toDoubleList() const & { + AT_ASSERT(isDoubleList()); + return toIntrusivePtr(); + } + + //TensorList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + c10::intrusive_ptr toTensorList() && { + AT_ASSERT(isTensorList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTensorList() const & { + AT_ASSERT(isTensorList()); + return toIntrusivePtr(); + } + + // None + bool isNone() { + return Tag::None == tag; + } + std::string toNone() const { + return "None"; + } + // Scalar, which gets encoded as either an Int or a Double + IValue(at::Scalar s) + : IValue() { + if(s.isFloatingPoint()) { + *this = s.toDouble(); + } else { + *this = s.toLong(); + } + } + bool isScalar() { + return isDouble() || isInt(); + } + at::Scalar toScalar() const { + if(isDouble()) + return toDouble(); + else if(isInt()) + return toInt(); + else + throw std::runtime_error("IValue is not a Scalar"); + } + + // for debugging + std::string tagKind() const { + switch(tag) { + #define DEFINE_CASE(x) case Tag::x: return #x; + TORCH_FORALL_TAGS(DEFINE_CASE) + #undef DEFINE_CASE + } + return "Invalid Tag"; + } + + // generic v.to() 
implementations + // that can be used in special functions like pop/push + // that use template meta-programming. + // prefer the directly named methods when you can, + // since they are simpler to understand + + // Note: if you get linker errors saying one of these is missing, + // change it to ... && = delete; and you will see better error messages for why + // However, we cannot commit this because some compiler versions barf on it. + template + T to() &&; + template + T to() const &; + + AT_API friend std::ostream& operator<<(std::ostream& out, const IValue& v); + + private: + // NOTE: IValue tags are intentionally private. In the future we may encode + // this value different (e.g. using NaN boxing), and this would make it more + // costly to determine the tag for all types vs just determining if something + // is a particular type. Instead we want clients to use the `isX` methods when + // possible. If for perf. reasons you really, absolutely, must have a jump + // table, then we can revisit this. + enum class Tag : uint32_t { +#define DEFINE_TAG(x) x, + TORCH_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + }; + + template> + c10::intrusive_ptr moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + clearToNone(); + return t; + } + template> + c10::intrusive_ptr toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + auto p = r; + r.release(); + return p; + } + void clearToNone() { + payload.as_int = 0; + tag = Tag::None; + is_intrusive_ptr = false; + } + union { + int64_t as_int; + double as_double; + c10::intrusive_ptr_target* as_intrusive_ptr; + } payload; + Tag tag; + bool is_intrusive_ptr; +}; + +#undef TORCH_FORALL_TAGS + + +#define DEFINE_TO(type, method_name) \ +template<> \ +inline type IValue::to() && { \ + return std::move(*this).method_name(); \ +} \ +template<> \ +inline type IValue::to() const & { \ + return this->method_name(); \ +} +DEFINE_TO(at::Tensor, toTensor) +DEFINE_TO(c10::intrusive_ptr, toTuple) +DEFINE_TO(double, toDouble) +DEFINE_TO(int64_t, toInt) +DEFINE_TO(c10::intrusive_ptr, toDoubleList) +DEFINE_TO(c10::intrusive_ptr, toIntList) +DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toString) +DEFINE_TO(at::Scalar, toScalar) +DEFINE_TO(bool, toInt) +DEFINE_TO(std::vector, toIntListRef) +DEFINE_TO(std::vector, toDoubleListRef) +DEFINE_TO(std::vector, toTensorListRef) + +#undef DEFINE_TO + +#define DEFINE_TO_WITH_BODY(type, body) \ +template<> \ +inline type IValue::to() && { \ + body(std::move(*this)); \ +} \ +template<> \ +inline type IValue::to() const & { \ + body((*this)); \ +} + +#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); +#define LAYOUT_BODY(this) return static_cast(this.toInt()); +#define DEVICE_BODY(this) \ + /* NB: const_list might be a move of the vector, so we need to */ \ + /* assign it to prevent its deallocation. 
*/ \ + auto&& const_list = this.toIntList(); \ + const auto& elems = const_list->elements(); \ + AT_ASSERT(elems.size() == 2); \ + return at::Device(static_cast(elems[0]), elems[1]); + +DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) +DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) +DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) + +#undef DEFINE_TO_WITH_BODY +#undef SCALAR_TYPE_BODY +#undef LAYOUT_BODY +#undef DEVICE_BODY + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Tuple), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::IntList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(IntList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::String), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::string v) +: IValue(ConstantString::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::DoubleList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(DoubleList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::TensorList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + +inline const std::vector& IValue::toIntListRef() const { + return toIntList()->elements(); +} + +inline const std::vector& IValue::toDoubleListRef() const { + return toDoubleList()->elements(); +} + +inline const std::vector& IValue::toTensorListRef() const { + return toTensorList()->elements(); +} + + +}} diff --git a/setup.py b/setup.py index b61251c46c34fa..37eaf51f0d3908 100644 --- a/setup.py +++ b/setup.py @@ -924,7 +924,6 @@ def run(self): "torch/csrc/byte_order.cpp", "torch/csrc/jit/batched/BatchTensor.cpp", "torch/csrc/jit/init.cpp", - "torch/csrc/jit/ivalue.cpp", "torch/csrc/jit/passes/onnx.cpp", "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", "torch/csrc/jit/passes/onnx/prepare_division_for_onnx.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 69b4963277a69c..2c1ff700dd2858 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -153,7 +153,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp ${TORCH_SRC_DIR}/csrc/jit/constants.cpp ${TORCH_SRC_DIR}/csrc/jit/ir.cpp - ${TORCH_SRC_DIR}/csrc/jit/ivalue.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 16af0c9b292d5f..fb248e509853c8 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -1,425 +1 @@ -#pragma once - -#include "torch/csrc/jit/assertions.h" -#include "torch/csrc/WindowsTorchApiMacro.h" - -#include - -#include - -namespace torch { namespace jit { - -template -using Shared = c10::intrusive_ptr; - -// string -struct TORCH_API ConstantString final : c10::intrusive_ptr_target { - private: - const std::string str_; - public: - ConstantString(std::string str) - : str_(std::move(str)) {} - static c10::intrusive_ptr create(std::string str_) { - return c10::make_intrusive(std::move(str_)); - } - const std::string & string() const { - return str_; - } - operator const std::string & () const { - return string(); - } - TORCH_API friend std::ostream& operator<<(std::ostream& out, const 
ConstantString & v); -}; - - -// non-mutable list -template -struct TORCH_API ConstantList final : c10::intrusive_ptr_target { - private: - const std::vector elements_; - public: - ConstantList(std::vector elements_) - : elements_(std::move(elements_)) {} - static c10::intrusive_ptr> create(std::vector elements_) { - return c10::make_intrusive>(std::move(elements_)); - } - const std::vector& elements() const { - return elements_; - } - operator const std::vector&() const { - return elements(); - } -}; - -struct IValue; -using Tuple = ConstantList; -using IntList = ConstantList; -using TensorList = ConstantList; -using DoubleList = ConstantList; - -// IValue is the generic tagged union used by the interpreter to hold -// all value types. -// It is a 16-byte object with an 8-byte payload and an 8-byte tag. -// The tag is currently 4 bytes to determine the type, and 1 byte -// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs -// retain/release calls. - -#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) - -struct TORCH_API IValue final { - IValue() - : payload{0} - , tag(Tag::None) - , is_intrusive_ptr(false) {} - IValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), - is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); - } - } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); - } - ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } - } - IValue & operator=(IValue && rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None - return *this; - } - IValue & operator=(IValue const & rhs) & { - IValue(rhs).swap(*this); - return *this; - } - void swap(IValue & rhs) noexcept { - std::swap(payload, rhs.payload); - std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); - std::swap(tag, rhs.tag); - } - // Accessors for subtypes are arranged together below - // While some of these accessors could be generated through templates, - // we prefer to write them manually for clarity - - // Tensor - IValue(at::Tensor t) - : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. 
- payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); - } - bool isTensor() const { return Tag::Tensor == tag; } - at::Tensor toTensor() && { - JIT_ASSERT(isTensor()); - return at::Tensor(moveToIntrusivePtr()); - } - at::Tensor toTensor() const & { - JIT_ASSERT(isTensor()); - return at::Tensor(toIntrusivePtr()); - } - - // Tuple - IValue(c10::intrusive_ptr v); - bool isTuple() const { return Tag::Tuple == tag; } - c10::intrusive_ptr toTuple() && { - JIT_ASSERT(isTuple()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toTuple() const & { - JIT_ASSERT(isTuple()); - return toIntrusivePtr(); - } - - // Double - IValue(double d) - : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; - } - bool isDouble() const { return Tag::Double == tag; } - double toDouble() const { - JIT_ASSERT(isDouble()); - return payload.as_double; - } - - // Int - IValue(int64_t i) - : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; - } - - // allow you to pass literals (3, 4) without ambiguity - IValue(int32_t i) - : IValue(static_cast(i)) {} - IValue(bool b) - : IValue(static_cast(b)) {} - - bool isInt() const { return Tag::Int == tag; } - - int64_t toInt() const { - JIT_ASSERT(isInt()); - return payload.as_int; - } - - // IntList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - IValue(at::ArrayRef v) - : IValue(v.vec()) {} - bool isIntList() const { return Tag::IntList == tag; } - c10::intrusive_ptr toIntList() && { - JIT_ASSERT(isIntList()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toIntList() const & { - JIT_ASSERT(isIntList()); - return toIntrusivePtr(); - } - - const std::vector& toIntListRef() const; - const std::vector& toDoubleListRef() const; - const std::vector& toTensorListRef() const; - - // ConstantString - IValue(c10::intrusive_ptr v); - IValue(std::string v); - bool isString() const { return Tag::String == tag; } - c10::intrusive_ptr toString() && { - JIT_ASSERT(isString()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toString() const & { - JIT_ASSERT(isString()); - return toIntrusivePtr(); - } - - // DoubleList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isDoubleList() const { return Tag::DoubleList == tag; } - c10::intrusive_ptr toDoubleList() && { - JIT_ASSERT(isDoubleList()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toDoubleList() const & { - JIT_ASSERT(isDoubleList()); - return toIntrusivePtr(); - } - - //TensorList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isTensorList() const { return Tag::TensorList == tag; } - c10::intrusive_ptr toTensorList() && { - JIT_ASSERT(isTensorList()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toTensorList() const & { - JIT_ASSERT(isTensorList()); - return toIntrusivePtr(); - } - - // None - bool isNone() { - return Tag::None == tag; - } - std::string toNone() const { - return "None"; - } - // Scalar, which gets encoded as either an Int or a Double - IValue(at::Scalar s) - : IValue() { - if(s.isFloatingPoint()) { - *this = s.toDouble(); - } else { - *this = s.toLong(); - } - } - bool isScalar() { - return isDouble() || isInt(); - } - at::Scalar toScalar() const { - if(isDouble()) - return toDouble(); - else if(isInt()) - return toInt(); - else - throw std::runtime_error("IValue is not a Scalar"); - } - - // for debugging - std::string tagKind() const { - switch(tag) { - #define DEFINE_CASE(x) case Tag::x: return #x; - TORCH_FORALL_TAGS(DEFINE_CASE) - #undef DEFINE_CASE - } - return "Invalid Tag"; - } - - // generic 
v.to() implementations - // that can be used in special functions like pop/push - // that use template meta-programming. - // prefer the directly named methods when you can, - // since they are simpler to understand - - // Note: if you get linker errors saying one of these is missing, - // change it to ... && = delete; and you will see better error messages for why - // However, we cannot commit this because some compiler versions barf on it. - template - T to() &&; - template - T to() const &; - - TORCH_API friend std::ostream& operator<<(std::ostream & out, const IValue & v); - -private: - // NOTE: IValue tags are intentionally private. In the future we may encode - // this value different (e.g. using NaN boxing), and this would make it more - // costly to determine the tag for all types vs just determining if something - // is a particular type. Instead we want clients to use the `isX` methods when - // possible. If for perf. reasons you really, absolutely, must have a jump - // table, then we can revisit this. - enum class Tag : uint32_t { -#define DEFINE_TAG(x) x, - TORCH_FORALL_TAGS(DEFINE_TAG) -#undef DEFINE_TAG - }; - - template> - c10::intrusive_ptr moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - clearToNone(); - return t; - } - template> - c10::intrusive_ptr toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - auto p = r; - r.release(); - return p; - } - void clearToNone() { - payload.as_int = 0; - tag = Tag::None; - is_intrusive_ptr = false; - } - union { - int64_t as_int; - double as_double; - c10::intrusive_ptr_target* as_intrusive_ptr; - } payload; - Tag tag; - bool is_intrusive_ptr; -}; - -#undef TORCH_FORALL_TAGS - - -#define DEFINE_TO(type, method_name) \ -template<> \ -inline type IValue::to() && { \ - return std::move(*this).method_name(); \ -} \ -template<> \ -inline type IValue::to() const & { \ - return this->method_name(); \ -} -DEFINE_TO(at::Tensor, toTensor) -DEFINE_TO(c10::intrusive_ptr, toTuple) -DEFINE_TO(double, toDouble) -DEFINE_TO(int64_t, toInt) -DEFINE_TO(c10::intrusive_ptr, toDoubleList) -DEFINE_TO(c10::intrusive_ptr, toIntList) -DEFINE_TO(c10::intrusive_ptr, toTensorList) -DEFINE_TO(c10::intrusive_ptr, toString) -DEFINE_TO(at::Scalar, toScalar) -DEFINE_TO(bool, toInt) -DEFINE_TO(std::vector, toIntListRef) -DEFINE_TO(std::vector, toDoubleListRef) -DEFINE_TO(std::vector, toTensorListRef) - -#undef DEFINE_TO - -#define DEFINE_TO_WITH_BODY(type, body) \ -template<> \ -inline type IValue::to() && { \ - body(std::move(*this)); \ -} \ -template<> \ -inline type IValue::to() const & { \ - body((*this)); \ -} - -#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); -#define LAYOUT_BODY(this) return static_cast(this.toInt()); -#define DEVICE_BODY(this) \ - /* NB: const_list might be a move of the vector, so we need to */ \ - /* assign it to prevent its deallocation. 
*/ \ - auto && const_list = this.toIntList(); \ - const auto & elems = const_list->elements(); \ - JIT_ASSERT(elems.size() == 2); \ - return at::Device(static_cast(elems[0]), elems[1]); - -DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) -DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) -DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) - -#undef DEFINE_TO_WITH_BODY -#undef SCALAR_TYPE_BODY -#undef LAYOUT_BODY -#undef DEVICE_BODY - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::IntList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(IntList::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::string v) -: IValue(ConstantString::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::DoubleList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(DoubleList::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::TensorList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(TensorList::create(std::move(v))) {} - -inline const std::vector& IValue::toIntListRef() const { - return toIntList()->elements(); -} - -inline const std::vector& IValue::toDoubleListRef() const { - return toDoubleList()->elements(); -} - -inline const std::vector& IValue::toTensorListRef() const { - return toTensorList()->elements(); -} - - -}} +#include diff --git a/torch/csrc/jit/source_range.h b/torch/csrc/jit/source_range.h index 0139c2527513a3..227315e643fe4b 100644 --- a/torch/csrc/jit/source_range.h +++ b/torch/csrc/jit/source_range.h @@ -1,5 +1,6 @@ #pragma once #include "torch/csrc/jit/source_location.h" +#include "torch/csrc/jit/assertions.h" namespace torch { From 7f0dd2487d70015218657e61c68750b8169bf9e8 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 17 Sep 2018 18:11:19 -0700 Subject: [PATCH 209/237] Move AT_HOST_DEVICE macro to Macros.h (#10945) Summary: ``` I'm using AT_HOST_DEVICE outside of Half.h in an upcoming PR. Since this changes code without making any semantic changes, I wanted to make this change in a separate PR. 
``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/10945 Differential Revision: D9539821 Pulled By: colesbury fbshipit-source-id: 0daae40ea78b077a543f7bfeec06b225634540de --- aten/src/ATen/core/Half-inl.h | 82 ++++++++++++++--------------- aten/src/ATen/core/Half.h | 22 +++----- aten/src/ATen/core/Macros.h | 11 ++++ aten/src/ATen/core/TensorAccessor.h | 41 +++++---------- 4 files changed, 71 insertions(+), 85 deletions(-) diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h index 42633508e4b714..75ff2a2fe6937f 100644 --- a/aten/src/ATen/core/Half-inl.h +++ b/aten/src/ATen/core/Half-inl.h @@ -16,7 +16,7 @@ namespace at { /// Constructors -inline AT_HOSTDEVICE Half::Half(float value) { +inline AT_HOST_DEVICE Half::Half(float value) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) x = __half_as_short(__float2half(value)); #else @@ -26,7 +26,7 @@ inline AT_HOSTDEVICE Half::Half(float value) { /// Implicit conversions -inline AT_HOSTDEVICE Half::operator float() const { +inline AT_HOST_DEVICE Half::operator float() const { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(*reinterpret_cast(&x)); #else @@ -35,10 +35,10 @@ inline AT_HOSTDEVICE Half::operator float() const { } #ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { +inline AT_HOST_DEVICE Half::Half(const __half& value) { x = *reinterpret_cast(&value); } -inline AT_HOSTDEVICE Half::operator __half() const { +inline AT_HOST_DEVICE Half::operator __half() const { return *reinterpret_cast(&x); } #endif @@ -53,140 +53,140 @@ inline __device__ Half __ldg(const Half* ptr) { /// Arithmetic -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator+(const Half& a, const Half& b) { return static_cast(a) + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator-(const Half& a, const Half& b) { return static_cast(a) - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator*(const Half& a, const Half& b) { return static_cast(a) * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator/(const Half& a, const Half& b) { return static_cast(a) / static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a) { +inline AT_HOST_DEVICE Half operator-(const Half& a) { return -static_cast(a); } -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator+=(Half& a, const Half& b) { a = a + b; return a; } -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator-=(Half& a, const Half& b) { a = a - b; return a; } -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator*=(Half& a, const Half& b) { a = a * b; return a; } -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator/=(Half& a, const Half& b) { a = a / b; return a; } /// Arithmetic with floats -inline AT_HOSTDEVICE float operator+(Half a, float b) { +inline AT_HOST_DEVICE float operator+(Half a, float b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE float operator-(Half a, float b) { +inline AT_HOST_DEVICE float operator-(Half a, float b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE float operator*(Half a, float b) { +inline AT_HOST_DEVICE float 
operator*(Half a, float b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE float operator/(Half a, float b) { +inline AT_HOST_DEVICE float operator/(Half a, float b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE float operator+(float a, Half b) { +inline AT_HOST_DEVICE float operator+(float a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE float operator-(float a, Half b) { +inline AT_HOST_DEVICE float operator-(float a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE float operator*(float a, Half b) { +inline AT_HOST_DEVICE float operator*(float a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE float operator/(float a, Half b) { +inline AT_HOST_DEVICE float operator/(float a, Half b) { return a / static_cast(b); } -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator+=(float& a, const Half& b) { return a += static_cast(b); } -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator-=(float& a, const Half& b) { return a -= static_cast(b); } -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator*=(float& a, const Half& b) { return a *= static_cast(b); } -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator/=(float& a, const Half& b) { return a /= static_cast(b); } /// Arithmetic with doubles -inline AT_HOSTDEVICE double operator+(Half a, double b) { +inline AT_HOST_DEVICE double operator+(Half a, double b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE double operator-(Half a, double b) { +inline AT_HOST_DEVICE double operator-(Half a, double b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE double operator*(Half a, double b) { +inline AT_HOST_DEVICE double operator*(Half a, double b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE double operator/(Half a, double b) { +inline AT_HOST_DEVICE double operator/(Half a, double b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE double operator+(double a, Half b) { +inline AT_HOST_DEVICE double operator+(double a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE double operator-(double a, Half b) { +inline AT_HOST_DEVICE double operator-(double a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE double operator*(double a, Half b) { +inline AT_HOST_DEVICE double operator*(double a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE double operator/(double a, Half b) { +inline AT_HOST_DEVICE double operator/(double a, Half b) { return a / static_cast(b); } /// Arithmetic with ints -inline AT_HOSTDEVICE Half operator+(Half a, int b) { +inline AT_HOST_DEVICE Half operator+(Half a, int b) { return a + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(Half a, int b) { +inline AT_HOST_DEVICE Half operator-(Half a, int b) { return a - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(Half a, int b) { +inline AT_HOST_DEVICE Half operator*(Half a, int b) { return a * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(Half a, int b) { +inline AT_HOST_DEVICE Half operator/(Half a, int b) { return a / static_cast(b); } -inline AT_HOSTDEVICE Half operator+(int a, Half b) { +inline AT_HOST_DEVICE Half operator+(int a, Half b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE Half operator-(int a, Half b) { +inline AT_HOST_DEVICE Half operator-(int a, Half b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE Half operator*(int a, Half b) 
{ +inline AT_HOST_DEVICE Half operator*(int a, Half b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE Half operator/(int a, Half b) { +inline AT_HOST_DEVICE Half operator/(int a, Half b) { return static_cast(a) / b; } diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index e2cbd1174fc91c..47a8e8e52d2adb 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -30,14 +30,6 @@ #include #endif -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#else -#define AT_HOSTDEVICE -#endif -#endif - namespace at { namespace detail { @@ -55,18 +47,18 @@ struct alignas(2) Half { // HIP wants __host__ __device__ tag, CUDA does not #ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; + AT_HOST_DEVICE Half() = default; #else Half() = default; #endif - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; + constexpr AT_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; + inline AT_HOST_DEVICE Half(float value); + inline AT_HOST_DEVICE operator float() const; #ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; + inline AT_HOST_DEVICE Half(const __half& value); + inline AT_HOST_DEVICE operator __half() const; #endif }; @@ -191,5 +183,3 @@ AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at #include "ATen/core/Half-inl.h" - -#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 67efa523ac2bba..244124475bc08f 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -39,6 +39,17 @@ #define AT_CORE_API AT_CORE_IMPORT #endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define AT_HOST_DEVICE __host__ __device__ +#define AT_DEVICE __device__ +#define AT_HOST __host__ +#else +#define AT_HOST_DEVICE +#define AT_HOST +#define AT_DEVICE +#endif + // Disable the copy and assignment operator for a class. Note that this will // disable the usage of the class in std containers. #define AT_DISABLE_COPY_AND_ASSIGN(classname) \ diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index f686f0797b079d..d8a851d998332a 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -2,6 +2,7 @@ #include #include +#include namespace at { @@ -20,18 +21,6 @@ struct RestrictPtrTraits { }; #endif -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#define AT_HOST __host__ -#define AT_DEVICE __device__ -#else -#define AT_HOSTDEVICE -#define AT_HOST -#define AT_DEVICE -#endif -#endif - // TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. // For CUDA tensors it is used in device code (only). This means that we restrict ourselves // to functions and types available there (e.g. IntList isn't). 
@@ -42,7 +31,7 @@ class TensorAccessorBase { public: typedef typename PtrTraits::PtrType PtrType; - AT_HOSTDEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + AT_HOST_DEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} AT_HOST IntList sizes() const { return IntList(sizes_,N); @@ -50,10 +39,10 @@ class TensorAccessorBase { AT_HOST IntList strides() const { return IntList(strides_,N); } - AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } - AT_HOSTDEVICE T *data() { return data_; } - AT_HOSTDEVICE const T *data() const { return data_; } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOST_DEVICE T *data() { return data_; } + AT_HOST_DEVICE const T *data() const { return data_; } protected: PtrType data_; const int64_t* sizes_; @@ -69,14 +58,14 @@ class TensorAccessor : public TensorAccessorBase { public: typedef typename PtrTraits::PtrType PtrType; - AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - AT_HOSTDEVICE TensorAccessor operator[](int64_t i) { + AT_HOST_DEVICE TensorAccessor operator[](int64_t i) { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } - AT_HOSTDEVICE const TensorAccessor operator[](int64_t i) const { + AT_HOST_DEVICE const TensorAccessor operator[](int64_t i) const { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } }; @@ -86,9 +75,9 @@ class TensorAccessor : public TensorAccessorBase { public: typedef typename PtrTraits::PtrType PtrType; - AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - AT_HOSTDEVICE T & operator[](int64_t i) { + AT_HOST_DEVICE T & operator[](int64_t i) { return this->data_[this->strides_[0]*i]; } }; @@ -112,8 +101,8 @@ class PackedTensorAccessorBase { std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); std::copy(strides_, strides_ + N, std::begin(this->strides_)); } - AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } protected: PtrType data_; int64_t sizes_[N]; @@ -157,7 +146,3 @@ class PackedTensorAccessor : public PackedTensorAccessorBase Date: Mon, 17 Sep 2018 21:21:16 -0700 Subject: [PATCH 210/237] SNNTest with Data Preproc Service (#11707) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11707 Trigger SNN offline training test with data preproc service. 
Reviewed By: xsh6528 Differential Revision: D9826978 fbshipit-source-id: f98405ca1e61a7662bf0d9313aaba42436025a83 --- caffe2/python/pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index ee38fe52df8c4e..ade4e6ac9248e8 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -324,7 +324,8 @@ def _pipe_step( elif hasattr(input, 'reader'): reader = input.reader() else: - raise ValueError('in must be a reader, queue or stream.') + raise ValueError( + 'Input must be a reader, queue or stream. Got {}'.format(type(input))) if processor is not None: reader = ProcessingReader(reader, processor) From a7cbcb1bb9f1b8244f2441a0fcc8d491677c988b Mon Sep 17 00:00:00 2001 From: Mingzhe Li Date: Mon, 17 Sep 2018 21:39:10 -0700 Subject: [PATCH 211/237] Enable build_python on windows (#11385) Summary: The PR aims to resolve issues related to BUILD_PYTHON and BUILD_TEST after FULL_CAFFE2 is removed on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11385 Reviewed By: orionr Differential Revision: D9884906 Pulled By: mingzhe09088 fbshipit-source-id: fc114c0cbff6223f1ec261161e4caecc1fef5dd6 --- caffe2/CMakeLists.txt | 14 ++ caffe2/core/common_gpu.h | 2 +- .../include/nomnigraph/Graph/Graph.h | 8 +- .../nomnigraph/Representations/NeuralNet.h | 2 +- .../Transformations/SubgraphMatcher.h | 2 +- caffe2/core/registry.h | 4 +- caffe2/utils/math_gpu.cu | 184 +++++++++--------- cmake/Dependencies.cmake | 15 +- setup.py | 6 - tools/build_pytorch_libs.bat | 2 +- torch/CMakeLists.txt | 1 + 11 files changed, 130 insertions(+), 110 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4e9b3eef8937f4..c90179cea1ddcd 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,6 +291,11 @@ if (MSVC AND NOT BUILD_SHARED_LIBS) # as the latter is not respected by nvcc target_compile_definitions(caffe2 PUBLIC "AT_CORE_STATIC_WINDOWS=1") endif() +if (MSVC AND BUILD_SHARED_LIBS) + # ONNX is linked statically and needs to be exported from this library + # to be used externally. Make sure that references match the export. 
+ target_compile_options(caffe2 PRIVATE "-DONNX_BUILD_MAIN_LIB") +endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) @@ -492,6 +497,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state PRIVATE $) target_include_directories(caffe2_pybind11_state PRIVATE ${Caffe2_CPU_INCLUDE}) @@ -517,6 +525,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_gpu PRIVATE $) target_include_directories(caffe2_pybind11_state_gpu PRIVATE ${Caffe2_CPU_INCLUDE}) target_link_libraries( @@ -542,6 +553,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_hip PRIVATE $) target_include_directories(caffe2_pybind11_state_hip PRIVATE ${Caffe2_CPU_INCLUDE}) target_link_libraries( diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index 4658cd0d756099..b73a6aefa406a0 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -116,7 +116,7 @@ inline int CudaVersion() { return CUDA_VERSION; } /** * Returns the number of devices. */ -int NumCudaDevices(); +CAFFE2_CUDA_API int NumCudaDevices(); /** * Check if the current running session has a cuda gpu present. diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index c6b10f0912eca8..568d46a61ff561 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -39,7 +39,7 @@ class Node; // \brief Edge within a Graph. template -class CAFFE2_API Edge : public StorageType { +class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) @@ -73,7 +73,7 @@ class CAFFE2_API Edge : public StorageType { // \brief Node within a Graph. template -class CAFFE2_API Node : public StorageType, public Notifier> { +class Node : public StorageType, public Notifier> { public: using NodeRef = typename Graph::NodeRef; using EdgeRef = typename Graph::EdgeRef; @@ -152,7 +152,7 @@ class CAFFE2_API Node : public StorageType, public Notifier> { /// for example. /// template -class CAFFE2_API Subgraph { +class Subgraph { public: Subgraph() { DEBUG_PRINT("Creating instance of Subgraph: %p\n", this); @@ -219,7 +219,7 @@ class CAFFE2_API Subgraph { /// Everything is owned by the graph to simplify storage concerns. 
/// template -class CAFFE2_API Graph { +class Graph { public: using SubgraphType = Subgraph; using NodeRef = Node*; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 3abcb2d1057ae9..2a03e428619b30 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -424,7 +424,7 @@ CAFFE2_API void coalesceInsertedDataDependencies(repr::NNModule* m); template struct CAFFE2_EXPORT NodeHelper {}; -struct CAFFE2_API NNNodeMatchCriteria { +struct NNNodeMatchCriteria { std::function predicate; std::string debugString; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index 66494dba3275ce..a303324fbb5701 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -29,7 +29,7 @@ namespace matcher { */ template -class CAFFE2_API MatchNode { +class MatchNode { public: static const int kStarCount = -1; diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index f353e8d6601fab..7db975077ea8b9 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -35,7 +35,7 @@ namespace caffe2 { * objects. */ template -class CAFFE2_API Registry { +class Registry { public: typedef std::function Creator; @@ -104,7 +104,7 @@ class CAFFE2_API Registry { }; template -class CAFFE2_API Registerer { +class Registerer { public: Registerer( const SrcType& key, diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 920bffc0ae3bfb..2bb11cd22ad70e 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -130,7 +130,7 @@ __global__ void BroadcastBinaryOpCUDAKernel( } template -void BinaryOpWith2DBroadcasting( +CAFFE2_CUDA_EXPORT void BinaryOpWith2DBroadcasting( const int rows, const int cols, const bool rowwise_broadcast, @@ -177,7 +177,7 @@ void BinaryOpWith2DBroadcasting( } template -void BroadcastBinaryOpImpl( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOpImpl( const int* A_dims, const int* B_dims, const int* C_dims, @@ -212,7 +212,7 @@ void BroadcastBinaryOpImpl( } template -void BroadcastBinaryOp( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOp( const int A_ndim, const int* A_dims, const int B_ndim, @@ -294,7 +294,7 @@ void BroadcastBinaryOp( } \ } \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, const T* x, T* y, CUDAContext* context) { \ Func##CUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -362,7 +362,7 @@ DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv, utils::Inv) #define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \ template <> \ - void SinCos( \ + CAFFE2_CUDA_EXPORT void SinCos( \ const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ SinCosCUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -376,7 +376,7 @@ CAFFE2_SPECIALIZED_CUDA_SINCOS(double) #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, \ const TIn* A, \ const TIn* B, \ @@ -444,7 +444,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( #define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -463,7 +463,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( 
context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -482,7 +482,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -501,7 +501,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -573,7 +573,7 @@ DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int A_ndim, \ const int* A_dims, \ const int B_ndim, \ @@ -638,7 +638,7 @@ DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ template <> \ - void Funcname( \ + CAFFE2_CUDA_EXPORT void Funcname( \ const int N, \ const T* src, \ T* dst, \ @@ -669,7 +669,7 @@ DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -710,7 +710,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -781,7 +781,7 @@ void Gemm( } template <> -void BiasCHW( +CAFFE2_CUDA_EXPORT void BiasCHW( const float* bias, const float* bias_multiplier, const int bias_channels, @@ -803,7 +803,7 @@ void BiasCHW( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -869,7 +869,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -930,7 +930,7 @@ void GemmStridedBatched( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1059,7 +1059,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1168,7 +1168,7 @@ void GemmStridedBatched( // No change, but required. 
Defer to default CUDA engine template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1186,7 +1186,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1245,7 +1245,7 @@ void Gemm( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1282,7 +1282,7 @@ void GemmStridedBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1321,7 +1321,7 @@ void GemmStridedBatched( #endif // CUDA_VERSION >= 9000 template <> -void GemmEx( +CAFFE2_CUDA_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1362,7 +1362,7 @@ void GemmEx( } template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1415,7 +1415,7 @@ __global__ void AddStripedBatchKernel( #define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \ template <> \ - void AddStripedBatch( \ + CAFFE2_CUDA_EXPORT void AddStripedBatch( \ const int N, \ const T* first, \ T* Y, \ @@ -1434,7 +1434,7 @@ CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16); #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1514,7 +1514,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) { #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ template <> \ - void Set( \ + CAFFE2_CUDA_API void Set( \ const size_t N, const T alpha, T* Y, CUDAContext* context) { \ if (N == 0) { \ return; \ @@ -1542,7 +1542,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); #undef CAFFE2_SPECIALIZED_CUDA_SET template <> -void Set( +CAFFE2_CUDA_EXPORT void Set( const size_t N, const float16 alpha, float16* Y, @@ -1577,7 +1577,7 @@ UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { } // namespace template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const float min, const float max, @@ -1592,7 +1592,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const double min, const double max, @@ -1608,7 +1608,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const int min, const int max, @@ -1642,7 +1642,7 @@ size_t HandleOddLengthRandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const float mean, const float std, @@ -1658,7 +1658,7 @@ void RandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const double mean, const double std, @@ -1671,7 +1671,7 @@ void RandGaussian( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float* a, const float* b, @@ -1683,7 +1683,7 @@ void Dot( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float16* a, const float16* b, @@ -1760,7 +1760,7 @@ __global__ void SumConvertKernel(float* sum, T* dest) { } template -void SumGenericIter( +CAFFE2_CUDA_EXPORT void SumGenericIter( const int N, IterT it, T*& dest, @@ -1789,7 +1789,7 @@ void SumGenericIter( } // namespace template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const float* 
x, float* y, @@ -1804,7 +1804,7 @@ void Sum( } template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const int32_t* x, int32_t* y, @@ -1829,7 +1829,7 @@ struct FloatTransform { #define CAFFE2_MATH_SUM_FUNC(T) \ template <> \ - void Sum( \ + CAFFE2_CUDA_EXPORT void Sum( \ const int N, \ const T* x, \ T* y, \ @@ -1861,7 +1861,7 @@ struct SqrTransform { } // namespace template <> -void SumSqr( +CAFFE2_CUDA_EXPORT void SumSqr( const int N, const float* x, float* y, @@ -1880,7 +1880,7 @@ void SumSqr( #define CAFFE2_MATH_SUMSQR_FUNC(T) \ template <> \ - void SumSqr( \ + CAFFE2_CUDA_EXPORT void SumSqr( \ const int N, \ const T* x, \ T* y, \ @@ -1920,7 +1920,7 @@ SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { } // namespace template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float* x, @@ -1935,7 +1935,7 @@ void Select( } template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float16* x, @@ -1985,7 +1985,7 @@ __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { } // namespace template <> -void Powx( +CAFFE2_CUDA_EXPORT void Powx( const int N, const float* a, const float b, @@ -2000,7 +2000,7 @@ void Powx( #define DELEGATE_CUBLAS_SCALE_FUNCTION(TAlpha, TData, CuBLASFunc) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2024,7 +2024,7 @@ void Powx( } \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2051,7 +2051,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) #define CAFFE2_SPECIALIZED_CUDA_SCALE(TAlpha, TData) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2078,7 +2078,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) context->cuda_stream()>>>(N, alpha, x, y); \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2098,7 +2098,7 @@ CAFFE2_SPECIALIZED_CUDA_SCALE(std::int64_t, std::int64_t) #undef CAFFE2_SPECIALIZED_CUDA_SCALE template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16 alpha, const float16* x, @@ -2129,7 +2129,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16* alpha, const float16* x, @@ -2160,7 +2160,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float alpha, const float16* x, @@ -2193,7 +2193,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float* alpha, const float16* x, @@ -2224,7 +2224,7 @@ void Scale( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float* X, @@ -2236,7 +2236,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const double* X, @@ -2250,7 +2250,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float16* X, @@ -2273,7 +2273,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float* X, @@ -2285,7 +2285,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float16* X, @@ -2379,7 +2379,7 @@ __global__ void AxpbyCUDAKernel( #define 
CAFFE2_SPECIALIZED_CUDA_AXPBY(TCoeff, TData) \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff a, \ const TData* x, \ @@ -2393,7 +2393,7 @@ __global__ void AxpbyCUDAKernel( context->cuda_stream()>>>(n, a, x, b, y); \ } \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff* a, \ const TData* x, \ @@ -2468,7 +2468,7 @@ __global__ void Im2ColNCHWCUDAKernel( } template -__global__ void Im2ColNHWCCUDAKernel( +__global__ void Im2ColNHWCCUDAKernel( const int n, const int input_h, const int input_w, @@ -2519,7 +2519,7 @@ __global__ void Im2ColNHWCCUDAKernel( } template -__global__ void Col2ImNCHWCUDAKernel( +__global__ void Col2ImNCHWCUDAKernel( const int n, const int input_h, const int input_w, @@ -2574,7 +2574,7 @@ __global__ void Col2ImNCHWCUDAKernel( } template -__global__ void Col2ImNHWCCUDAKernel( +__global__ void Col2ImNHWCCUDAKernel( const int n, const int input_w, const int channels, @@ -2627,7 +2627,7 @@ __global__ void Col2ImNHWCCUDAKernel( } template -__global__ void Im2ColNdNCHWCUDAKernel( +__global__ void Im2ColNdNCHWCUDAKernel( const int outer_size, const int inner_size, const int kernel_size, @@ -2683,7 +2683,7 @@ __global__ void Im2ColNdNCHWCUDAKernel( } template -void Im2ColNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Im2ColNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2730,7 +2730,7 @@ void Im2ColNdNCHWCUDAImpl( } template -void Col2ImNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Col2ImNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2780,7 +2780,7 @@ void Col2ImNdNCHWCUDAImpl( } // namespace template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2826,7 +2826,7 @@ void Im2Col( } template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2874,7 +2874,7 @@ void Im2Col( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2920,7 +2920,7 @@ void Col2Im( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2968,7 +2968,7 @@ void Col2Im( } template <> -void Im2ColNd( +CAFFE2_CUDA_EXPORT void Im2ColNd( const int N, const int img_size, const int col_size, @@ -2999,7 +2999,7 @@ void Im2ColNd( } template <> -void Col2ImNd( +CAFFE2_CUDA_EXPORT void Col2ImNd( const int N, const int img_size, const int col_size, @@ -3030,7 +3030,7 @@ void Col2ImNd( } template <> -void CopyMatrix( +CAFFE2_CUDA_EXPORT void CopyMatrix( const size_t itemsize, const int M, const int N, @@ -3082,7 +3082,7 @@ CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(TIndex) #undef CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX template <> -void CopyVector( +CAFFE2_CUDA_EXPORT void CopyVector( const int N, const float* src, float* dst, @@ -3152,7 +3152,7 @@ __global__ void ColwiseReduceKernel( #define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \ template <> \ - void RowwiseMax( \ + CAFFE2_CUDA_EXPORT void RowwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ RowwiseReduceKernel<<< \ std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3166,7 +3166,7 @@ CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float) #define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \ template <> \ - void ColwiseMax( \ + CAFFE2_CUDA_EXPORT void ColwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ ColwiseReduceKernel<<< \ std::min(D, 
CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3188,7 +3188,7 @@ maximum_kernel(const int N, const float alpha, const float* x, float* y) { } // namespace template <> -void Maximum( +CAFFE2_CUDA_EXPORT void Maximum( const int N, const float alpha, const float* x, @@ -3241,7 +3241,7 @@ __global__ void ReduceTensorCUDAKernel( } template -void ReduceTensorCUDAImpl( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3275,7 +3275,7 @@ void ReduceTensorCUDAImpl( } template -void ReduceTensorCUDA( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3353,7 +3353,7 @@ void ReduceTensorCUDA( #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \ template <> \ - void ReduceMin( \ + CAFFE2_CUDA_EXPORT void ReduceMin( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3382,7 +3382,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \ template <> \ - void ReduceMax( \ + CAFFE2_CUDA_EXPORT void ReduceMax( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3411,7 +3411,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \ template <> \ - void ReduceSum( \ + CAFFE2_CUDA_EXPORT void ReduceSum( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3440,7 +3440,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \ template <> \ - void ReduceMean( \ + CAFFE2_CUDA_EXPORT void ReduceMean( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3496,7 +3496,7 @@ __global__ void BroadcastCUDAKernel( } template -void BroadcastCUDAImpl( +CAFFE2_CUDA_EXPORT void BroadcastCUDAImpl( const int X_ndim, const int* X_dims, const int* Y_dims, @@ -3534,7 +3534,7 @@ void BroadcastCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \ template <> \ - void Broadcast( \ + CAFFE2_CUDA_EXPORT void Broadcast( \ const int X_ndim, \ const int* X_dims, \ const int Y_ndim, \ @@ -3676,7 +3676,7 @@ __global__ void MomentsCUDAKernel( } template -void MomentsCUDAImpl( +CAFFE2_CUDA_EXPORT void MomentsCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3700,7 +3700,7 @@ void MomentsCUDAImpl( } template -void MomentsCUDA( +CAFFE2_CUDA_EXPORT void MomentsCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3783,7 +3783,7 @@ void MomentsCUDA( #define CAFFE2_SPECIALIZED_CUDA_MOMENTS(T) \ template <> \ - void Moments( \ + CAFFE2_CUDA_EXPORT void Moments( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3819,7 +3819,7 @@ DELEGATE_INV_STD_KERNEL_FUNCTION(float, rsqrtf) #define CAFFE2_SPECIALIZED_CUDA_INV_STD(T) \ template <> \ - void InvStd( \ + CAFFE2_CUDA_EXPORT void InvStd( \ const int N, \ const T epsilon, \ const T* var, \ @@ -3861,7 +3861,7 @@ __global__ void TransposeCUDAKernel( } template -void TransposeCUDAImpl( +CAFFE2_CUDA_EXPORT void TransposeCUDAImpl( const int* dims, const int* axes, const T* X, @@ -3886,7 +3886,7 @@ void TransposeCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \ template <> \ - void Transpose( \ + CAFFE2_CUDA_EXPORT void Transpose( \ const int ndim, \ const int* dims, \ const int* axes, \ @@ -3933,7 +3933,7 @@ __global__ void AffineChannelCUDAKernel( #define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T, kOrder) \ template <> \ - void AffineChannel( \ + CAFFE2_CUDA_EXPORT void AffineChannel( \ const int N, \ const int C, \ const int HxW, \ diff --git a/cmake/Dependencies.cmake 
b/cmake/Dependencies.cmake index dbec0d07ebf6e3..4fbf634cf7ac71 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -349,7 +349,9 @@ if(BUILD_PYTHON) execute_process( COMMAND "which" "python" RESULT_VARIABLE _exitcode OUTPUT_VARIABLE _py_exe) if(${_exitcode} EQUAL 0) - string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + if (NOT MSVC) + string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + endif() message(STATUS "Setting Python to ${PYTHON_EXECUTABLE}") endif() endif() @@ -388,7 +390,11 @@ if(BUILD_PYTHON) pycmd_no_exit(_py_lib _exitcode "from sysconfig import get_paths; print(get_paths()['stdlib'])") if("${_exitcode}" EQUAL 0 AND EXISTS "${_py_lib}" AND EXISTS "${_py_lib}") SET(PYTHON_LIBRARY "${_py_lib}") - message(STATUS "Setting Python's library to ${_py_lib}") + if (MSVC) + STRING(REPLACE "Lib" "libs" _py_static_lib ${_py_lib}) + link_directories(${_py_static_lib}) + endif() + message(STATUS "Setting Python's library to ${PYTHON_LIBRARY}") endif() endif(NOT DEFINED PYTHON_LIBRARY) @@ -793,6 +799,11 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # We will build onnx as static libs and embed it directly into the binary. set(BUILD_SHARED_LIBS OFF) + if (MSVC AND BUILD_SHARED_LIBS) + # That also means we want to export all symbols from the shared + # library we are building + set(ONNX_BUILD_MAIN_LIB ON) + endif() set(ONNX_USE_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) set(ONNX_USE_LITE_PROTO ${CAFFE2_USE_LITE_PROTO}) # If linking local protobuf, make sure ONNX has the same protobuf diff --git a/setup.py b/setup.py index 37eaf51f0d3908..5fbe75833da0fb 100644 --- a/setup.py +++ b/setup.py @@ -875,12 +875,6 @@ def run(self): CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib')) if USE_ROCM: CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib')) - # Windows needs direct access to ONNX libraries as well - # as through Caffe2 library - CAFFE2_LIBS += [ - os.path.join(lib_path, 'onnx.lib'), - os.path.join(lib_path, 'onnx_proto.lib'), - ] if DEBUG: PROTOBUF_STATIC_LIB = os.path.join(lib_path, 'libprotobufd.lib') else: diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 34d7ea1ac0d206..acc5bed4a98bfb 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -176,7 +176,7 @@ goto:eof -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ -DNO_API=ON ^ -DBUILD_SHARED_LIBS="%BUILD_SHARED_LIBS%" ^ - -DBUILD_PYTHON=OFF ^ + -DBUILD_PYTHON=%BUILD_PYTHON% ^ -DBUILD_BINARY=%BUILD_BINARY% ^ -DBUILD_TEST=OFF ^ -DINSTALL_TEST=%INSTALL_TEST% ^ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 2c1ff700dd2858..58814d21ffa301 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -343,6 +343,7 @@ if(USE_CUDA) set(TORCH_CUDA_LIBRARIES ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib ${CUDA_LIBRARIES}) + set_target_properties(torch PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") target_include_directories(torch PRIVATE "${NVTOOLEXT_HOME}/include") elseif(APPLE) set(TORCH_CUDA_LIBRARIES From 3cbec5453ba92d1f6def57f958b2b8bb14da1ce3 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Mon, 17 Sep 2018 21:47:00 -0700 Subject: [PATCH 212/237] Reorder statements for readability (#11764) Summary: I was reading this a couple times before figuring out it's also the entry point for the MPI_COMM_WORLD. Reordered statements and added comment to clarify. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11764 Differential Revision: D9882834 Pulled By: pietern fbshipit-source-id: a9282d55368815925fd695a2541354e5aec599da --- torch/lib/c10d/ProcessGroupMPI.cpp | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 03699962c15b59..63846b443ea072 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -253,33 +253,30 @@ std::shared_ptr ProcessGroupMPI::createProcessGroupMPI( MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &size)); MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); - globalLock.unlock(); - if (rank < 0 || size < 0) { throw std::runtime_error("Failed to get the world_size / rank"); } + // If no ranks are specified, assume we're creating the root group if (ranks.empty()) { + globalLock.unlock(); return std::make_shared(rank, size, MPI_COMM_WORLD); - } else { - std::unique_lock globalLock(pgGlobalMutex_); + } - MPI_Group worldGroup; - MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); + MPI_Group worldGroup; + MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); - MPI_Group ranksGroup; - MPI_CHECK( - MPI_Group_incl(worldGroup, ranks.size(), ranks.data(), &ranksGroup)); + MPI_Group ranksGroup; + MPI_CHECK(MPI_Group_incl(worldGroup, ranks.size(), ranks.data(), &ranksGroup)); - MPI_Comm groupComm; - MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)); + MPI_Comm groupComm; + MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)); - MPI_CHECK(MPI_Group_free(&worldGroup)); - MPI_CHECK(MPI_Group_free(&ranksGroup)); + MPI_CHECK(MPI_Group_free(&worldGroup)); + MPI_CHECK(MPI_Group_free(&ranksGroup)); - globalLock.unlock(); - return std::make_shared(rank, size, groupComm); - } + globalLock.unlock(); + return std::make_shared(rank, size, groupComm); } ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pgComm) From a02685e1092d264b34d4511b3a5a12c8ec3840e4 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Mon, 17 Sep 2018 21:47:43 -0700 Subject: [PATCH 213/237] Fix test_torch's test_potri (#11770) Summary: tset_potri -> test_potri, even though it has been like this for a long time More a curiosity than grave functionality... 
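For context, `torch.potri` reconstructs the inverse of a symmetric positive-definite matrix from its Cholesky factor, which is what the (now actually executed) test compares against `torch.inverse`. A minimal sketch of that check, using a made-up well-conditioned input rather than the fixed matrix from the test:
```
import torch

a = torch.randn(5, 5, dtype=torch.float64)
a = torch.mm(a, a.t()) + torch.eye(5, dtype=torch.float64)  # symmetric positive definite

chol = torch.potrf(a, True)              # upper-triangular Cholesky factor of `a`
inv_from_chol = torch.potri(chol, True)  # inverse of `a`, rebuilt from the factor

print(torch.allclose(inv_from_chol, torch.inverse(a)))  # True
```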
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11770 Reviewed By: ezyang Differential Revision: D9884767 Pulled By: soumith fbshipit-source-id: 9bedde2e94ade281ab1ecc2293ca3cb1a0107387 --- test/test_torch.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index fbabc12d32e051..ad803347317d80 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5233,7 +5233,7 @@ def test_potrs(self): self.assertLessEqual(b.dist(torch.mm(a, x)), 1e-12) @skipIfNoLapack - def tset_potri(self): + def test_potri(self): a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), (-6.05, -3.30, 5.36, -4.44, 1.08), (-0.45, 2.58, -2.70, 0.27, 9.04), @@ -5241,7 +5241,7 @@ def tset_potri(self): (-9.67, -5.14, -7.26, 6.08, -6.87))).t() # make sure 'a' is symmetric PSD - a = a * a.t() + a = torch.mm(a, a.t()) # compute inverse directly inv0 = torch.inverse(a) @@ -5252,13 +5252,13 @@ def tset_potri(self): self.assertLessEqual(inv0.dist(inv1), 1e-12) # upper Triangular Test - chol = torch.potrf(a, 'U') - inv1 = torch.potri(chol, 'U') + chol = torch.potrf(a, True) + inv1 = torch.potri(chol, True) self.assertLessEqual(inv0.dist(inv1), 1e-12) # lower Triangular Test - chol = torch.potrf(a, 'L') - inv1 = torch.potri(chol, 'L') + chol = torch.potrf(a, False) + inv1 = torch.potri(chol, False) self.assertLessEqual(inv0.dist(inv1), 1e-12) @skipIfNoLapack From bd43d64dd5b7e96b70e84add0a2d6c5256632a25 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 17 Sep 2018 21:57:37 -0700 Subject: [PATCH 214/237] Add strides to Tensor (#11763) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11763 baseline-std vector ``` ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 6.74us 148.26K TensorShareData 5.89us 169.78K TensorShareExternalPointer 1.01us 994.35K TensorReallocation 2.46us 405.78K ============================================================================ ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 7.50us 133.27K TensorShareData 7.07us 141.38K TensorShareExternalPointer 1.05us 955.19K TensorReallocation 2.55us 391.62K ============================================================================ ``` baseline-smallvector ``` ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 6.56us 152.34K TensorShareData 5.84us 171.32K TensorShareExternalPointer 962.49ns 1.04M TensorReallocation 2.32us 431.73K ============================================================================ ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 6.29us 159.04K TensorShareData 5.73us 174.39K TensorShareExternalPointer 914.90ns 1.09M TensorReallocation 2.29us 435.80K 
============================================================================ ``` Reviewed By: ezyang Differential Revision: D9694097 fbshipit-source-id: c462e770a4b40e640d8c9d38e0ae7036a4e6e84a --- aten/src/ATen/DimVector.h | 11 +----- aten/src/ATen/core/DimVector.h | 11 ++++++ caffe2/core/tensor.h | 12 ++++++ caffe2/core/tensor_impl.h | 68 ++++++++++++++++++++++++++++++++-- 4 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 aten/src/ATen/core/DimVector.h diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h index 9bbf37b5a9f4d4..cb652fffcb1481 100644 --- a/aten/src/ATen/DimVector.h +++ b/aten/src/ATen/DimVector.h @@ -1,11 +1,2 @@ #pragma once - -#include -#include - -namespace at { - -/// A container for sizes or strides -using DimVector = SmallVector; - -} +#include diff --git a/aten/src/ATen/core/DimVector.h b/aten/src/ATen/core/DimVector.h new file mode 100644 index 00000000000000..a98c841a94777b --- /dev/null +++ b/aten/src/ATen/core/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} // namespace at diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index a167658060e7fc..27c09f00c4c1e2 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -286,6 +286,18 @@ class CAFFE2_API Tensor final { return impl_.get()->canonical_axis_index(axis_index); } + inline int64_t stride(int64_t dim) const { + return impl_.get()->stride(dim); + } + + inline at::DimVector strides() { + return impl_.get()->strides(); + } + + inline bool is_contiguous() const { + return impl_.get()->is_contiguous(); + } + template inline bool IsType() const { return impl_.get()->IsType(); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 3cd5bd23d446fb..3f42ed36b30954 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -1,7 +1,7 @@ #pragma once +#include #include - #include #include "caffe2/core/allocator.h" @@ -133,12 +133,17 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return; } if (data_type_ != src.meta()) { + CAFFE_ENFORCE_WITH_CALLER( + src.is_contiguous(), + "Right now only copy of contiguous source Tensor is supported."); storage_ = at::Storage(GetDeviceType(), src.meta()); data_type_ = src.meta(); } if (src.size() == -1) { dims_.clear(); numel_ = -1; + strides_.clear(); + is_contiguous_ = true; storage_.reset(); data_type_ = TypeMeta(); return; @@ -203,6 +208,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); auto newDims = dims_; newDims[0] += num; if (!storage_.data()) { @@ -243,6 +251,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * that the extra capacity after the end of the shurnk tensor is maintained. 
*/ void ShrinkTo(TIndex outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShrinkTo is only supported on contiguous Tensor."); CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); CAFFE_ENFORCE_WITH_CALLER( outer_dim <= dims_[0], @@ -266,6 +277,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ template void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); CAFFE_ENFORCE( numel_ != -1, "size should be initialized before calling ReserveSpace"); CAFFE_ENFORCE( @@ -335,6 +349,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * sugar wrapper that essentially calls Resize(src_tensor.dims()). */ inline void ResizeLike(const TensorImpl& src_tensor) { + CAFFE_ENFORCE_WITH_CALLER( + src_tensor.is_contiguous(), + "Right now ResizeLike is only supported for contiguous Tensor."); // Note: need casting for different context types. if (static_cast(this) != static_cast(&src_tensor)) { Resize(src_tensor.dims()); @@ -346,6 +363,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * This requires the total size of the tensor to remains constant. */ inline void Reshape(const std::vector& dims) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); TIndex new_size = 1; for (auto d : dims) { CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); @@ -454,6 +474,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { const TypeMeta& data_type, size_t capacity = 0, MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShareExternalPointer is only supported for contiguos Tensor."); CAFFE_ENFORCE_WITH_CALLER( data_type.id() != TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to pass in an " @@ -706,6 +729,25 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return canonical_axis_index_(axis_index, ndim()); } + inline int64_t stride(int64_t dim) const { +#ifndef NDEBUG + // TODO: dim wrapping? + CAFFE_ENFORCE_LT_WITH_CALLER(dim, strides_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER( + dim, 0, "Cannot have negative dimension index"); +#endif + return strides_[dim]; + } + + // TODO: Change to ArrayRef later + inline at::DimVector strides() { + return strides_; + } + + inline bool is_contiguous() const { + return is_contiguous_; + } + /** * Checks if the tensor content is of the given data type. 
*/ @@ -772,9 +814,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } protected: - using DimVector = std::vector; - DimVector dims_; // sizes_ + // TODO: change to DimVector + std::vector dims_; // sizes_ + at::DimVector strides_; TIndex numel_ = -1; // numel_ + bool is_contiguous_ = true; // we decide to keep reserved_ and it will // live in Tensor after the split // The logic is that if Extend() or ReserveSpace() were ever called, @@ -796,6 +840,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { new_numel *= src[i]; dims_[i] = src[i]; } + update_strides(); numel_ = new_numel; return numel_ != old_numel; } @@ -803,6 +848,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { bool SetDims() { auto old_numel = numel_; dims_.resize(0); + update_strides(); numel_ = 1; return numel_ != old_numel; } @@ -814,6 +860,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto old_numel = numel_; dims_.resize(1); dims_[0] = d0; + update_strides(); numel_ = d0; return numel_ != old_numel; } @@ -823,6 +870,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_.resize(2); dims_[0] = d0; dims_[1] = d1; + update_strides(); numel_ = d0 * d1; return numel_ != old_numel; } @@ -833,6 +881,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_[0] = d0; dims_[1] = d1; dims_[2] = d2; + update_strides(); numel_ = d0 * d1 * d2; return numel_ != old_numel; } @@ -845,9 +894,22 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_[1] = d1; dims_[2] = d2; dims_[3] = d3; + update_strides(); numel_ = d0 * d1 * d2 * d3; return numel_ != old_numel; } + + inline void update_strides() { + strides_.resize(dims_.size()); + if (ndim() > 0) { + int last_idx = ndim() - 1; + strides_[last_idx] = 1; + for (auto i = last_idx - 1; i >= 0; --i) { + strides_[i] = strides_[i + 1] * std::max(dims_[i + 1], 1); + } + } + is_contiguous_ = true; + } }; } From 63c811b3a68d9c23addc219cffb691678b7407c2 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 17 Sep 2018 23:15:57 -0700 Subject: [PATCH 215/237] Include some JIT things in C++ docs (#11712) Summary: Since we're making parts of the JIT public as part of loading script modules, they should be on the cppdocs website. Orthogonal: We decided not to export things like `IValue` into the `torch` namespace, so `RegisterOperators` shouldn't be there either. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11712 Differential Revision: D9837578 Pulled By: goldsborough fbshipit-source-id: 4c06d2fa9dd4b4216951f27424c2ce795febab9c --- docs/cpp/Doxyfile | 4 ++++ docs/cpp/check-doxygen.sh | 5 +++-- test/custom_operator/op.cpp | 2 +- torch/csrc/jit/import.h | 11 +++++++++-- torch/script.h | 5 ----- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile index 3c942a9025c2dd..37abadc83ff7ca 100644 --- a/docs/cpp/Doxyfile +++ b/docs/cpp/Doxyfile @@ -752,6 +752,10 @@ WARN_LOGFILE = INPUT = ../../torch/csrc/api/include \ ../../torch/csrc/api/src \ + ../../torch/csrc/jit/custom_operator.h \ + ../../torch/csrc/jit/import.h \ + ../../torch/csrc/jit/ivalue.h \ + ../../torch/csrc/jit/script/module.h \ ../../aten/src/ATen/ATen.h \ ../../aten/src/ATen/Backend.h \ ../../aten/src/ATen/Device.h \ diff --git a/docs/cpp/check-doxygen.sh b/docs/cpp/check-doxygen.sh index 375347974ac749..0af06ad2350c39 100755 --- a/docs/cpp/check-doxygen.sh +++ b/docs/cpp/check-doxygen.sh @@ -21,11 +21,12 @@ ignore_warning "warning: source ../../build/aten/src/ is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Tensor.h is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Functions.h is not a readable file" ignore_warning "warning: documented symbol \`torch::nn::FunctionalImpl::FunctionalImpl' was not declared or defined" +ignore_warning "functional.h:81: warning: Found ';' while parsing initializer list!" # Count the number of remaining warnings. -warnings=$(grep 'warning:' doxygen-log.txt | wc -l) +warnings="$(grep 'warning:' doxygen-log.txt | wc -l)" -if [[ $warnings != 0 ]]; then +if [[ "$warnings" -ne "0" ]]; then echo "Filtered output" cat doxygen-log.txt rm -f doxygen-log.txt original-doxygen-log.txt diff --git a/test/custom_operator/op.cpp b/test/custom_operator/op.cpp index f48189728ea293..c7f7ee1e7ab3f2 100644 --- a/test/custom_operator/op.cpp +++ b/test/custom_operator/op.cpp @@ -16,7 +16,7 @@ std::vector custom_op( } static auto registry = - torch::RegisterOperators() + torch::jit::RegisterOperators() // We parse the schema for the user. .op("custom::op", &custom_op) // User provided schema. Among other things, allows defaulting values, diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index 21f9d605ec59b3..6ce901c4369961 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -3,7 +3,8 @@ #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/script/module.h" -namespace torch { namespace jit { +namespace torch { +namespace jit { using ModuleLookup = std::function( const std::vector&)>; @@ -12,6 +13,12 @@ TORCH_API void import_ir_module( ModuleLookup module_lookup, const std::string& filename); +/// Loads a serialized `script::Module` from the given `filename`. +/// +/// The file stored at the location given in `filename` must contain a +/// serialized `script::Module`, exported either via `ScriptModule.save()` in +/// Python or `torch::jit::ExportModule` in C++. 
TORCH_API std::shared_ptr load(const std::string& filename); -}} +} // namespace jit +} // namespace torch diff --git a/torch/script.h b/torch/script.h index 9033857d672925..01f9e38cd7766d 100644 --- a/torch/script.h +++ b/torch/script.h @@ -6,8 +6,3 @@ #include #include - -namespace torch { -using jit::createOperator; -using jit::RegisterOperators; -} // namespace torch From 407a9fee0c8fb172459ac9b3556419305e47f8a7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 17 Sep 2018 23:24:49 -0700 Subject: [PATCH 216/237] make copy constructed tensor a leaf variable when using torch.tensor(sourceTensor) (#11061) Summary: - fix https://github.com/pytorch/pytorch/issues/10876 - the cause of the bug is because copy constructor cannot distinguish between default value of requires_grad and requires_grad=False, thus it makes a copy from source tensor along with its grad_fn if requires_grad=True at source - with this fix, the behavior becomes ``` >>> source = torch.randn(2, 2, requires_grad=True) >>> copy = torch.tensor(source, requires_grad=True) >>> print(copy) tensor([[-1.2001, 1.9869], [-1.0134, 1.3096]], grad_fn=) >>> source = torch.randn(2, 2, requires_grad=True) >>> copy = torch.tensor(source, requires_grad=False) >>> print(copy) tensor([[-0.7402, 0.0467], [ 0.4344, -0.0420]]) >>> source = torch.randn(2, 2, requires_grad=True) >>> copy = torch.tensor(source) >>> print(copy) tensor([[-0.7402, 0.0467], [ 0.4344, -0.0420]]) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11061 Differential Revision: D9569714 Pulled By: weiyangfb fbshipit-source-id: ea368688bdc0f1ce5997870e164e42835b64b4a1 --- test/test_torch.py | 25 +++++++++++++++++++++++++ torch/_torch_docs.py | 7 +++++++ torch/csrc/utils/tensor_new.cpp | 16 +++++++++++++--- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index ad803347317d80..8fcca02207c89c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2396,6 +2396,31 @@ def test_tensor_factory(self): a[0] = 7. self.assertEqual(5., res1[0].item()) + def test_tensor_factory_copy_var(self): + # default copy from var + source = torch.randn(5, 5, requires_grad=True) + copy = torch.tensor(source) + self.assertEqual(copy.data, source.data) + self.assertTrue(source.requires_grad) + self.assertTrue(copy.is_leaf) + self.assertFalse(copy.requires_grad) + + # copy with requires_grad=False + source = torch.randn(5, 5, requires_grad=True) + copy = torch.tensor(source, requires_grad=False) + self.assertEqual(copy.data, source.data) + self.assertTrue(source.requires_grad) + self.assertTrue(copy.is_leaf) + self.assertFalse(copy.requires_grad) + + # copy with requires_grad=True + source = torch.randn(5, 5, requires_grad=True) + copy = torch.tensor(source, requires_grad=True) + self.assertEqual(copy.data, source.data) + self.assertTrue(source.requires_grad) + self.assertTrue(copy.is_leaf) + self.assertTrue(copy.requires_grad) + def test_tensor_factory_type_inference(self): def test_inference(default_dtype): saved_dtype = torch.get_default_dtype() diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 64d93a5916ab16..432cb64d722765 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3818,6 +3818,13 @@ def parse_kwargs(desc): If you have a NumPy ``ndarray`` and want to avoid a copy, use :func:`torch.from_numpy`. +.. warning:: + + When data is a tensor `x`, :func:`torch.tensor` reads out 'the data' from whatever it is passed, + and constructs a leaf variable. 
Therefore ``torch.tensor(x)`` is equivalent to ``x.clone().detach()`` + and ``torch.tensor(x, requires_grad=True)`` is equivalent to ``x.clone().detach().requires_grad_(True)``. + The equivalents use ``clone()`` and ``detach()`` are recommended. + Args: {data} {dtype} diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index ebed9c07fad7a7..1b6ec75d13ebc2 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -197,7 +197,7 @@ void recursive_store(char* data, IntList sizes, IntList strides, int64_t dim, Tensor internal_new_from_data(const Type & type, at::optional device_opt, PyObject* data, bool copy_variables, bool copy_numpy, - bool type_inference) { + bool type_inference, bool args_requires_grad=false) { int32_t device_index = -1; if (device_opt.has_value()) { device_index = device_opt->index(); @@ -207,6 +207,10 @@ Tensor internal_new_from_data(const Type & type, at::optional device_opt } if (THPVariable_Check(data)) { + PyErr_WarnEx(PyExc_UserWarning, + "To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() " + "or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).", 1); + auto var = reinterpret_cast(data)->cdata; auto type_inference_device_type = device_opt.has_value() ? device_opt->type() : torch::getDeviceType(var.type()); @@ -216,8 +220,11 @@ Tensor internal_new_from_data(const Type & type, at::optional device_opt *torch::getLayout(type.backend()), type_inference_device_type); const auto& type_to_use = type_inference ? type_inference_type : type; - return copy_variables ? new_with_tensor_copy(type_to_use, var, device_index) : + auto new_tensor = copy_variables ? new_with_tensor_copy(type_to_use, var, device_index) : new_with_type_conversion(type_to_use, var, device_index); + new_tensor.detach_(); // making copy constructed tensor a leaf node + new_tensor.set_requires_grad(args_requires_grad); + return new_tensor; } #ifdef USE_NUMPY @@ -500,13 +507,16 @@ Tensor tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { bool type_inference = r.isNone(1); + // args_requires_grad=True if requires_grad is set to True (requires_grad=True) + bool args_requires_grad = r.toBool(3); return internal_new_from_data( typeWithDefault(r, 1, 2, type), r.deviceOptional(2), r.pyobject(0), true, true, - type_inference) + type_inference, + args_requires_grad) .set_requires_grad(r.toBool(3)); } throw std::runtime_error("tensor(): invalid arguments"); From e734c94fa2d4399d33b37a3dcc9096a0dbb776e5 Mon Sep 17 00:00:00 2001 From: Marc Ferradou Date: Mon, 17 Sep 2018 23:44:09 -0700 Subject: [PATCH 217/237] Quick update to embedding_bag doc (#11784) Summary: Related to #11624 adding maxes to the function def of embedding_bag. 
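A minimal sketch of the 'max' mode that the updated wording now covers; the table, indices and offsets below are made up purely for illustration:
```
import torch
import torch.nn.functional as F

weight = torch.randn(10, 3)                      # embedding table: 10 vectors of size 3
input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])   # flat indices
offsets = torch.tensor([0, 4])                   # two bags: input[0:4] and input[4:8]

out = F.embedding_bag(input, weight, offsets, mode='max')
print(out.shape)  # torch.Size([2, 3]) -- per-bag, feature-wise max instead of sum/mean
```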
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11784 Differential Revision: D9892598 Pulled By: ezyang fbshipit-source-id: e6372ccf631826ddf1e1885b2f8f75f354a36c0b --- torch/nn/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 61c0fdce5fe048..3c67393a01119b 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1206,7 +1206,7 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2, def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2, scale_grad_by_freq=False, mode='mean', sparse=False): - r"""Computes sums or means of 'bags' of embeddings, without instantiating the + r"""Computes sums, means or maxes of 'bags' of embeddings, without instantiating the intermediate embeddings. See :class:`torch.nn.EmbeddingBag` for more details. From 91b6458e2d0dba935da2cc7c2cdc6d7907bc3f48 Mon Sep 17 00:00:00 2001 From: nehz Date: Tue, 18 Sep 2018 01:11:55 -0700 Subject: [PATCH 218/237] Container __getitem__ slicing for subclasses (#11694) Summary: Simple change to allow ModuleList subclasses's `__getitem__(slice)` to return class of subclass rather than ModuleList Pull Request resolved: https://github.com/pytorch/pytorch/pull/11694 Differential Revision: D9892824 Pulled By: ezyang fbshipit-source-id: b75e9c196487f55cb93f0dab6c20d850e8e759ff --- torch/nn/modules/container.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 01e12e621ba4ac..15590dbe5c9a92 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -63,7 +63,7 @@ def _get_item_by_idx(self, iterator, idx): def __getitem__(self, idx): if isinstance(idx, slice): - return Sequential(OrderedDict(list(self._modules.items())[idx])) + return self.__class__(OrderedDict(list(self._modules.items())[idx])) else: return self._get_item_by_idx(self._modules.values(), idx) @@ -132,7 +132,7 @@ def _get_abs_string_index(self, idx): def __getitem__(self, idx): if isinstance(idx, slice): - return ModuleList(list(self._modules.values())[idx]) + return self.__class__(list(self._modules.values())[idx]) else: return self._modules[self._get_abs_string_index(idx)] @@ -334,7 +334,7 @@ def __init__(self, parameters=None): def __getitem__(self, idx): if isinstance(idx, slice): - return ParameterList(list(self._parameters.values())[idx]) + return self.__class__(list(self._parameters.values())[idx]) else: idx = operator.index(idx) if not (-len(self) <= idx < len(self)): From e2bc95e1bd39f4cd1813a21290f2f8f6fb51c0eb Mon Sep 17 00:00:00 2001 From: Xingdong Zuo Date: Tue, 18 Sep 2018 07:33:31 -0700 Subject: [PATCH 219/237] add `ModuleList.insert` (#11664) Summary: fixes #11652 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11664 Differential Revision: D9892845 Pulled By: ezyang fbshipit-source-id: 2c910d6bc0b28a999e25beca6e398fd0f35535c5 --- test/test_nn.py | 3 +++ torch/nn/modules/container.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/test/test_nn.py b/test/test_nn.py index 29e7c6ba8ebcc9..abeddaa2f0cc9a 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1137,6 +1137,9 @@ def check(): modules += [nn.Conv2d(3, 4, 3)] module_list += [modules[-1]] check() + modules.insert(1, nn.Linear(3, 2)) + module_list.insert(1, modules[1]) + check() modules.append(nn.Tanh()) module_list.append(modules[-1]) check() diff --git a/torch/nn/modules/container.py 
b/torch/nn/modules/container.py index 15590dbe5c9a92..dc8f61c2309c21 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -164,6 +164,17 @@ def __dir__(self): keys = [key for key in keys if not key.isdigit()] return keys + def insert(self, index, module): + r"""Insert a given module before a given index in the list. + + Arguments: + index (int): index to insert. + module (nn.Module): module to insert + """ + for i in range(len(self._modules), index, -1): + self._modules[str(i)] = self._modules[str(i - 1)] + self._modules[str(index)] = module + def append(self, module): r"""Appends a given module to the end of the list. From 4ee0a78ee6452b7c1584b6020aef791b9d3e8ab6 Mon Sep 17 00:00:00 2001 From: Amitesh Arora Date: Tue, 18 Sep 2018 07:36:15 -0700 Subject: [PATCH 220/237] varargs for meshgrid (#11600) Summary: Adds vararg support for meshgrid and adds checks for all the tensor arguments to have the same dtype and device. Fixes: [#10823](https://github.com/pytorch/pytorch/issues/10823), #11446 The earlier pull request closed without any changes because I had some rebasing issues, so I made another pull request to close out #10823. Sorry for the inconvenience. Differential Revision: D9892876 Pulled By: ezyang fbshipit-source-id: 93d96cafc876102ccbad3ca2cc3d81cb4c9bf556 --- aten/src/ATen/native/TensorShape.cpp | 4 +++ test/test_torch.py | 7 +++++ torch/_torch_docs.py | 33 ------------------------ torch/functional.py | 38 ++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c22374d0964e81..634e7a443d21fd 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -596,6 +596,10 @@ std::vector meshgrid(TensorList tensors) { AT_ERROR("Expected scalar or 1D tensor in the tensor list but got: ", tensors[i]); } } + for(int64_t i = 0; i < size - 1; i++){ + AT_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); + AT_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); + } std::vector grids; for(int64_t i = 0; i < size; i++) { std::vector view_shape(size, 1); diff --git a/test/test_torch.py b/test/test_torch.py index 8fcca02207c89c..9a67440a9961a0 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8899,6 +8899,10 @@ def test_meshgrid(self): self.assertEqual(grid_a.shape, torch.Size([1, 3, 2])) self.assertEqual(grid_b.shape, torch.Size([1, 3, 2])) self.assertEqual(grid_c.shape, torch.Size([1, 3, 2])) + grid_a2, grid_b2, grid_c2 = torch.meshgrid(a, b, c) + self.assertEqual(grid_a2.shape, torch.Size([1, 3, 2])) + self.assertEqual(grid_b2.shape, torch.Size([1, 3, 2])) + self.assertEqual(grid_c2.shape, torch.Size([1, 3, 2])) expected_grid_a = torch.ones(1, 3, 2, dtype=torch.int64) expected_grid_b = torch.tensor([[[1, 1], [2, 2], @@ -8909,6 +8913,9 @@ def test_meshgrid(self): self.assertTrue(grid_a.equal(expected_grid_a)) self.assertTrue(grid_b.equal(expected_grid_b)) self.assertTrue(grid_c.equal(expected_grid_c)) + self.assertTrue(grid_a2.equal(expected_grid_a)) + self.assertTrue(grid_b2.equal(expected_grid_b)) + self.assertTrue(grid_c2.equal(expected_grid_c)) @unittest.skipIf(torch.cuda.is_available(), "CUDA is available, can't test CUDA not built error") def test_cuda_not_built(self): diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 432cb64d722765..9abb9f1bbf76d3 100644 --- 
a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -5997,36 +5997,3 @@ def parse_kwargs(desc): >>> [7, 8, 9]])) (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) """) - - -add_docstr(torch.meshgrid, - r""" -meshgrid(seq) -> seq - -Take a sequence of :math:`N` tensors, each of which can be either scalar or 1-dimensional -vector, and create :math:`N` N-dimensional grids, where the :math:`i` th grid is defined by -expanding the :math:`i` th input over dimensions defined by other inputs. - -Arguments: - seq (sequence of Tensors): sequence of scalars or 1 dimensional tensors. Scalars will be - treated as tensors of size :math:`(1,)` automatically. - -Returns: - seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, - then the output would also has :math:`k` tensors, where all tensors are - of size :math:`(N_1, N_2, \ldots , N_k)`. - -Example:: - - >>> x = torch.tensor([1, 2, 3]) - >>> y = torch.tensor([4, 5, 6]) - >>> grid_x, grid_y = torch.meshgrid([x, y]) - >>> grid_x - tensor([[1, 1, 1], - [2, 2, 2], - [3, 3, 3]]) - >>> grid_y - tensor([[4, 5, 6], - [4, 5, 6], - [4, 5, 6]]) -""") diff --git a/torch/functional.py b/torch/functional.py index e31903db5a7129..40c74ba59abd94 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -16,6 +16,7 @@ 'isfinite', 'isinf', 'isnan', + 'meshgrid', 'split', 'stft', 'tensordot', @@ -278,6 +279,43 @@ def isinf(tensor): return tensor.abs() == inf +def meshgrid(*tensors, **kwargs): + r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional +vector, and create :math:`N` N-dimensional grids, where the :math:`i`th grid is defined by +expanding the :math:`i`th input over dimensions defined by other inputs. + + + Args: + tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be + treated as tensors of size :math:`(1,)` automatically + + Returns: + seq (sequence of Tensors): If the input has :math:`k` tensors of size + :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also has :math:`k` tensors, + where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> y = torch.tensor([4, 5, 6]) + >>> grid_x, grid_y = torch.meshgrid(x, y) + >>> grid_x + tensor([[1, 1, 1], + [2, 2, 2], + [3, 3, 3]]) + >>> grid_y + tensor([[4, 5, 6], + [4, 5, 6], + [4, 5, 6]]) + """ + if kwargs: + raise TypeError("meshgrid() got an unexpected keyword argument '%s'" % (list(kwargs)[0],)) + if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): + # the old interface of passing the operands as one list argument + tensors = tensors[0] + return torch._C._VariableFunctions.meshgrid(tensors) + + def stft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, pad_mode='reflect', normalized=False, onesided=True): r"""Short-time Fourier transform (STFT). From e00fb69b251c7a1af2b172cc5c059945686b71b5 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Tue, 18 Sep 2018 07:59:41 -0700 Subject: [PATCH 221/237] Use CATCH prefix to avoid name conflicts with Caffe2. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11780 Differential Revision: D9889925 Pulled By: gchanan fbshipit-source-id: 5eca849c36ced00b8ae7482b7945b445a3e1687e --- aten/src/ATen/test/apply_test.cpp | 104 +++--- aten/src/ATen/test/apply_utils_test.cpp | 14 +- aten/src/ATen/test/basic.cpp | 152 ++++----- aten/src/ATen/test/broadcast_test.cpp | 86 ++--- aten/src/ATen/test/catch_utils.hpp | 8 + aten/src/ATen/test/cuda_half_test.cu | 6 +- aten/src/ATen/test/cuda_optional_test.cu | 12 +- .../test/cuda_packedtensoraccessor_test.cu | 8 +- aten/src/ATen/test/cuda_rng_test.cpp | 6 +- aten/src/ATen/test/cudnn_test.cpp | 10 +- aten/src/ATen/test/dlconvertor_test.cpp | 10 +- aten/src/ATen/test/half_test.cpp | 102 +++--- aten/src/ATen/test/integer_divider_test.cu | 24 +- aten/src/ATen/test/native_test.cpp | 58 ++-- aten/src/ATen/test/scalar_tensor_test.cpp | 76 ++--- aten/src/ATen/test/scalar_test.cpp | 54 ++-- aten/src/ATen/test/stream_test.cpp | 130 ++++---- aten/src/ATen/test/test_parallel.cpp | 6 +- aten/src/ATen/test/undefined_tensor_test.cpp | 48 +-- aten/src/ATen/test/weakref_test.cpp | 48 +-- aten/src/ATen/test/wrapdim_test.cpp | 30 +- test/cpp/api/any.cpp | 200 ++++++------ test/cpp/api/catch_utils.hpp | 8 + test/cpp/api/cursor.cpp | 276 ++++++++-------- test/cpp/api/integration.cpp | 14 +- test/cpp/api/jit.cpp | 12 +- test/cpp/api/main.cpp | 2 +- test/cpp/api/misc.cpp | 274 ++++++++-------- test/cpp/api/module.cpp | 250 +++++++-------- test/cpp/api/modules.cpp | 234 +++++++------- test/cpp/api/optim.cpp | 94 +++--- test/cpp/api/parallel.cpp | 120 +++---- test/cpp/api/rnn.cpp | 92 +++--- test/cpp/api/sequential.cpp | 150 ++++----- test/cpp/api/serialization.cpp | 84 ++--- test/cpp/api/tensor.cpp | 146 ++++----- test/cpp/api/tensor_cuda.cpp | 8 +- test/cpp/api/tensor_options.cpp | 52 +-- test/cpp/api/tensor_options_cuda.cpp | 44 +-- torch/csrc/jit/catch_utils.hpp | 8 + torch/csrc/jit/test_jit.cpp | 296 +++++++++--------- 41 files changed, 1690 insertions(+), 1666 deletions(-) create mode 100644 aten/src/ATen/test/catch_utils.hpp create mode 100644 test/cpp/api/catch_utils.hpp create mode 100644 torch/csrc/jit/catch_utils.hpp diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp index 986f599da6d11b..fc39eccee3926b 100644 --- a/aten/src/ATen/test/apply_test.cpp +++ b/aten/src/ATen/test/apply_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "cuda.h" #include "cuda_runtime.h" @@ -11,111 +11,111 @@ Tests related to tensor indexing and applying operations. 
*/ #ifndef _WIN32 -TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { +CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { int sizes[] = {4, 4}; int strides[] = {4, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (4 * 4)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (4 * 4)); } -TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { +CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { int sizes[] = {6, 3, 7}; int strides[] = {3 * 7, 7, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (6 * 3 * 7)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (6 * 3 * 7)); } -TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { +CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { int sizes[] = {4, 3, 2}; int strides[] = {3 * 3, 3, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (4 * 3)); - REQUIRE(ti.sizes[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (4 * 3)); + CATCH_REQUIRE(ti.sizes[1] == 2); } -TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { +CATCH_TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { int sizes[] = {3, 2}; int strides[] = {2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (3 * 2)); - REQUIRE(ti.strides[0] == 2); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (3 * 2)); + CATCH_REQUIRE(ti.strides[0] == 2); } -TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ +CATCH_TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == (5 * 2)); - REQUIRE(ti.strides[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == (5 * 2)); + CATCH_REQUIRE(ti.strides[1] == 2); } -TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { +CATCH_TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { int sizes[] = {1, 10, 1, 5, 4}; int strides[] = {4, 0, 16, 0, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (10 * 5)); - REQUIRE(ti.strides[0] == 0); - REQUIRE(ti.sizes[1] == 4); - REQUIRE(ti.strides[1] == 1); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (10 * 5)); + CATCH_REQUIRE(ti.strides[0] == 0); + CATCH_REQUIRE(ti.sizes[1] == 4); + CATCH_REQUIRE(ti.strides[1] == 1); } -TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { +CATCH_TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; 
::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE(ti.collapseDims() == 0); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == 1); - REQUIRE(ti.strides[0] == 1); + CATCH_REQUIRE(ti.collapseDims() == 0); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == 1); + CATCH_REQUIRE(ti.strides[0] == 1); } -TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(1) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == 3); - REQUIRE(ti.strides[0] == (6 * 22)); - REQUIRE(ti.sizes[1] == 6); - REQUIRE(ti.strides[1] == 22); - REQUIRE(ti.sizes[2] == (5 * 2)); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(1) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == 3); + CATCH_REQUIRE(ti.strides[0] == (6 * 22)); + CATCH_REQUIRE(ti.sizes[1] == 6); + CATCH_REQUIRE(ti.strides[1] == 22); + CATCH_REQUIRE(ti.sizes[2] == (5 * 2)); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(2) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == 5); - REQUIRE(ti.strides[1] == 4); - REQUIRE(ti.sizes[2] == 2); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(2) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == 5); + CATCH_REQUIRE(ti.strides[1] == 4); + CATCH_REQUIRE(ti.sizes[2] == 2); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { +CATCH_TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE_THROWS(ti.collapseDims(5)); + _CATCH_REQUIRE_THROWS(ti.collapseDims(5)); } #endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 38027baae97b73..22be6de7acbc02 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" @@ -108,32 +108,32 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { }); } -TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } -TEST_CASE("apply utils test 2-dim small", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } -TEST_CASE("apply utils test 2-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } -TEST_CASE("apply utils test 3-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim", "[cpu]") { manual_seed(123, at::kCPU); 
test(CPU(kDouble), {3, 4, 2}); } -TEST_CASE("apply utils test 3-dim medium", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim medium", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } -TEST_CASE("apply utils test 10-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 10-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 94988122adedcc..c64fdec0089dff 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/core/Reduction.h" @@ -20,66 +20,66 @@ using namespace at; using Catch::Matchers::StartsWith; static void test(Type & type) { - SECTION( "resize" ) { + CATCH_SECTION( "resize" ) { auto a = type.tensor(); a.resize_({3,4}); - REQUIRE(a.numel() == 12); + CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); - REQUIRE(a.numel() == 35); + CATCH_REQUIRE(a.numel() == 35); } - SECTION( "ones and dot" ) { + CATCH_SECTION( "ones and dot" ) { Tensor b0 = ones({1, 1}, type); - REQUIRE(2 == (b0+b0).sum().toCDouble()); + CATCH_REQUIRE(2 == (b0+b0).sum().toCDouble()); Tensor b1 = ones({1, 2}, type); - REQUIRE(4 == (b1+b1).sum().toCDouble()); + CATCH_REQUIRE(4 == (b1+b1).sum().toCDouble()); Tensor b = ones({3, 4}, type); - REQUIRE(24 == (b+b).sum().toCDouble()); - REQUIRE(12 == b.numel()); - REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + CATCH_REQUIRE(24 == (b+b).sum().toCDouble()); + CATCH_REQUIRE(12 == b.numel()); + CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); } - SECTION( "rand" ) { + CATCH_SECTION( "rand" ) { for(auto i = 0; i < 10; i++) { Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); } } - SECTION( "sort" ) { + CATCH_SECTION( "sort" ) { Tensor b = rand({3, 4}, type); auto z = b.sort(1); auto z_sorted = std::get<0>(z); - REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); + CATCH_REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); } if(type.backend() != Backend::CUDA) - SECTION( "randperm" ) { + CATCH_SECTION( "randperm" ) { Tensor b = randperm(15, type); Tensor rv, ri; std::tie(rv, ri) = sort(b, 0); - REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); + CATCH_REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); } - SECTION( "context" ) { + CATCH_SECTION( "context" ) { std::stringstream ss; ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; } - SECTION( "add" ) { + CATCH_SECTION( "add" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({3, 4}, type); Tensor c = add(a, add(a, b)); //TODO:0-dim Tensor d(3.f); Scalar d = 3.f; - REQUIRE( add(c, d).allclose(a + a + b + d) ); + CATCH_REQUIRE( add(c, d).allclose(a + a + b + d) ); } - SECTION( "loads of adds" ) { + CATCH_SECTION( "loads of adds" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -89,10 +89,10 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? 
std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "loads of adds (with copy)" ) { + CATCH_SECTION( "loads of adds (with copy)" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -102,59 +102,59 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "isContiguous" ) { + CATCH_SECTION( "isContiguous" ) { Tensor a = rand({3, 4}, type); - REQUIRE(a.is_contiguous()); + CATCH_REQUIRE(a.is_contiguous()); a = a.transpose(0, 1); - REQUIRE(!a.is_contiguous()); + CATCH_REQUIRE(!a.is_contiguous()); } - SECTION( "permute" ) { + CATCH_SECTION( "permute" ) { Tensor a = rand({3, 4, 5}, type); Tensor b = a.permute({1, 2, 0}); - REQUIRE(b.sizes().equals({4, 5, 3})); - REQUIRE(b.strides().equals({5, 1, 20})); + CATCH_REQUIRE(b.sizes().equals({4, 5, 3})); + CATCH_REQUIRE(b.strides().equals({5, 1, 20})); } - SECTION( "mm" ) { + CATCH_SECTION( "mm" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({4}, type); Tensor c = mv(a, b); - REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); + CATCH_REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); } - SECTION( "squeeze" ) { + CATCH_SECTION( "squeeze" ) { Tensor a = rand({2, 1}, type); Tensor b = squeeze(a); - REQUIRE(b.dim() == 1); + CATCH_REQUIRE(b.dim() == 1); a = rand({1}, type); b = squeeze(a); //TODO 0-dim squeeze - REQUIRE(a[0].equal(b)); + CATCH_REQUIRE(a[0].equal(b)); } - SECTION( "copy" ) { + CATCH_SECTION( "copy" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({4, 3}, type); a.copy_(e); - REQUIRE(a.equal(e)); + CATCH_REQUIRE(a.equal(e)); } - SECTION( "copy (broadcasting)" ) { + CATCH_SECTION( "copy (broadcasting)" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); for (int i = 0; i < 4; ++i) { - REQUIRE(a[i].equal(e)); + CATCH_REQUIRE(a[i].equal(e)); } } - SECTION( "abs(value)" ) { + CATCH_SECTION( "abs(value)" ) { Tensor r = at::abs(type.scalarTensor(-3)); - REQUIRE(r.toCInt() == 3); + CATCH_REQUIRE(r.toCInt() == 3); } //TODO(zach): operator overloads @@ -168,120 +168,120 @@ static void test(Type & type) { } #endif - SECTION( "adding a value with a scalar" ) { + CATCH_SECTION( "adding a value with a scalar" ) { Tensor a = rand({4, 3}, type); - REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + CATCH_REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); } - SECTION( "select" ) { + CATCH_SECTION( "select" ) { Tensor a = rand({3, 7}, type); auto a_13 = select(a, 1, 3); auto a_13_02 = select(select(a, 1, 3), 0, 2); - REQUIRE( a[0][3].equal(a_13[0]) ); - REQUIRE( a[2][3].equal(a_13_02) ); + CATCH_REQUIRE( a[0][3].equal(a_13[0]) ); + CATCH_REQUIRE( a[2][3].equal(a_13_02) ); } - SECTION( "zero-dim" ) { + CATCH_SECTION( "zero-dim" ) { Tensor a = type.scalarTensor(4); //rand(type, {1}); Tensor b = rand({3,4}, type); - REQUIRE((a + a).dim() == 0); - REQUIRE((1 + a).dim() == 0); - REQUIRE((b + a).dim() == 2); - REQUIRE((a + b).dim() == 2); + CATCH_REQUIRE((a + a).dim() == 0); + CATCH_REQUIRE((1 + a).dim() == 0); + CATCH_REQUIRE((b + a).dim() == 2); + CATCH_REQUIRE((a + b).dim() == 
2); auto c = rand({3,4}, type); - REQUIRE(c[1][2].dim() == 0); + CATCH_REQUIRE(c[1][2].dim() == 0); auto f = rand({3,4}, type); f[2] = zeros({4}, type); f[1][0] = -1; - REQUIRE(f[2][0].toCDouble() == 0); + CATCH_REQUIRE(f[2][0].toCDouble() == 0); } - SECTION( "tensor from TH" ) { + CATCH_SECTION( "tensor from TH" ) { int a = 4; THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); THFloatTensor_fill(t, a); Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); - REQUIRE_NOTHROW(tt); + CATCH_REQUIRE_NOTHROW(tt); } - SECTION( "toCFloat" ) { + CATCH_SECTION( "toCFloat" ) { Tensor a = zeros({3,4}); Tensor b = ones({3,7}); Tensor c = cat({a,b},1); - REQUIRE(c.size(1) == 11); + CATCH_REQUIRE(c.size(1) == 11); Tensor e = rand({}); - REQUIRE(*e.data() == e.sum().toCFloat()); + CATCH_REQUIRE(*e.data() == e.sum().toCFloat()); } - SECTION( "to string" ) { + CATCH_SECTION( "to string" ) { Tensor b = ones({3,7})*.0000001f; std::stringstream s; s << b << "\n"; std::string expect = "1e-07 *"; - REQUIRE(s.str().substr(0,expect.size()) == expect); + CATCH_REQUIRE(s.str().substr(0,expect.size()) == expect); } - SECTION("indexing by Scalar") { + CATCH_SECTION("indexing by Scalar") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int64_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (size_t i = 0; i < static_cast(tensor.numel()); ++i) { - REQUIRE(tensor[i].equal(one * static_cast(i))); + CATCH_REQUIRE(tensor[i].equal(one * static_cast(i))); } for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int16_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int8_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Scalar(3.14)].equal(one), StartsWith( "Can only index tensors with integral scalars")); } - SECTION("indexing by zero-dim tensor") { + CATCH_SECTION("indexing by zero-dim tensor") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[one * i].equal(one * i)); + CATCH_REQUIRE(tensor[one * i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({}) * 3.14].equal(one), StartsWith( "Can only index tensors with integral scalars")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Tensor()].equal(one), StartsWith("Can only index with tensors that are defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({2, 3, 4}, kInt)].equal(one), StartsWith("Can only index with tensors that are scalars (zero-dim)")); } - SECTION("dispatch") { + CATCH_SECTION("dispatch") { Tensor tensor = randn({20, 20}); Tensor other = randn({20, 20}); auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); - REQUIRE(result.allclose(mse_loss(relu(tensor), other))); + CATCH_REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } - SECTION("core") { + CATCH_SECTION("core") { int i = CoreTest(); - REQUIRE(i + 1 == CoreTest()); + CATCH_REQUIRE(i + 1 == CoreTest()); } } -TEST_CASE( "basic tests CPU", "[cpu]" ) { +CATCH_TEST_CASE( "basic tests CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "basic tests GPU", "[cuda]" ) { +CATCH_TEST_CASE( "basic tests GPU", "[cuda]" ) { manual_seed(123, at::kCUDA); 
if(at::hasCUDA()) { diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index cd5c43d32fae86..822a1d79df1bda 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -1,154 +1,154 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "broadcast", "[]" ) { +CATCH_TEST_CASE( "broadcast", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); // 0) pre-req tests: - SECTION( "can't expand empty tensor" ) { + CATCH_SECTION( "can't expand empty tensor" ) { auto empty = randn({0}, T); - REQUIRE_THROWS(empty.expand({3})); + _CATCH_REQUIRE_THROWS(empty.expand({3})); } // 1) out-place function with 2 args - SECTION( "out-place function with 2 args" ) { + CATCH_SECTION( "out-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1}, T); auto b = randn({5}, T); std::vector expanded_sizes = {3, 5}; - REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); + CATCH_REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 5}, T); - REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); + CATCH_REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 5}, T); auto b = randn({5, 3}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 5}, T); auto b = randn({7, 5}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } } - SECTION( "out-place function with 3 args" ) { + CATCH_SECTION( "out-place function with 3 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1, 1}, T); auto b = randn({1, 2, 1}, T); auto c = randn({1, 1, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); + CATCH_REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aTensorScalar = ones({1}, T); aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 2, 1}, T); auto c = randn({1, 2, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE(aTensorScalar.addcmul(b, c).equal( + CATCH_REQUIRE(aTensorScalar.addcmul(b, c).equal( aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 3, 2}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } - SECTION( "with mismatched sizes" ){ + CATCH_SECTION( "with mismatched sizes" ){ auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 5, 5}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } } - SECTION( "in-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( 
"in-place function with 2 args" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 5}, T); auto b = randn({3, 1}, T); - REQUIRE((a + b).equal(a + b.expand({3, 5}))); + CATCH_REQUIRE((a + b).equal(a + b.expand({3, 5}))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto a = randn({3, 5}, T); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); + CATCH_REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 5}, T); auto b = randn({3, 1}, T); - REQUIRE_THROWS(a.add_(b)); + _CATCH_REQUIRE_THROWS(a.add_(b)); } } - SECTION( "in-place function with 3 args" ) { + CATCH_SECTION( "in-place function with 3 args" ) { auto a = randn({3, 5, 2}, T); auto b = randn({3, 1, 2}, T); auto c = randn({1, 5, 1}, T); - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto aClone = a.clone(); - REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aClone = a.clone(); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 3, 5}, T); auto b = randn({4, 1, 1}, T); auto c = randn({1, 3, 1}, T); - REQUIRE_THROWS(a.addcmul_(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul_(b, c)); } } - SECTION( "explicit dim specification" ) { + CATCH_SECTION( "explicit dim specification" ) { auto a = randn({1}, T); auto b = randn({5, 3}, T); auto c = randn({3, 7}, T); - SECTION( "basic" ) { - REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); + CATCH_SECTION( "basic" ) { + CATCH_REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { Tensor aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); + CATCH_REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 3}, T); - REQUIRE_THROWS(a.addmm(b, c)); + _CATCH_REQUIRE_THROWS(a.addmm(b, c)); } } } diff --git a/aten/src/ATen/test/catch_utils.hpp b/aten/src/ATen/test/catch_utils.hpp new file mode 100644 index 00000000000000..b9b0a87990a9ce --- /dev/null +++ b/aten/src/ATen/test/catch_utils.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define CATCH_CONFIG_PREFIX_ALL +#include + +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; +// define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS( ... 
) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index fa00e534ee07ef..cce267100589e1 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -82,9 +82,9 @@ void launch_function(){ kernel<<<1,1>>>(); } -TEST_CASE( "half common math functions tests in device", "[cuda]" ) { +CATCH_TEST_CASE( "half common math functions tests in device", "[cuda]" ) { launch_function(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index 9956dcf52b04ef..b64c530b355914 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/optional.h" @@ -8,15 +8,15 @@ using namespace at; -TEST_CASE( "optional in cuda files", "[cuda]" ) { +CATCH_TEST_CASE( "optional in cuda files", "[cuda]" ) { at::optional trivially_destructible; at::optional> non_trivially_destructible; - REQUIRE(!trivially_destructible.has_value()); - REQUIRE(!non_trivially_destructible.has_value()); + CATCH_REQUIRE(!trivially_destructible.has_value()); + CATCH_REQUIRE(!non_trivially_destructible.has_value()); trivially_destructible = {5}; non_trivially_destructible = std::vector{5, 10}; - REQUIRE(trivially_destructible.has_value()); - REQUIRE(non_trivially_destructible.has_value()); + CATCH_REQUIRE(trivially_destructible.has_value()); + CATCH_REQUIRE(non_trivially_destructible.has_value()); } diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu index f1eb5cbbe4b1f3..a529f38d748a1b 100644 --- a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -22,7 +22,7 @@ __global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor>>(resa, t1a, t2a); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); auto expected = mv(t1, t2); - REQUIRE(res.allclose(expected)); + CATCH_REQUIRE(res.allclose(expected)); } diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp index d32903dd2fe1f3..7b14174d3baeb3 100644 --- a/aten/src/ATen/test/cuda_rng_test.cpp +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "cuda.h" @@ -21,7 +21,7 @@ void testCudaRNGMultithread() { } }; -TEST_CASE( "CUDA RNG test", "[cuda]" ) { - SECTION( "multithread" ) +CATCH_TEST_CASE( "CUDA RNG test", "[cuda]" ) { + CATCH_SECTION( "multithread" ) testCudaRNGMultithread(); } diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 31786e88a0944d..4391867d166772 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" 
#include "ATen/ATen.h" #include "ATen/cudnn/Descriptors.h" @@ -9,7 +9,7 @@ using namespace at; using namespace at::native; -TEST_CASE( "cudnn", "[cuda]" ) { +CATCH_TEST_CASE( "cudnn", "[cuda]" ) { manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 @@ -18,8 +18,8 @@ TEST_CASE( "cudnn", "[cuda]" ) { desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); desc2.set(handle, 0.5, desc1.state); - REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); - REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); - REQUIRE(desc1.desc()->states == desc2.desc()->states); + CATCH_REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); + CATCH_REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); + CATCH_REQUIRE(desc1.desc()->states == desc2.desc()->states); #endif } diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index 48829298760276..bf0cf93f7c4064 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,17 +11,17 @@ using namespace at; -TEST_CASE( "dlconvertor", "[cpu]" ) { +CATCH_TEST_CASE( "dlconvertor", "[cpu]" ) { manual_seed(123, at::kCPU); - INFO( "convert ATen to DLTensor" ); + CATCH_INFO( "convert ATen to DLTensor" ); Tensor a = rand({3,4}); DLManagedTensor* dlMTensor = toDLPack(a); - INFO( "convert DLTensor to ATen" ); + CATCH_INFO( "convert DLTensor to ATen" ); Tensor b = fromDLPack(dlMTensor); - REQUIRE(a.equal(b)); + CATCH_REQUIRE(a.equal(b)); } diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp index 3b2944803e6b5a..32177705a2f883 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include #include @@ -12,53 +12,53 @@ using namespace at; -TEST_CASE( "half arithmetic", "[]" ) { +CATCH_TEST_CASE( "half arithmetic", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero + one == one); - REQUIRE(zero + zero == zero); - REQUIRE(zero * one == zero); - REQUIRE(one * one == one); - REQUIRE(one / one == one); - REQUIRE(one - one == zero); - REQUIRE(one - zero == one); - REQUIRE(zero - one == -one); - REQUIRE(one + one == Half(2)); - REQUIRE(one + one == 2); + CATCH_REQUIRE(zero + one == one); + CATCH_REQUIRE(zero + zero == zero); + CATCH_REQUIRE(zero * one == zero); + CATCH_REQUIRE(one * one == one); + CATCH_REQUIRE(one / one == one); + CATCH_REQUIRE(one - one == zero); + CATCH_REQUIRE(one - zero == one); + CATCH_REQUIRE(zero - one == -one); + CATCH_REQUIRE(one + one == Half(2)); + CATCH_REQUIRE(one + one == 2); } -TEST_CASE( "half comparisons", "[]" ) { +CATCH_TEST_CASE( "half comparisons", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero < one); - REQUIRE(zero < 1); - REQUIRE(1 > zero); - REQUIRE(0 >= zero); - REQUIRE(0 != one); - REQUIRE(zero == 0); - REQUIRE(zero == zero); - REQUIRE(zero == -zero); + CATCH_REQUIRE(zero < one); + CATCH_REQUIRE(zero < 1); + CATCH_REQUIRE(1 > zero); + CATCH_REQUIRE(0 >= zero); + CATCH_REQUIRE(0 != one); + CATCH_REQUIRE(zero == 0); + CATCH_REQUIRE(zero == zero); + CATCH_REQUIRE(zero == -zero); } -TEST_CASE( "half cast", "[]" ) { +CATCH_TEST_CASE( "half cast", "[]" ) { Half value = 1.5f; - REQUIRE((int)value == 1); - REQUIRE((short)value == 1); - REQUIRE((long long)value == 1LL); - REQUIRE((float)value == 1.5f); - REQUIRE((double)value == 1.5); - 
REQUIRE((bool)value == true); - REQUIRE((bool)Half(0.0f) == false); + CATCH_REQUIRE((int)value == 1); + CATCH_REQUIRE((short)value == 1); + CATCH_REQUIRE((long long)value == 1LL); + CATCH_REQUIRE((float)value == 1.5f); + CATCH_REQUIRE((double)value == 1.5); + CATCH_REQUIRE((bool)value == true); + CATCH_REQUIRE((bool)Half(0.0f) == false); } -TEST_CASE( "half construction", "[]" ) { - REQUIRE(Half((short)3) == Half(3.0f)); - REQUIRE(Half((unsigned short)3) == Half(3.0f)); - REQUIRE(Half(3) == Half(3.0f)); - REQUIRE(Half(3U) == Half(3.0f)); - REQUIRE(Half(3LL) == Half(3.0f)); - REQUIRE(Half(3ULL) == Half(3.0f)); - REQUIRE(Half(3.5) == Half(3.5f)); +CATCH_TEST_CASE( "half construction", "[]" ) { + CATCH_REQUIRE(Half((short)3) == Half(3.0f)); + CATCH_REQUIRE(Half((unsigned short)3) == Half(3.0f)); + CATCH_REQUIRE(Half(3) == Half(3.0f)); + CATCH_REQUIRE(Half(3U) == Half(3.0f)); + CATCH_REQUIRE(Half(3LL) == Half(3.0f)); + CATCH_REQUIRE(Half(3ULL) == Half(3.0f)); + CATCH_REQUIRE(Half(3.5) == Half(3.5f)); } static std::string to_string(const Half& h) { @@ -67,22 +67,22 @@ static std::string to_string(const Half& h) { return ss.str(); } -TEST_CASE( "half to string", "[]" ) { - REQUIRE(to_string(Half(3.5f)) == "3.5"); - REQUIRE(to_string(Half(-100.0f)) == "-100"); +CATCH_TEST_CASE( "half to string", "[]" ) { + CATCH_REQUIRE(to_string(Half(3.5f)) == "3.5"); + CATCH_REQUIRE(to_string(Half(-100.0f)) == "-100"); } -TEST_CASE( "half numeric limits", "[]" ) { +CATCH_TEST_CASE( "half numeric limits", "[]" ) { using limits = std::numeric_limits; - REQUIRE(limits::lowest() == -65504.0f); - REQUIRE(limits::max() == 65504.0f); - REQUIRE(limits::min() > 0); - REQUIRE(limits::min() < 1); - REQUIRE(limits::denorm_min() > 0); - REQUIRE(limits::denorm_min() / 2 == 0); - REQUIRE(limits::infinity() == std::numeric_limits::infinity()); - REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); - REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); + CATCH_REQUIRE(limits::lowest() == -65504.0f); + CATCH_REQUIRE(limits::max() == 65504.0f); + CATCH_REQUIRE(limits::min() > 0); + CATCH_REQUIRE(limits::min() < 1); + CATCH_REQUIRE(limits::denorm_min() > 0); + CATCH_REQUIRE(limits::denorm_min() / 2 == 0); + CATCH_REQUIRE(limits::infinity() == std::numeric_limits::infinity()); + CATCH_REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); + CATCH_REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); } // Check the declared type of members of numeric_limits matches @@ -119,7 +119,7 @@ ASSERT_SAME_TYPE(max_exponent10); ASSERT_SAME_TYPE(traps); ASSERT_SAME_TYPE(tinyness_before); -TEST_CASE( "half common math functions test", "[]" ) { +CATCH_TEST_CASE( "half common math functions test", "[]" ) { float threshold = 0.00001; assert(std::abs(std::lgamma(Half(10.0)) - std::lgamma(10.0f)) <= threshold); assert(std::abs(std::exp(Half(1.0)) - std::exp(1.0f)) <= threshold); diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu index 4c63ab3a8fd205..d09a423d7ca72d 100644 --- a/aten/src/ATen/test/integer_divider_test.cu +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or // (b-1), so it takes a few minutes to run. 
@@ -62,18 +62,18 @@ class IntDividerTester { cudaError_t err; err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } ~IntDividerTester() { cudaError_t err; err = cudaFree(dividersBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaFree(testCasesBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } void addTestCase(Value dividend, Value divisor, int steps) { @@ -92,18 +92,18 @@ class IntDividerTester { cudaError_t err; if (testCases_.empty()) return; - REQUIRE(!dividers_.empty()); + CATCH_REQUIRE(!dividers_.empty()); - REQUIRE(dividers_.size() <= NUM_CASES); - REQUIRE(testCases_.size() <= NUM_CASES); + CATCH_REQUIRE(dividers_.size() <= NUM_CASES); + CATCH_REQUIRE(testCases_.size() <= NUM_CASES); err = cudaMemcpy(dividersBuf_, dividers_.data(), dividers_.size() * sizeof(IntDivider), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMemcpy(testCasesBuf_, testCases_.data(), testCases_.size() * sizeof(TestCase), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); int numCases = testCases_.size(); testIntDivider<<<512, 512>>>( @@ -180,11 +180,11 @@ static void testUint64Divider() tester.flush(); } -TEST_CASE( "CUDA integer divider", "[cuda]" ) { +CATCH_TEST_CASE( "CUDA integer divider", "[cuda]" ) { testUint64Divider(); testUint32Divider(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index e10de30ae8e023..4c57b7d8ee1d96 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -9,18 +9,18 @@ using namespace at; using Catch::Matchers::StartsWith; #define REQUIRE_EQUAL(t1, t2) \ - REQUIRE(t1.equal(t2)); + CATCH_REQUIRE(t1.equal(t2)); #define REQUIRE_ALLCLOSE(t1, t2) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2)); #define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2, atol, rtol)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2, atol, rtol)); void requireEqualTensorList(TensorList t1, TensorList t2) { - REQUIRE(t1.size() == t2.size()); + CATCH_REQUIRE(t1.size() == t2.size()); for (size_t i = 0; i < t1.size(); ++i) { REQUIRE_EQUAL(t1[ i ], t2[ i ]); } @@ -29,7 +29,7 @@ void requireEqualTensorList(TensorList t1, TensorList t2) { void test(Type & T, Type & AccT) { auto t = randn({3, 3}, T); - SECTION( "split: test method, type, namespace give same result" ) { + CATCH_SECTION( "split: test method, type, namespace give same result" ) { auto splitMethod = t.split(1, 0); auto splitType = T.split(t, 1, 0); auto splitNs = at::split(t, 1, 0); @@ -40,7 +40,7 @@ void test(Type & T, Type & AccT) { REQUIRE_EQUAL(at::cat(splitMethod, 0), t); } - SECTION( "chunk: test method, type, namespace give same result" ) { + CATCH_SECTION( "chunk: test method, type, namespace give same result" ) { // test method, type, namespace give same result auto chunkMethod = t.chunk(3, 0); 
auto chunkType = T.chunk(t, 3, 0); @@ -53,7 +53,7 @@ void test(Type & T, Type & AccT) { } // stack - SECTION( "stack" ) { + CATCH_SECTION( "stack" ) { auto x = rand({2, 3, 4}); auto y = rand({2, 3, 4}); auto z = rand({2, 3, 4}); @@ -66,36 +66,36 @@ void test(Type & T, Type & AccT) { expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); REQUIRE_EQUAL(res, res_neg); - REQUIRE(res.sizes().equals(expected_size)); + CATCH_REQUIRE(res.sizes().equals(expected_size)); REQUIRE_EQUAL(res.select(dim, 0), x); REQUIRE_EQUAL(res.select(dim, 1), y); REQUIRE_EQUAL(res.select(dim, 2), z); } } - SECTION( "size / stride" ) { + CATCH_SECTION( "size / stride" ) { auto scalar = randn({}, T); - REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); auto empty = randn({0}, T); - REQUIRE(empty.size(0) == 0); - REQUIRE(empty.size(-1) == 0); - REQUIRE(empty.stride(0) == 1); - REQUIRE(empty.stride(-1) == 1); + CATCH_REQUIRE(empty.size(0) == 0); + CATCH_REQUIRE(empty.size(-1) == 0); + CATCH_REQUIRE(empty.stride(0) == 1); + CATCH_REQUIRE(empty.stride(-1) == 1); } // matmul - SECTION( "matmul" ) { + CATCH_SECTION( "matmul" ) { auto scalar = randn({}, T); auto d1 = randn({3}, T); auto d2 = randn({2, 3}, T); // 0-d - REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); // 1-d REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); @@ -140,11 +140,11 @@ void test(Type & T, Type & AccT) { // non-expandable case auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); - REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); + CATCH_REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); } // _standard_gamma_grad - SECTION( "_standard_gamma_grad" ) { + CATCH_SECTION( "_standard_gamma_grad" ) { // check empty auto empty = ones({0}, T); REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); @@ -158,10 +158,10 @@ void test(Type & T, Type & AccT) { // check mixing types auto t1 = randn({3, 4}, T); auto t2 = randn({3, 4}, T).toType(kDouble); - REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); + CATCH_REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); } - SECTION( "where" ) { + CATCH_SECTION( "where" ) { // empty auto empty = ones({0}, T); auto 
&bT = T.toScalarType(ScalarType::Byte); @@ -180,13 +180,13 @@ void test(Type & T, Type & AccT) { } } -TEST_CASE( "native test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "native test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } -TEST_CASE( "native test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "native test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index d52dc27e20295e..964f6260e7d9ff 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -18,14 +18,14 @@ using namespace at; _passed = true; \ els; \ } catch (std::exception &e) { \ - REQUIRE(!_passed); \ + CATCH_REQUIRE(!_passed); \ catc; \ } \ } void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { - REQUIRE(lhs.dim() == rhs.dim()); - REQUIRE(lhs.sizes().equals(rhs.sizes())); + CATCH_REQUIRE(lhs.dim() == rhs.dim()); + CATCH_REQUIRE(lhs.sizes().equals(rhs.sizes())); } bool should_expand(const IntList &from_size, const IntList &to_size) { @@ -49,15 +49,15 @@ void test(Type &T) { for (auto s = sizes.begin(); s != sizes.end(); ++s) { // verify that the dim, sizes, strides, etc match what was requested. auto t = ones(*s, T); - REQUIRE((size_t)t.dim() == s->size()); - REQUIRE((size_t)t.ndimension() == s->size()); - REQUIRE(t.sizes().equals(*s)); - REQUIRE(t.strides().size() == s->size()); + CATCH_REQUIRE((size_t)t.dim() == s->size()); + CATCH_REQUIRE((size_t)t.ndimension() == s->size()); + CATCH_REQUIRE(t.sizes().equals(*s)); + CATCH_REQUIRE(t.strides().size() == s->size()); auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); - REQUIRE(t.numel() == numel); + CATCH_REQUIRE(t.numel() == numel); // verify we can output std::stringstream ss; - REQUIRE_NOTHROW(ss << t << std::endl); + CATCH_REQUIRE_NOTHROW(ss << t << std::endl); // set_ auto t2 = ones(*s, T); @@ -65,22 +65,22 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze - REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + CATCH_REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); // unsqueeze_ { auto t2 = ones(*s, T); auto r = t2.unsqueeze_(0); - REQUIRE(r.dim() == t.dim() + 1); + CATCH_REQUIRE(r.dim() == t.dim() + 1); } // squeeze (with dimension argument) if (t.dim() == 0 || t.sizes()[0] == 1) { - REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. - REQUIRE(t.squeeze(0).dim() == t.dim()); + CATCH_REQUIRE(t.squeeze(0).dim() == t.dim()); } // squeeze (with no dimension argument) @@ -99,11 +99,11 @@ void test(Type &T) { // squeeze_ (with dimension argument) auto t2 = ones(*s, T); if (t2.dim() == 0 || t2.sizes()[0] == 1) { - REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. 
- REQUIRE(t2.squeeze_(0).dim() == t.dim()); + CATCH_REQUIRE(t2.squeeze_(0).dim() == t.dim()); } } @@ -122,31 +122,31 @@ void test(Type &T) { // reduce (with dimension argument and with 1 return argument) if (t.numel() != 0) { - REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE(t.sum(0).equal(at::zeros({}, T))); + CATCH_REQUIRE(t.sum(0).equal(at::zeros({}, T))); } // reduce (with dimension argument and with 2 return arguments) if (t.numel() != 0) { auto ret = t.min(0); - REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); - REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t.min(0)); + _CATCH_REQUIRE_THROWS(t.min(0)); } // simple indexing if (t.dim() > 0 && t.numel() != 0) { - REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t[0]); + _CATCH_REQUIRE_THROWS(t[0]); } // fill_ (argument to fill_ can only be a 0-dim tensor) TRY_CATCH_ELSE(t.fill_(t.sum(0)), - REQUIRE(t.dim() > 1), - REQUIRE(t.dim() <= 1)); + CATCH_REQUIRE(t.dim() > 1), + CATCH_REQUIRE(t.dim() <= 1)); } for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { @@ -156,8 +156,8 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); if(*lhs_it != *rhs_it) { - REQUIRE(!lhs.is_same_size(rhs)); - REQUIRE(!rhs.is_same_size(lhs)); + CATCH_REQUIRE(!lhs.is_same_size(rhs)); + CATCH_REQUIRE(!rhs.is_same_size(lhs)); } } // forced size functions (resize_, resize_as, set_) @@ -192,7 +192,7 @@ void test(Type &T) { auto storage = T.storage(rhs.numel(), false); lhs.set_(storage); // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars - REQUIRE(lhs.dim() != 0); + CATCH_REQUIRE(lhs.dim() != 0); } { // with storage, offset, sizes, strides @@ -211,8 +211,8 @@ void test(Type &T) { auto rhs = ones(*rhs_it, T); auto rhs_size = *rhs_it; TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), - REQUIRE(lhs.numel() != rhs.numel()), - REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(lhs.numel() != rhs.numel()), + CATCH_REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); } // take @@ -220,7 +220,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); TRY_CATCH_ELSE(auto result = lhs.take(rhs), - REQUIRE(lhs.numel() == 0); REQUIRE(rhs.numel() != 0), + CATCH_REQUIRE(lhs.numel() == 0); CATCH_REQUIRE(rhs.numel() != 0), require_equal_size_dim(result, rhs)); } @@ -230,7 +230,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); TRY_CATCH_ELSE(auto result = lhs.ger(rhs), - REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), + CATCH_REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), [&]() { int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); int64_t dim1 = rhs.dim() == 0 ? 
1 : rhs.size(0); @@ -246,8 +246,8 @@ void test(Type &T) { auto rhs_size = *rhs_it; bool should_pass = should_expand(lhs_size, rhs_size); TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), - REQUIRE(!should_pass), - REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(!should_pass), + CATCH_REQUIRE(should_pass); require_equal_size_dim(result, rhs);); // in-place functions (would be good if we can also do a non-broadcasting one, b/c // broadcasting functions will always end up operating on tensors of same size; @@ -255,21 +255,21 @@ void test(Type &T) { { bool should_pass_inplace = should_expand(rhs_size, lhs_size); TRY_CATCH_ELSE(lhs.add_(rhs), - REQUIRE(!should_pass_inplace), - REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); + CATCH_REQUIRE(!should_pass_inplace), + CATCH_REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); } } } } } -TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 72ef4e4ad3cf4c..247830c3cc839c 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include // define constants like M_PI and C keywords for MSVC @@ -33,25 +33,25 @@ struct Foo { void test_overflow() { auto s1 = Scalar(M_PI); - REQUIRE(s1.toFloat() == static_cast(M_PI)); + CATCH_REQUIRE(s1.toFloat() == static_cast(M_PI)); s1.toHalf(); s1 = Scalar(100000); - REQUIRE(s1.toFloat() == 100000.0); - REQUIRE(s1.toInt() == 100000); + CATCH_REQUIRE(s1.toFloat() == 100000.0); + CATCH_REQUIRE(s1.toInt() == 100000); - REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + CATCH_REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); s1 = Scalar(NAN); - REQUIRE(std::isnan(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isnan(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); s1 = Scalar(INFINITY); - REQUIRE(std::isinf(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isinf(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); } -TEST_CASE( "scalar test", "[]" ) { +CATCH_TEST_CASE( "scalar test", "[]" ) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); @@ -62,7 +62,7 @@ TEST_CASE( "scalar test", "[]" ) { Scalar h2 = h; cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; Generator & gen = at::globalContext().defaultGenerator(at::kCPU); - REQUIRE_NOTHROW(gen.seed()); + CATCH_REQUIRE_NOTHROW(gen.seed()); auto && C = at::globalContext(); if(at::hasCUDA()) { auto t2 = zeros({4,4}, at::kCUDA); @@ -71,12 +71,12 @@ TEST_CASE( "scalar test", "[]" ) { auto t = ones({4,4}); auto wha2 = zeros({4,4}).add(t).sum(); - REQUIRE( wha2.toCDouble() == 16.0 ); + CATCH_REQUIRE( wha2.toCDouble() == 16.0 ); - REQUIRE( t.sizes()[0] == 4 ); - REQUIRE( t.sizes()[1] == 4 ); - REQUIRE( t.strides()[0] == 4 ); - REQUIRE( t.strides()[1] == 1 ); + CATCH_REQUIRE( t.sizes()[0] == 4 ); + CATCH_REQUIRE( t.sizes()[1] == 4 ); + CATCH_REQUIRE( t.strides()[0] == 4 ); + CATCH_REQUIRE( t.strides()[1] == 1 ); Type & T = 
CPU(Float); Tensor x = randn({1,10}, T); @@ -88,26 +88,26 @@ TEST_CASE( "scalar test", "[]" ) { Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - REQUIRE_THROWS(at::_local_scalar(Tensor{})); + _CATCH_REQUIRE_THROWS(at::_local_scalar(Tensor{})); test_overflow(); if(at::hasCUDA()) { auto r = CUDA(Float).copy(next_h); - REQUIRE(CPU(Float).copy(r).equal(next_h)); + CATCH_REQUIRE(CPU(Float).copy(r).equal(next_h)); } - REQUIRE_NOTHROW(randn({10,10,2}, T)); + CATCH_REQUIRE_NOTHROW(randn({10,10,2}, T)); // check Scalar.toTensor on Scalars backed by different data types - REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); - REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); - REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); + CATCH_REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); if (x.type().scalarType() != ScalarType::Half) { AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { scalar_t s = 1; std::stringstream ss; - REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + CATCH_REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); @@ -116,10 +116,10 @@ TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { auto x = ones({1,2}, T); - REQUIRE_THROWS(x.toCFloat()); + _CATCH_REQUIRE_THROWS(x.toCFloat()); } auto float_one = ones({}, T); - REQUIRE(float_one.toCFloat() == 1); - REQUIRE(float_one.toCInt() == 1); - REQUIRE((float_one.toCHalf() == 1)); + CATCH_REQUIRE(float_one.toCFloat() == 1); + CATCH_REQUIRE(float_one.toCInt() == 1); + CATCH_REQUIRE((float_one.toCHalf() == 1)); } diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 145c4f4c261276..8dc015dd1d06ae 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" @@ -14,7 +14,7 @@ /* Tests related to ATen streams. 
*/ -TEST_CASE( +CATCH_TEST_CASE( "Copying and Moving Streams", "Verifies streams are live through copying and moving") { int32_t device = -1; @@ -29,14 +29,14 @@ TEST_CASE( copyStream = s; - REQUIRE(copyStream.internals() == s.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals() == s.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); } - REQUIRE(copyStream.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); // Tests that moving works as expected and preserves the stream at::cuda::CUDAStream moveStream; @@ -47,41 +47,41 @@ TEST_CASE( moveStream = std::move(s); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } - REQUIRE(moveStream.internals()); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.internals()); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } -TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { +CATCH_TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets at::cuda::setCurrentCUDAStream(myStream); at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(myStream == curStream); + CATCH_REQUIRE(myStream == curStream); // Gets, sets, and gets default stream at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); at::cuda::setCurrentCUDAStream(defaultStream); curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(defaultStream != myStream); - REQUIRE(curStream == defaultStream); + CATCH_REQUIRE(defaultStream != myStream); + CATCH_REQUIRE(curStream == defaultStream); } void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { auto new_stream = at::cuda::createCUDAStream(); at::cuda::setCurrentCUDAStream(new_stream); cur_thread_stream = at::cuda::getCurrentCUDAStream(); - REQUIRE(cur_thread_stream == new_stream); + CATCH_REQUIRE(cur_thread_stream == new_stream); } -TEST_CASE( +CATCH_TEST_CASE( "Multithread Getting and Setting", "Ensures streams are thread local") { at::cuda::CUDAStream s0, s1; @@ -94,25 +94,25 @@ TEST_CASE( at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); - REQUIRE(cur_stream == default_stream); - REQUIRE(cur_stream != s0); - REQUIRE(cur_stream != s1); - REQUIRE(s0 != s1); + CATCH_REQUIRE(cur_stream == default_stream); + CATCH_REQUIRE(cur_stream != s0); + CATCH_REQUIRE(cur_stream != s1); + CATCH_REQUIRE(s0 != s1); } -TEST_CASE("CUDAGuard") { +CATCH_TEST_CASE("CUDAGuard") { if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); std::vector streams0 = { at::cuda::getDefaultCUDAStream(), at::cuda::createCUDAStream()}; - REQUIRE(streams0[0].device() == 0); - REQUIRE(streams0[1].device() == 0); + CATCH_REQUIRE(streams0[0].device() == 0); + CATCH_REQUIRE(streams0[1].device() == 0); 
at::cuda::setCurrentCUDAStream(streams0[0]); std::vector streams1; @@ -121,47 +121,47 @@ TEST_CASE("CUDAGuard") { streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::createCUDAStream()); } - REQUIRE(streams1[0].device() == 1); - REQUIRE(streams1[1].device() == 1); + CATCH_REQUIRE(streams1[0].device() == 1); + CATCH_REQUIRE(streams1[1].device() == 1); at::cuda::setCurrentCUDAStream(streams1[0]); - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); // -- end setup // Test that all original streams are recorded. { at::cuda::CUDAGuard guard; - REQUIRE(guard.original_streams().empty()); + CATCH_REQUIRE(guard.original_streams().empty()); guard.set_stream(streams0[0]); - REQUIRE( + CATCH_REQUIRE( guard.original_streams().size() == at::cuda::getNumGPUs()); - REQUIRE(guard.original_streams()[0] == streams0[0]); - REQUIRE(guard.original_streams()[1] == streams1[0]); + CATCH_REQUIRE(guard.original_streams()[0] == streams0[0]); + CATCH_REQUIRE(guard.original_streams()[1] == streams1[0]); } // Setting a stream changes the current device and the stream on that device { at::cuda::CUDAGuard guard(streams1[1]); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); } // Device and stream are now reset - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); // Setting only the device changes only the current device and not the stream { at::cuda::CUDAGuard guard(/*device=*/1); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
@@ -171,12 +171,12 @@ TEST_CASE("CUDAGuard") { guard.set_device(1); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } -TEST_CASE("CUDAGuardIsMovable") { +CATCH_TEST_CASE("CUDAGuardIsMovable") { if (at::cuda::getNumGPUs() < 2) { return; } @@ -185,17 +185,17 @@ TEST_CASE("CUDAGuardIsMovable") { at::cuda::CUDAGuard first(stream); first.set_device(1); at::cuda::CUDAGuard second(std::move(first)); - REQUIRE(second.original_streams().size() == device_count); - REQUIRE(second.original_device() == 0); - REQUIRE(second.last_device() == 1); + CATCH_REQUIRE(second.original_streams().size() == device_count); + CATCH_REQUIRE(second.original_device() == 0); + CATCH_REQUIRE(second.last_device() == 1); at::cuda::CUDAGuard third; third = std::move(second); - REQUIRE(third.original_streams().size() == device_count); - REQUIRE(third.original_device() == 0); - REQUIRE(third.last_device() == 1); + CATCH_REQUIRE(third.original_streams().size() == device_count); + CATCH_REQUIRE(third.original_device() == 0); + CATCH_REQUIRE(third.last_device() == 1); } -TEST_CASE("Streampool Round Robin") { +CATCH_TEST_CASE("Streampool Round Robin") { std::vector streams{}; for (int i = 0; i < 200; ++i) { streams.emplace_back(at::cuda::detail::CUDAStream_createStream()); @@ -209,10 +209,10 @@ TEST_CASE("Streampool Round Robin") { if (!result_pair.second) hasDuplicates = true; } - REQUIRE(hasDuplicates); + CATCH_REQUIRE(hasDuplicates); } -TEST_CASE("Multi-GPU") { +CATCH_TEST_CASE("Multi-GPU") { if (at::cuda::getNumGPUs() < 2) return; at::cuda::CUDAStream s0 = at::cuda::createCUDAStream(true, 0); @@ -221,17 +221,17 @@ TEST_CASE("Multi-GPU") { at::cuda::setCurrentCUDAStream(s0); at::cuda::setCurrentCUDAStream(s1); - REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); at::DeviceGuard device_guard{1}; - REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); } -TEST_CASE("CUDAEvent Syncs") { +CATCH_TEST_CASE("CUDAEvent Syncs") { const auto stream = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event; - REQUIRE(!event.happened()); + CATCH_REQUIRE(!event.happened()); event.recordOnce(stream); @@ -242,10 +242,10 @@ TEST_CASE("CUDAEvent Syncs") { wait_stream1.synchronize_with(event); cudaStreamSynchronize(wait_stream0); - REQUIRE(event.happened()); + CATCH_REQUIRE(event.happened()); } -TEST_CASE("Cross-Device Events") { +CATCH_TEST_CASE("Cross-Device Events") { if (at::cuda::getNumGPUs() < 2) return; const auto stream0 = at::cuda::createCUDAStream(); @@ -260,10 +260,10 @@ TEST_CASE("Cross-Device Events") { event0 = std::move(event1); - REQUIRE(event0.device() == 1); + CATCH_REQUIRE(event0.device() == 1); stream0.synchronize_with(event0); cudaStreamSynchronize(stream0); - REQUIRE(event0.happened()); + CATCH_REQUIRE(event0.happened()); } diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 552328029ce03c..81701733b53693 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,7 +11,7 @@ using namespace at; 
-TEST_CASE( "parallel", "[cpu]" ) { +CATCH_TEST_CASE( "parallel", "[cpu]" ) { manual_seed(123, at::kCPU); set_num_threads(1); @@ -24,5 +24,5 @@ TEST_CASE( "parallel", "[cpu]" ) { as[0] = 1; as[1] = 0; as[2] = 0; - REQUIRE(a.sum(0).equal(as)); + CATCH_REQUIRE(a.sum(0).equal(as)); } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index e47772a7e8a80b..c01dff2d0038b1 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/core/UndefinedTensorImpl.h" @@ -8,7 +8,7 @@ using namespace at; -TEST_CASE( "undefined tensor test", "[]" ) { +CATCH_TEST_CASE( "undefined tensor test", "[]" ) { manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. @@ -17,36 +17,36 @@ TEST_CASE( "undefined tensor test", "[]" ) { std::stringstream ss; ss << und << std::endl; - REQUIRE(!und.defined()); - REQUIRE(std::string("UndefinedType") == und.toString()); - - REQUIRE_THROWS(und.strides()); - REQUIRE_THROWS(und.dim()); - REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); - REQUIRE_THROWS(und.add(und)); - REQUIRE_THROWS(und.add(ft)); - REQUIRE_THROWS(ft.add(und)); - REQUIRE_THROWS(und.add(5)); - REQUIRE_THROWS(und.mm(und)); + CATCH_REQUIRE(!und.defined()); + CATCH_REQUIRE(std::string("UndefinedType") == und.toString()); + + _CATCH_REQUIRE_THROWS(und.strides()); + _CATCH_REQUIRE_THROWS(und.dim()); + _CATCH_REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); + _CATCH_REQUIRE_THROWS(und.add(und)); + _CATCH_REQUIRE_THROWS(und.add(ft)); + _CATCH_REQUIRE_THROWS(ft.add(und)); + _CATCH_REQUIRE_THROWS(und.add(5)); + _CATCH_REQUIRE_THROWS(und.mm(und)); und.toType(und.type()); - REQUIRE_THROWS(und.toType(ft.type())); - REQUIRE_THROWS(ft.toType(und.type())); + _CATCH_REQUIRE_THROWS(und.toType(ft.type())); + _CATCH_REQUIRE_THROWS(ft.toType(und.type())); und.toType(ScalarType::Undefined); - REQUIRE_THROWS(und.toType(ScalarType::Float)); - REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); + _CATCH_REQUIRE_THROWS(und.toType(ScalarType::Float)); + _CATCH_REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); // copy_ - REQUIRE_THROWS(und.copy_(und)); - REQUIRE_THROWS(und.copy_(ft)); - REQUIRE_THROWS(ft.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(ft)); + _CATCH_REQUIRE_THROWS(ft.copy_(und)); und.toBackend(Backend::Undefined); - REQUIRE_THROWS(und.toBackend(Backend::CPU)); - REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); + _CATCH_REQUIRE_THROWS(und.toBackend(Backend::CPU)); + _CATCH_REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); - REQUIRE(!to_move.defined()); - REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); + CATCH_REQUIRE(!to_move.defined()); + CATCH_REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); } diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp index 167520beb58382..42c9f61b19b5e1 100644 --- a/aten/src/ATen/test/weakref_test.cpp +++ b/aten/src/ATen/test/weakref_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" @@ -10,53 +10,53 @@ using at::Tensor; using at::WeakTensor; -TEST_CASE( "Weak pointer tests", "" ) { - SECTION("gets invalidated") { 
+CATCH_TEST_CASE( "Weak pointer tests", "" ) { + CATCH_SECTION("gets invalidated") { Tensor a = at::ones({2, 2}); WeakTensor b = a; a.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("can successfully lock") { + CATCH_SECTION("can successfully lock") { Tensor a = at::ones({2, 2}); WeakTensor b = a; auto c = b.lock(); - REQUIRE(c.defined()); + CATCH_REQUIRE(c.defined()); a.reset(); - REQUIRE(b.lock().defined()); + CATCH_REQUIRE(b.lock().defined()); c.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("updates refcounts correctly") { + CATCH_SECTION("updates refcounts correctly") { Tensor a = at::ones({2, 2}); - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); auto locked = b.lock(); - REQUIRE(locked.defined()); - REQUIRE(a.use_count() == 2); + CATCH_REQUIRE(locked.defined()); + CATCH_REQUIRE(a.use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); a.reset(); - REQUIRE(b.use_count() == 0); - REQUIRE(b.weak_use_count() == 1); + CATCH_REQUIRE(b.use_count() == 0); + CATCH_REQUIRE(b.weak_use_count() == 1); } } } diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index 8e813bc7f7deeb..f76dac212a0921 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -1,43 +1,43 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "wrapdim test", "[]" ) { +CATCH_TEST_CASE( "wrapdim test", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); - SECTION( "simple case" ) { + CATCH_SECTION( "simple case" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.prod(-4).equal(a.prod(0))); - REQUIRE(a.prod(3).equal(a.prod(-1))); + CATCH_REQUIRE(a.prod(-4).equal(a.prod(0))); + CATCH_REQUIRE(a.prod(3).equal(a.prod(-1))); } - SECTION( "expression specification" ) { + CATCH_SECTION( "expression specification" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); - REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + CATCH_REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + CATCH_REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); // can unsqueeze scalar auto b = randn(1, T); b.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); + CATCH_REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); } - SECTION( "empty tensor" ) { + CATCH_SECTION( "empty tensor" ) { auto a = randn(0, T); - REQUIRE(a.prod(0).equal(at::ones({}, T))); + CATCH_REQUIRE(a.prod(0).equal(at::ones({}, T))); } - SECTION( "scalar vs 1-dim, 1-size" ) { + CATCH_SECTION( "scalar vs 1-dim, 1-size" ) { auto a = randn(1, T); - REQUIRE(a.prod(0).equal(a.prod(-1))); + 
CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); a.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.dim() == 0); - REQUIRE(a.prod(0).equal(a.prod(-1))); + CATCH_REQUIRE(a.dim() == 0); + CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); } } diff --git a/test/cpp/api/any.cpp b/test/cpp/api/any.cpp index 9368d4dd5a4983..18db2f531ee358 100644 --- a/test/cpp/api/any.cpp +++ b/test/cpp/api/any.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -13,39 +13,39 @@ using namespace torch::detail; using Catch::Contains; using Catch::StartsWith; -TEST_CASE("any-module") { +CATCH_TEST_CASE("any-module") { torch::manual_seed(0); - SECTION("int()") { + CATCH_SECTION("int()") { struct M : torch::nn::Module { int forward() { return 123; } }; AnyModule any(M{}); - REQUIRE(any.forward() == 123); + CATCH_REQUIRE(any.forward() == 123); } - SECTION("int(int)") { + CATCH_SECTION("int(int)") { struct M : torch::nn::Module { int forward(int x) { return x; } }; AnyModule any(M{}); - REQUIRE(any.forward(5) == 5); + CATCH_REQUIRE(any.forward(5) == 5); } - SECTION("const char*(const char*)") { + CATCH_SECTION("const char*(const char*)") { struct M : torch::nn::Module { const char* forward(const char* x) { return x; } }; AnyModule any(M{}); - REQUIRE(any.forward("hello") == std::string("hello")); + CATCH_REQUIRE(any.forward("hello") == std::string("hello")); } - SECTION("string(int, const double)") { + CATCH_SECTION("string(int, const double)") { struct M : torch::nn::Module { std::string forward(int x, const double f) { return std::to_string(static_cast(x + f)); @@ -53,10 +53,10 @@ TEST_CASE("any-module") { }; AnyModule any(M{}); int x = 4; - REQUIRE(any.forward(x, 3.14) == std::string("7")); + CATCH_REQUIRE(any.forward(x, 3.14) == std::string("7")); } - SECTION("Tensor(string, const string&, string&&)") { + CATCH_SECTION("Tensor(string, const string&, string&&)") { struct M : torch::nn::Module { torch::Tensor forward( std::string a, @@ -67,42 +67,42 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE( + CATCH_REQUIRE( any.forward( std::string("a"), std::string("ab"), std::string("abc")) .sum() .toCInt() == 6); } - SECTION("wrong argument type") { + CATCH_SECTION("wrong argument type") { struct M : torch::nn::Module { int forward(float x) { return x; } }; AnyModule any(M{}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(5.0), StartsWith("Expected argument #0 to be of type float, " "but received value of type double")); } - SECTION("wrong number of arguments") { + CATCH_SECTION("wrong number of arguments") { struct M : torch::nn::Module { int forward(int a, int b) { return a + b; } }; AnyModule any(M{}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(), Contains("M's forward() method expects 2 arguments, but received 0")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(5), Contains("M's forward() method expects 2 arguments, but received 1")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(1, 2, 3), Contains("M's forward() method expects 2 arguments, but received 3")); } - SECTION("get()") { + CATCH_SECTION("get()") { struct M : torch::nn::Module { explicit M(int value_) : torch::nn::Module("M"), value(value_) {} int value; @@ -112,16 +112,16 @@ TEST_CASE("any-module") { }; AnyModule any(M{5}); - SECTION("good cast") { - REQUIRE(any.get().value == 5); + CATCH_SECTION("good cast") { + CATCH_REQUIRE(any.get().value == 5); } - SECTION("bad cast") { + CATCH_SECTION("bad cast") { struct N : torch::nn::Module {}; - 
REQUIRE_THROWS_WITH(any.get(), StartsWith("Attempted to cast module")); + CATCH_REQUIRE_THROWS_WITH(any.get(), StartsWith("Attempted to cast module")); } } - SECTION("ptr()") { + CATCH_SECTION("ptr()") { struct M : torch::nn::Module { explicit M(int value_) : torch::nn::Module("M"), value(value_) {} int value; @@ -131,24 +131,24 @@ TEST_CASE("any-module") { }; AnyModule any(M{5}); - SECTION("base class cast") { + CATCH_SECTION("base class cast") { auto ptr = any.ptr(); - REQUIRE(ptr != nullptr); - REQUIRE(ptr->name() == "M"); + CATCH_REQUIRE(ptr != nullptr); + CATCH_REQUIRE(ptr->name() == "M"); } - SECTION("good downcast") { + CATCH_SECTION("good downcast") { auto ptr = any.ptr(); - REQUIRE(ptr != nullptr); - REQUIRE(ptr->value == 5); + CATCH_REQUIRE(ptr != nullptr); + CATCH_REQUIRE(ptr->value == 5); } - SECTION("bad downcast") { + CATCH_SECTION("bad downcast") { struct N : torch::nn::Module {}; - REQUIRE_THROWS_WITH(any.ptr(), StartsWith("Attempted to cast module")); + CATCH_REQUIRE_THROWS_WITH(any.ptr(), StartsWith("Attempted to cast module")); } } - SECTION("default state is empty") { + CATCH_SECTION("default state is empty") { struct M : torch::nn::Module { explicit M(int value_) : value(value_) {} int value; @@ -157,33 +157,33 @@ TEST_CASE("any-module") { } }; AnyModule any; - REQUIRE(any.is_empty()); + CATCH_REQUIRE(any.is_empty()); any = std::make_shared(5); - REQUIRE(!any.is_empty()); - REQUIRE(any.get().value == 5); + CATCH_REQUIRE(!any.is_empty()); + CATCH_REQUIRE(any.get().value == 5); } - SECTION("all methods throw for empty AnyModule") { + CATCH_SECTION("all methods throw for empty AnyModule") { struct M : torch::nn::Module { int forward(int x) { return x; } }; AnyModule any; - REQUIRE(any.is_empty()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE(any.is_empty()); + CATCH_REQUIRE_THROWS_WITH( any.get(), StartsWith("Cannot call get() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.ptr(), StartsWith("Cannot call ptr() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.ptr(), StartsWith("Cannot call ptr() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.type_info(), StartsWith("Cannot call type_info() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(5), StartsWith("Cannot call forward() on an empty AnyModule")); } - SECTION("can move assign different modules") { + CATCH_SECTION("can move assign different modules") { struct M : torch::nn::Module { std::string forward(int x) { return std::to_string(x); @@ -195,15 +195,15 @@ TEST_CASE("any-module") { } }; AnyModule any; - REQUIRE(any.is_empty()); + CATCH_REQUIRE(any.is_empty()); any = std::make_shared(); - REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5) == "5"); + CATCH_REQUIRE(!any.is_empty()); + CATCH_REQUIRE(any.forward(5) == "5"); any = std::make_shared(); - REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5.0f) == 8); + CATCH_REQUIRE(!any.is_empty()); + CATCH_REQUIRE(any.forward(5.0f) == 8); } - SECTION("constructs from ModuleHolder") { + CATCH_SECTION("constructs from ModuleHolder") { struct MImpl : torch::nn::Module { explicit MImpl(int value_) : torch::nn::Module("M"), value(value_) {} int value; @@ -218,14 +218,14 @@ TEST_CASE("any-module") { }; AnyModule any(M{5}); - REQUIRE(any.get().value == 5); - REQUIRE(any.get()->value == 5); + CATCH_REQUIRE(any.get().value == 5); + CATCH_REQUIRE(any.get()->value == 5); AnyModule module(Linear(3, 4)); std::shared_ptr ptr = module.ptr(); Linear 
linear(module.get()); } - SECTION("converts autograd::Variable to torch::Tensor correctly") { + CATCH_SECTION("converts autograd::Variable to torch::Tensor correctly") { struct M : torch::nn::Module { torch::Tensor forward(torch::Tensor input) { return input; @@ -236,12 +236,12 @@ TEST_CASE("any-module") { // torch::Tensor before being passed to the function (to avoid a type // mismatch). AnyModule any(M{}); - REQUIRE( + CATCH_REQUIRE( any.forward(torch::autograd::Variable(torch::ones(5))) .sum() .toCFloat() == 5); // at::Tensors that are not variables work too. - REQUIRE(any.forward(at::ones(5)).sum().toCFloat() == 5); + CATCH_REQUIRE(any.forward(at::ones(5)).sum().toCFloat() == 5); } } } @@ -263,92 +263,92 @@ AnyModule::Value make_value(T&& value) { } // namespace nn } // namespace torch -TEST_CASE("any-value") { +CATCH_TEST_CASE("any-value") { torch::manual_seed(0); - SECTION("gets the correct value for the right type") { - SECTION("int") { + CATCH_SECTION("gets the correct value for the right type") { + CATCH_SECTION("int") { auto value = make_value(5); // const and non-const types have the same typeid() - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == 5); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == 5); } - SECTION("const int") { + CATCH_SECTION("const int") { auto value = make_value(5); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == 5); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == 5); } - SECTION("const char*") { + CATCH_SECTION("const char*") { auto value = make_value("hello"); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == std::string("hello")); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == std::string("hello")); } - SECTION("std::string") { + CATCH_SECTION("std::string") { auto value = make_value(std::string("hello")); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == "hello"); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == "hello"); } - SECTION("pointers") { + CATCH_SECTION("pointers") { std::string s("hello"); std::string* p = &s; auto value = make_value(p); - REQUIRE(value.try_get() != nullptr); - REQUIRE(*value.get() == "hello"); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(*value.get() == "hello"); } - SECTION("references") { + CATCH_SECTION("references") { std::string s("hello"); const std::string& t = s; auto value = make_value(t); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == "hello"); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == "hello"); } } - SECTION("try_get returns nullptr for the wrong type") { + CATCH_SECTION("try_get returns nullptr for the wrong type") { auto value = make_value(5); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.try_get() == nullptr); - REQUIRE(value.try_get() == nullptr); - REQUIRE(value.try_get() == nullptr); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.try_get() == nullptr); + CATCH_REQUIRE(value.try_get() == nullptr); + CATCH_REQUIRE(value.try_get() == nullptr); } - SECTION("get throws for the wrong type") { + CATCH_SECTION("get throws for the wrong type") { auto value = make_value(5); - REQUIRE(value.try_get() != nullptr); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE(value.try_get() != nullptr); + 
CATCH_REQUIRE_THROWS_WITH( value.get(), StartsWith("Attempted to cast Value to float, " "but its actual type is int")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( value.get(), StartsWith("Attempted to cast Value to long, " "but its actual type is int")); } - SECTION("move is allowed") { + CATCH_SECTION("move is allowed") { auto value = make_value(5); - SECTION("construction") { + CATCH_SECTION("construction") { auto copy = make_value(std::move(value)); - REQUIRE(copy.try_get() != nullptr); - REQUIRE(copy.get() == 5); + CATCH_REQUIRE(copy.try_get() != nullptr); + CATCH_REQUIRE(copy.get() == 5); } - SECTION("assignment") { + CATCH_SECTION("assignment") { auto copy = make_value(10); copy = std::move(value); - REQUIRE(copy.try_get() != nullptr); - REQUIRE(copy.get() == 5); + CATCH_REQUIRE(copy.try_get() != nullptr); + CATCH_REQUIRE(copy.get() == 5); } } - SECTION("type_info is correct") { - SECTION("int") { + CATCH_SECTION("type_info is correct") { + CATCH_SECTION("int") { auto value = make_value(5); - REQUIRE(value.type_info().hash_code() == typeid(int).hash_code()); + CATCH_REQUIRE(value.type_info().hash_code() == typeid(int).hash_code()); } - SECTION("const char") { + CATCH_SECTION("const char") { auto value = make_value("hello"); - REQUIRE(value.type_info().hash_code() == typeid(const char*).hash_code()); + CATCH_REQUIRE(value.type_info().hash_code() == typeid(const char*).hash_code()); } - SECTION("std::string") { + CATCH_SECTION("std::string") { auto value = make_value(std::string("hello")); - REQUIRE(value.type_info().hash_code() == typeid(std::string).hash_code()); + CATCH_REQUIRE(value.type_info().hash_code() == typeid(std::string).hash_code()); } } } diff --git a/test/cpp/api/catch_utils.hpp b/test/cpp/api/catch_utils.hpp new file mode 100644 index 00000000000000..b9b0a87990a9ce --- /dev/null +++ b/test/cpp/api/catch_utils.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define CATCH_CONFIG_PREFIX_ALL +#include + +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; +// define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS( ... 
) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) diff --git a/test/cpp/api/cursor.cpp b/test/cpp/api/cursor.cpp index 5c998661be2368..e08bd7858dbb4a 100644 --- a/test/cpp/api/cursor.cpp +++ b/test/cpp/api/cursor.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -58,158 +58,158 @@ struct Container : public torch::nn::Module { std::vector> m; }; -TEST_CASE("cursor/module") { +CATCH_TEST_CASE("cursor/module") { torch::manual_seed(0); - SECTION("Works for flat models (depth = 1)") { + CATCH_SECTION("Works for flat models (depth = 1)") { Container model(TestModule(1), TestModule(2), TestModule(3)); auto cursor = model.modules(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(&iterator->value == &model[0]); - REQUIRE(&(++iterator)->value == &model[1]); - REQUIRE(&(++iterator)->value == &model[2]); - REQUIRE(++iterator == cursor.end()); + CATCH_REQUIRE(&iterator->value == &model[0]); + CATCH_REQUIRE(&(++iterator)->value == &model[1]); + CATCH_REQUIRE(&(++iterator)->value == &model[2]); + CATCH_REQUIRE(++iterator == cursor.end()); } - SECTION("names are flat") { + CATCH_SECTION("names are flat") { auto iterator = cursor.begin(); - REQUIRE(iterator->key == "0"); - REQUIRE((++iterator)->key == "1"); - REQUIRE((++iterator)->key == "2"); + CATCH_REQUIRE(iterator->key == "0"); + CATCH_REQUIRE((++iterator)->key == "1"); + CATCH_REQUIRE((++iterator)->key == "2"); } - SECTION("Apply works") { + CATCH_SECTION("Apply works") { size_t count = 0; cursor.apply([&count, &model](Module& module) { - REQUIRE(&module == &model[count]); + CATCH_REQUIRE(&module == &model[count]); count += 1; }); - REQUIRE(count == 3); + CATCH_REQUIRE(count == 3); } - SECTION("Apply_items works") { + CATCH_SECTION("Apply_items works") { size_t count = 0; cursor.apply_items( [&count, &model](const std::string& key, Module& module) { - REQUIRE(&module == &model[count]); + CATCH_REQUIRE(&module == &model[count]); count += 1; }); - REQUIRE(count == 3); + CATCH_REQUIRE(count == 3); } - SECTION("Map works") { + CATCH_SECTION("Map works") { std::vector vector(3); cursor.map(vector.begin(), [](Module& module) { return &module; }); - REQUIRE(vector[0] == &model[0]); - REQUIRE(vector[1] == &model[1]); - REQUIRE(vector[2] == &model[2]); + CATCH_REQUIRE(vector[0] == &model[0]); + CATCH_REQUIRE(vector[1] == &model[1]); + CATCH_REQUIRE(vector[2] == &model[2]); std::list list; cursor.map(std::inserter(list, list.end()), [](Module& module) { return &module; }); - REQUIRE(list.size() == 3); + CATCH_REQUIRE(list.size() == 3); auto iterator = list.begin(); - REQUIRE(*iterator++ == &model[0]); - REQUIRE(*iterator++ == &model[1]); - REQUIRE(*iterator++ == &model[2]); - REQUIRE(iterator == list.end()); + CATCH_REQUIRE(*iterator++ == &model[0]); + CATCH_REQUIRE(*iterator++ == &model[1]); + CATCH_REQUIRE(*iterator++ == &model[2]); + CATCH_REQUIRE(iterator == list.end()); } - SECTION("Map_items works") { + CATCH_SECTION("Map_items works") { std::map output; cursor.map_items( std::inserter(output, output.end()), [](const std::string& key, Module& module) { return std::make_pair(key, &module); }); - REQUIRE(output.size() == 3); - REQUIRE(output.count("0")); - REQUIRE(output.count("1")); - REQUIRE(output.count("2")); - REQUIRE(output["0"] == &model[0]); - REQUIRE(output["1"] == &model[1]); - REQUIRE(output["2"] == &model[2]); + CATCH_REQUIRE(output.size() == 3); + CATCH_REQUIRE(output.count("0")); 
+ CATCH_REQUIRE(output.count("1")); + CATCH_REQUIRE(output.count("2")); + CATCH_REQUIRE(output["0"] == &model[0]); + CATCH_REQUIRE(output["1"] == &model[1]); + CATCH_REQUIRE(output["2"] == &model[2]); } - SECTION("Count works for flat models") { - REQUIRE(cursor.size() == model.m.size()); + CATCH_SECTION("Count works for flat models") { + CATCH_REQUIRE(cursor.size() == model.m.size()); } - SECTION("find() finds the correct modules when given a valid key") { - REQUIRE(cursor.find("0") == &model[0]); - REQUIRE(cursor.find("1") == &model[1]); - REQUIRE(cursor.find("2") == &model[2]); + CATCH_SECTION("find() finds the correct modules when given a valid key") { + CATCH_REQUIRE(cursor.find("0") == &model[0]); + CATCH_REQUIRE(cursor.find("1") == &model[1]); + CATCH_REQUIRE(cursor.find("2") == &model[2]); } - SECTION("find() returns nullptr when given an invalid key") { - REQUIRE(cursor.find("foo") == nullptr); - REQUIRE(cursor.find("bar") == nullptr); + CATCH_SECTION("find() returns nullptr when given an invalid key") { + CATCH_REQUIRE(cursor.find("foo") == nullptr); + CATCH_REQUIRE(cursor.find("bar") == nullptr); } - SECTION("at(key) returns the correct modules when given a valid key") { - REQUIRE(&cursor.at("0") == &model[0]); - REQUIRE(&cursor.at("1") == &model[1]); - REQUIRE(&cursor.at("2") == &model[2]); + CATCH_SECTION("at(key) returns the correct modules when given a valid key") { + CATCH_REQUIRE(&cursor.at("0") == &model[0]); + CATCH_REQUIRE(&cursor.at("1") == &model[1]); + CATCH_REQUIRE(&cursor.at("2") == &model[2]); } - SECTION("at(key) throws when given an invalid key") { - REQUIRE_THROWS_WITH(cursor.at("foo"), StartsWith("No such key: 'foo'")); - REQUIRE_THROWS_WITH(cursor.at("bar"), StartsWith("No such key: 'bar'")); + CATCH_SECTION("at(key) throws when given an invalid key") { + CATCH_REQUIRE_THROWS_WITH(cursor.at("foo"), StartsWith("No such key: 'foo'")); + CATCH_REQUIRE_THROWS_WITH(cursor.at("bar"), StartsWith("No such key: 'bar'")); } - SECTION( + CATCH_SECTION( "operator[key] returns the correct modules when given a valid key") { - REQUIRE(&cursor["0"] == &model[0]); - REQUIRE(&cursor["1"] == &model[1]); - REQUIRE(&cursor["2"] == &model[2]); + CATCH_REQUIRE(&cursor["0"] == &model[0]); + CATCH_REQUIRE(&cursor["1"] == &model[1]); + CATCH_REQUIRE(&cursor["2"] == &model[2]); } - SECTION("operator[key] throws when given an invalid key") { - REQUIRE_THROWS_WITH(cursor["foo"], StartsWith("No such key: 'foo'")); - REQUIRE_THROWS_WITH(cursor["bar"], StartsWith("No such key: 'bar'")); + CATCH_SECTION("operator[key] throws when given an invalid key") { + CATCH_REQUIRE_THROWS_WITH(cursor["foo"], StartsWith("No such key: 'foo'")); + CATCH_REQUIRE_THROWS_WITH(cursor["bar"], StartsWith("No such key: 'bar'")); } - SECTION("at(index) returns the correct modules when given a valid index") { - REQUIRE(&cursor.at(0).value == &model[0]); - REQUIRE(&cursor.at(1).value == &model[1]); - REQUIRE(&cursor.at(2).value == &model[2]); + CATCH_SECTION("at(index) returns the correct modules when given a valid index") { + CATCH_REQUIRE(&cursor.at(0).value == &model[0]); + CATCH_REQUIRE(&cursor.at(1).value == &model[1]); + CATCH_REQUIRE(&cursor.at(2).value == &model[2]); } - SECTION("at(index) throws when given an invalid index") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("at(index) throws when given an invalid index") { + CATCH_REQUIRE_THROWS_WITH( cursor.at(5), StartsWith("Index 5 is out of range for cursor of size 3")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( cursor.at(123), StartsWith("Index 123 is 
out of range for cursor of size 3")); } - SECTION( + CATCH_SECTION( "operator[index] returns the correct modules when given a valid index") { - REQUIRE(&cursor[0].value == &model[0]); - REQUIRE(&cursor[1].value == &model[1]); - REQUIRE(&cursor[2].value == &model[2]); + CATCH_REQUIRE(&cursor[0].value == &model[0]); + CATCH_REQUIRE(&cursor[1].value == &model[1]); + CATCH_REQUIRE(&cursor[2].value == &model[2]); } - SECTION("operator[index] throws when given an invalid key") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("operator[index] throws when given an invalid key") { + CATCH_REQUIRE_THROWS_WITH( cursor[5], StartsWith("Index 5 is out of range for cursor of size 3")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( cursor[123], StartsWith("Index 123 is out of range for cursor of size 3")); } - SECTION("contains() is correct") { - REQUIRE(cursor.contains("0")); - REQUIRE(cursor.contains("1")); - REQUIRE(cursor.contains("2")); + CATCH_SECTION("contains() is correct") { + CATCH_REQUIRE(cursor.contains("0")); + CATCH_REQUIRE(cursor.contains("1")); + CATCH_REQUIRE(cursor.contains("2")); } } - SECTION("Works for deeper hierarchies (depth > 1)") { + CATCH_SECTION("Works for deeper hierarchies (depth > 1)") { // clang-format off Container model( Container( @@ -227,106 +227,106 @@ TEST_CASE("cursor/module") { auto cursor = model.modules(); // This is sufficient for the hierarchical case // (other tests build on top) - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(&iterator->value == &model[0]); + CATCH_REQUIRE(&iterator->value == &model[0]); auto* seq = dynamic_cast(&model[0]); - REQUIRE(seq != nullptr); - REQUIRE(&(++iterator)->value == &(*seq)[0]); - REQUIRE(&(++iterator)->value == &(*seq)[1]); + CATCH_REQUIRE(seq != nullptr); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[0]); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[1]); - REQUIRE(&(++iterator)->value == &model[1]); - REQUIRE(&(++iterator)->value == &model[2]); + CATCH_REQUIRE(&(++iterator)->value == &model[1]); + CATCH_REQUIRE(&(++iterator)->value == &model[2]); seq = dynamic_cast(&model[2]); - REQUIRE(seq != nullptr); - REQUIRE(&(++iterator)->value == &(*seq)[0]); - REQUIRE(&(++iterator)->value == &(*seq)[1]); + CATCH_REQUIRE(seq != nullptr); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[0]); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[1]); seq = dynamic_cast(&(*seq)[1]); - REQUIRE(seq != nullptr); - REQUIRE(&(++iterator)->value == &(*seq)[0]); - REQUIRE(&(++iterator)->value == &(*seq)[1]); + CATCH_REQUIRE(seq != nullptr); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[0]); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[1]); } - SECTION("children() returns only the first level of submodules") { + CATCH_SECTION("children() returns only the first level of submodules") { auto children = model.children(); - REQUIRE(children.size() == 3); - REQUIRE(&children.at("0") == &model[0]); - REQUIRE(&children.at("1") == &model[1]); - REQUIRE(&children.at("2") == &model[2]); - REQUIRE(!children.contains("0.0")); + CATCH_REQUIRE(children.size() == 3); + CATCH_REQUIRE(&children.at("0") == &model[0]); + CATCH_REQUIRE(&children.at("1") == &model[1]); + CATCH_REQUIRE(&children.at("2") == &model[2]); + CATCH_REQUIRE(!children.contains("0.0")); size_t count = 0; for (auto& child : children) { - REQUIRE(child.key == std::to_string(count)); - REQUIRE(&child.value == &model[count]); + CATCH_REQUIRE(child.key == std::to_string(count)); + CATCH_REQUIRE(&child.value 
== &model[count]); count += 1; } } } } -TEST_CASE("cursor/parameter") { +CATCH_TEST_CASE("cursor/parameter") { torch::manual_seed(0); - SECTION("Works for single models") { + CATCH_SECTION("Works for single models") { TestModule model(1); auto cursor = model.parameters(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(iterator->value.equal(model.tensor1)); - REQUIRE((++iterator)->value.equal(model.tensor2)); + CATCH_REQUIRE(iterator->value.equal(model.tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(model.tensor2)); } } - SECTION("Works for flat models (depth = 1)") { + CATCH_SECTION("Works for flat models (depth = 1)") { auto first = std::make_shared(1); auto second = std::make_shared(2); Container model(first, second); auto cursor = model.parameters(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(iterator->value.equal(first->tensor1)); - REQUIRE((++iterator)->value.equal(first->tensor2)); - REQUIRE((++iterator)->value.equal(second->tensor1)); - REQUIRE((++iterator)->value.equal(second->tensor2)); + CATCH_REQUIRE(iterator->value.equal(first->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(first->tensor2)); + CATCH_REQUIRE((++iterator)->value.equal(second->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(second->tensor2)); } - SECTION("Apply_items works") { + CATCH_SECTION("Apply_items works") { size_t count = 0; cursor.apply_items([&count, &model, &first, &second]( const std::string& key, torch::Tensor& tensor) { switch (count) { case 0: { - REQUIRE(tensor.equal(first->tensor1)); + CATCH_REQUIRE(tensor.equal(first->tensor1)); break; } case 1: { - REQUIRE(tensor.equal(first->tensor2)); + CATCH_REQUIRE(tensor.equal(first->tensor2)); break; } case 2: { - REQUIRE(tensor.equal(second->tensor1)); + CATCH_REQUIRE(tensor.equal(second->tensor1)); break; } case 3: { - REQUIRE(tensor.equal(second->tensor2)); + CATCH_REQUIRE(tensor.equal(second->tensor2)); break; } } count += 1; }); - REQUIRE(count == 4); + CATCH_REQUIRE(count == 4); } // Other tests are correct based on correct iteration behavior and apply // working. 
} - SECTION("Works for deeper hierarchies (depth > 1)") { + CATCH_SECTION("Works for deeper hierarchies (depth > 1)") { std::vector> modules; for (size_t i = 1; i <= 6; ++i) { modules.push_back(std::make_shared(i)); @@ -346,36 +346,36 @@ TEST_CASE("cursor/parameter") { // clang-format on auto cursor = model.parameters(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(iterator->value.equal(modules[0]->tensor1)); - REQUIRE((++iterator)->value.equal(modules[0]->tensor2)); + CATCH_REQUIRE(iterator->value.equal(modules[0]->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(modules[0]->tensor2)); for (size_t index = 1; index < 6; ++index) { - REQUIRE((++iterator)->value.equal(modules[index]->tensor1)); - REQUIRE((++iterator)->value.equal(modules[index]->tensor2)); + CATCH_REQUIRE((++iterator)->value.equal(modules[index]->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(modules[index]->tensor2)); } } - SECTION("names are hierarchical") { + CATCH_SECTION("names are hierarchical") { auto iterator = cursor.begin(); - REQUIRE(iterator->key == "0.0.tensor1"); - REQUIRE((++iterator)->key == "0.0.tensor2"); - REQUIRE((++iterator)->key == "0.1.tensor1"); - REQUIRE((++iterator)->key == "0.1.tensor2"); - REQUIRE((++iterator)->key == "1.tensor1"); - REQUIRE((++iterator)->key == "1.tensor2"); - REQUIRE((++iterator)->key == "2.0.tensor1"); - REQUIRE((++iterator)->key == "2.0.tensor2"); - REQUIRE((++iterator)->key == "2.1.0.tensor1"); - REQUIRE((++iterator)->key == "2.1.0.tensor2"); - REQUIRE((++iterator)->key == "2.1.1.tensor1"); - REQUIRE((++iterator)->key == "2.1.1.tensor2"); - REQUIRE(++iterator == cursor.end()); + CATCH_REQUIRE(iterator->key == "0.0.tensor1"); + CATCH_REQUIRE((++iterator)->key == "0.0.tensor2"); + CATCH_REQUIRE((++iterator)->key == "0.1.tensor1"); + CATCH_REQUIRE((++iterator)->key == "0.1.tensor2"); + CATCH_REQUIRE((++iterator)->key == "1.tensor1"); + CATCH_REQUIRE((++iterator)->key == "1.tensor2"); + CATCH_REQUIRE((++iterator)->key == "2.0.tensor1"); + CATCH_REQUIRE((++iterator)->key == "2.0.tensor2"); + CATCH_REQUIRE((++iterator)->key == "2.1.0.tensor1"); + CATCH_REQUIRE((++iterator)->key == "2.1.0.tensor2"); + CATCH_REQUIRE((++iterator)->key == "2.1.1.tensor1"); + CATCH_REQUIRE((++iterator)->key == "2.1.1.tensor2"); + CATCH_REQUIRE(++iterator == cursor.end()); } } } -TEST_CASE("cursor/non-const-to-const-conversion") { +CATCH_TEST_CASE("cursor/non-const-to-const-conversion") { torch::manual_seed(0); auto first = std::make_shared(1); auto second = std::make_shared(2); @@ -404,11 +404,11 @@ TEST_CASE("cursor/non-const-to-const-conversion") { } } -TEST_CASE("cursor/can-invoke-const-method-on-const-cursor") { +CATCH_TEST_CASE("cursor/can-invoke-const-method-on-const-cursor") { torch::manual_seed(0); TestModule model(1); /// This will only compile if `Cursor` has the appropriate const methods. 
const auto cursor = model.parameters(); - REQUIRE(cursor.contains("tensor1")); + CATCH_REQUIRE(cursor.contains("tensor1")); } diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index 8d753197754c19..972223a89fd42f 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -230,7 +230,7 @@ bool test_mnist( return correct.sum().toCFloat() > telabel.size(0) * 0.8; } -TEST_CASE("integration/cartpole") { +CATCH_TEST_CASE("integration/cartpole") { torch::manual_seed(0); std::cerr << "Training episodic policy gradient with a critic for up to 3000" " episodes, rest your eyes for a bit!\n"; @@ -326,11 +326,11 @@ TEST_CASE("integration/cartpole") { if (running_reward > 150) { break; } - REQUIRE(episode < 3000); + CATCH_REQUIRE(episode < 3000); } } -TEST_CASE("integration/mnist", "[cuda]") { +CATCH_TEST_CASE("integration/mnist", "[cuda]") { torch::manual_seed(0); auto model = std::make_shared(); auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); @@ -357,7 +357,7 @@ TEST_CASE("integration/mnist", "[cuda]") { auto optimizer = torch::optim::SGD( model->parameters(), torch::optim::SGDOptions(1e-2).momentum(0.5)); - REQUIRE(test_mnist( + CATCH_REQUIRE(test_mnist( 32, // batch_size 3, // num_epochs true, // useGPU @@ -366,7 +366,7 @@ TEST_CASE("integration/mnist", "[cuda]") { optimizer)); } -TEST_CASE("integration/mnist/batchnorm", "[cuda]") { +CATCH_TEST_CASE("integration/mnist/batchnorm", "[cuda]") { torch::manual_seed(0); auto model = std::make_shared(); auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); @@ -393,7 +393,7 @@ TEST_CASE("integration/mnist/batchnorm", "[cuda]") { auto optimizer = torch::optim::SGD( model->parameters(), torch::optim::SGDOptions(1e-2).momentum(0.5)); - REQUIRE(test_mnist( + CATCH_REQUIRE(test_mnist( 32, // batch_size 3, // num_epochs true, // useGPU diff --git a/test/cpp/api/jit.cpp b/test/cpp/api/jit.cpp index c46868cdbf6989..b477b11c8e5ae1 100644 --- a/test/cpp/api/jit.cpp +++ b/test/cpp/api/jit.cpp @@ -1,12 +1,12 @@ -#include +#include "catch_utils.hpp" #include #include #include -TEST_CASE("torch script") { - SECTION("multiple functions") { +CATCH_TEST_CASE("torch script") { + CATCH_SECTION("multiple functions") { auto module = torch::jit::compile(R"JIT( def test_mul(a, b): return a * b @@ -21,11 +21,11 @@ TEST_CASE("torch script") { auto a = torch::ones(1); auto b = torch::ones(1); - REQUIRE(1 == module->run_method("test_mul", a, b).toTensor().toCLong()); + CATCH_REQUIRE(1 == module->run_method("test_mul", a, b).toTensor().toCLong()); - REQUIRE(2 == module->run_method("test_relu", a, b).toTensor().toCLong()); + CATCH_REQUIRE(2 == module->run_method("test_relu", a, b).toTensor().toCLong()); - REQUIRE( + CATCH_REQUIRE( 0x200 == module->run_method("test_while", a, b).toTensor().toCLong()); } } diff --git a/test/cpp/api/main.cpp b/test/cpp/api/main.cpp index 4b1aaba64b2ef1..92ea3567a3b9a7 100644 --- a/test/cpp/api/main.cpp +++ b/test/cpp/api/main.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_RUNNER -#include +#include "catch_utils.hpp" #include diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index 6d065bf6fa6523..8ced0e0a750dc8 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -18,7 +18,7 @@ using OrderedDict = torch::detail::OrderedDict; using Catch::StartsWith; -TEST_CASE("NoGrad") { +CATCH_TEST_CASE("NoGrad") { torch::manual_seed(0); torch::NoGradGuard guard; 
Linear model(5, 2); @@ -27,88 +27,88 @@ TEST_CASE("NoGrad") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(!model->parameters()["weight"].grad().defined()); + CATCH_REQUIRE(!model->parameters()["weight"].grad().defined()); } -TEST_CASE("autograd") { +CATCH_TEST_CASE("autograd") { torch::manual_seed(0); auto x = torch::randn({3, 3}, torch::requires_grad()); auto y = torch::randn({3, 3}); auto z = x * y; - SECTION("derivatives of zero-dim tensors") { + CATCH_SECTION("derivatives of zero-dim tensors") { z.sum().backward(); - REQUIRE(x.grad().allclose(y)); + CATCH_REQUIRE(x.grad().allclose(y)); } - SECTION("derivatives of tensors") { + CATCH_SECTION("derivatives of tensors") { z.backward(); - REQUIRE(x.grad().allclose(y)); + CATCH_REQUIRE(x.grad().allclose(y)); } - SECTION("custom gradient inputs") { + CATCH_SECTION("custom gradient inputs") { z.sum().backward(torch::ones({}) * 2); - REQUIRE(x.grad().allclose(y * 2)); + CATCH_REQUIRE(x.grad().allclose(y * 2)); } // Assume everything else is safe from PyTorch tests. } -TEST_CASE("nn::init") { +CATCH_TEST_CASE("nn::init") { auto tensor = torch::empty({3, 4}, torch::requires_grad()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor.fill_(1), StartsWith("a leaf Variable that requires grad " "has been used in an in-place operation")); - REQUIRE(torch::nn::init::ones_(tensor).sum().toCInt() == 12); + CATCH_REQUIRE(torch::nn::init::ones_(tensor).sum().toCInt() == 12); } -TEST_CASE("expanding-array") { +CATCH_TEST_CASE("expanding-array") { torch::manual_seed(0); - SECTION("successful construction") { - SECTION("initializer_list") { + CATCH_SECTION("successful construction") { + CATCH_SECTION("initializer_list") { torch::ExpandingArray<5> e({1, 2, 3, 4, 5}); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == i + 1); + CATCH_REQUIRE((*e)[i] == i + 1); } } - SECTION("vector") { + CATCH_SECTION("vector") { torch::ExpandingArray<5> e(std::vector{1, 2, 3, 4, 5}); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == i + 1); + CATCH_REQUIRE((*e)[i] == i + 1); } } - SECTION("array") { + CATCH_SECTION("array") { torch::ExpandingArray<5> e(std::array({1, 2, 3, 4, 5})); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == i + 1); + CATCH_REQUIRE((*e)[i] == i + 1); } } - SECTION("single value") { + CATCH_SECTION("single value") { torch::ExpandingArray<5> e(5); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == 5); + CATCH_REQUIRE((*e)[i] == 5); } } } - SECTION("throws for incorrect size on construction") { - SECTION("initializer_list") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("throws for incorrect size on construction") { + CATCH_SECTION("initializer_list") { + CATCH_REQUIRE_THROWS_WITH( torch::ExpandingArray<5>({1, 2, 3, 4, 5, 6, 7}), StartsWith("Expected 5 values, but instead got 7")); } - SECTION("vector") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("vector") { + CATCH_REQUIRE_THROWS_WITH( torch::ExpandingArray<5>(std::vector({1, 2, 3, 4, 5, 6, 7})), StartsWith("Expected 5 values, but instead got 7")); } } } -TEST_CASE("make_unique") { +CATCH_TEST_CASE("make_unique") { struct Test { explicit Test(const int& x) : lvalue_(x) {} explicit Test(int&& x) : rvalue_(x) {} @@ -117,216 +117,216 @@ TEST_CASE("make_unique") { at::optional rvalue_; }; - SECTION("forwards rvalues correctly") { + 
CATCH_SECTION("forwards rvalues correctly") { auto ptr = torch::make_unique(123); - REQUIRE(!ptr->lvalue_.has_value()); - REQUIRE(ptr->rvalue_.has_value()); - REQUIRE(*ptr->rvalue_ == 123); + CATCH_REQUIRE(!ptr->lvalue_.has_value()); + CATCH_REQUIRE(ptr->rvalue_.has_value()); + CATCH_REQUIRE(*ptr->rvalue_ == 123); } - SECTION("forwards lvalues correctly") { + CATCH_SECTION("forwards lvalues correctly") { int x = 5; auto ptr = torch::make_unique(x); - REQUIRE(ptr->lvalue_.has_value()); - REQUIRE(*ptr->lvalue_ == 5); - REQUIRE(!ptr->rvalue_.has_value()); + CATCH_REQUIRE(ptr->lvalue_.has_value()); + CATCH_REQUIRE(*ptr->lvalue_ == 5); + CATCH_REQUIRE(!ptr->rvalue_.has_value()); } - SECTION("Can construct unique_ptr of array") { + CATCH_SECTION("Can construct unique_ptr of array") { auto ptr = torch::make_unique(3); // Value initialization is required by the standard. - REQUIRE(ptr[0] == 0); - REQUIRE(ptr[1] == 0); - REQUIRE(ptr[2] == 0); + CATCH_REQUIRE(ptr[0] == 0); + CATCH_REQUIRE(ptr[1] == 0); + CATCH_REQUIRE(ptr[2] == 0); } } -TEST_CASE("ordered-dict") { - SECTION("is empty after default construction") { +CATCH_TEST_CASE("ordered-dict") { + CATCH_SECTION("is empty after default construction") { OrderedDict dict; - REQUIRE(dict.subject() == "Key"); - REQUIRE(dict.is_empty()); - REQUIRE(dict.size() == 0); + CATCH_REQUIRE(dict.subject() == "Key"); + CATCH_REQUIRE(dict.is_empty()); + CATCH_REQUIRE(dict.size() == 0); } - SECTION("insert inserts elements when they are not yet present") { + CATCH_SECTION("insert inserts elements when they are not yet present") { OrderedDict dict; dict.insert("a", 1); dict.insert("b", 2); - REQUIRE(dict.size() == 2); + CATCH_REQUIRE(dict.size() == 2); } - SECTION("get returns values when present") { + CATCH_SECTION("get returns values when present") { OrderedDict dict; dict.insert("a", 1); dict.insert("b", 2); - REQUIRE(dict.get("a") == 1); - REQUIRE(dict.get("b") == 2); + CATCH_REQUIRE(dict.get("a") == 1); + CATCH_REQUIRE(dict.get("b") == 2); } - SECTION("get throws when passed keys that are not present") { + CATCH_SECTION("get throws when passed keys that are not present") { OrderedDict dict; dict.insert("a", 1); dict.insert("b", 2); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.get("foo"), StartsWith("Key 'foo' is not defined")); - REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + CATCH_REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); } - SECTION("can initialize from list") { + CATCH_SECTION("can initialize from list") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.size() == 2); - REQUIRE(dict.get("a") == 1); - REQUIRE(dict.get("b") == 2); + CATCH_REQUIRE(dict.size() == 2); + CATCH_REQUIRE(dict.get("a") == 1); + CATCH_REQUIRE(dict.get("b") == 2); } - SECTION("insert throws when passed elements that are present") { + CATCH_SECTION("insert throws when passed elements that are present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.insert("a", 1), StartsWith("Key 'a' already defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.insert("b", 1), StartsWith("Key 'b' already defined")); } - SECTION("front() returns the first item") { + CATCH_SECTION("front() returns the first item") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.front().key == "a"); - REQUIRE(dict.front().value == 1); + CATCH_REQUIRE(dict.front().key == "a"); + CATCH_REQUIRE(dict.front().value == 1); } - SECTION("back() returns the last item") { + 
CATCH_SECTION("back() returns the last item") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.back().key == "b"); - REQUIRE(dict.back().value == 2); + CATCH_REQUIRE(dict.back().key == "b"); + CATCH_REQUIRE(dict.back().value == 2); } - SECTION("find returns pointers to values when present") { + CATCH_SECTION("find returns pointers to values when present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.find("a") != nullptr); - REQUIRE(*dict.find("a") == 1); - REQUIRE(dict.find("b") != nullptr); - REQUIRE(*dict.find("b") == 2); + CATCH_REQUIRE(dict.find("a") != nullptr); + CATCH_REQUIRE(*dict.find("a") == 1); + CATCH_REQUIRE(dict.find("b") != nullptr); + CATCH_REQUIRE(*dict.find("b") == 2); } - SECTION("find returns null pointers when passed keys that are not present") { + CATCH_SECTION("find returns null pointers when passed keys that are not present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.find("bar") == nullptr); - REQUIRE(dict.find("") == nullptr); + CATCH_REQUIRE(dict.find("bar") == nullptr); + CATCH_REQUIRE(dict.find("") == nullptr); } - SECTION("operator[] returns values when passed keys that are present") { + CATCH_SECTION("operator[] returns values when passed keys that are present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict["a"] == 1); - REQUIRE(dict["b"] == 2); + CATCH_REQUIRE(dict["a"] == 1); + CATCH_REQUIRE(dict["b"] == 2); } - SECTION("operator[] returns items positionally when passed integers") { + CATCH_SECTION("operator[] returns items positionally when passed integers") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict[0].key == "a"); - REQUIRE(dict[0].value == 1); - REQUIRE(dict[1].key == "b"); - REQUIRE(dict[1].value == 2); + CATCH_REQUIRE(dict[0].key == "a"); + CATCH_REQUIRE(dict[0].value == 1); + CATCH_REQUIRE(dict[1].key == "b"); + CATCH_REQUIRE(dict[1].value == 2); } - SECTION("operator[] throws when passed keys that are not present") { + CATCH_SECTION("operator[] throws when passed keys that are not present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.get("foo"), StartsWith("Key 'foo' is not defined")); - REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + CATCH_REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); } - SECTION("update inserts all items from another OrderedDict") { + CATCH_SECTION("update inserts all items from another OrderedDict") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict dict2 = {{"c", 3}}; dict2.update(dict); - REQUIRE(dict2.size() == 3); - REQUIRE(dict2.find("a") != nullptr); - REQUIRE(dict2.find("b") != nullptr); - REQUIRE(dict2.find("c") != nullptr); + CATCH_REQUIRE(dict2.size() == 3); + CATCH_REQUIRE(dict2.find("a") != nullptr); + CATCH_REQUIRE(dict2.find("b") != nullptr); + CATCH_REQUIRE(dict2.find("c") != nullptr); } - SECTION("update also checks for duplicates") { + CATCH_SECTION("update also checks for duplicates") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict dict2 = {{"a", 1}}; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict2.update(dict), StartsWith("Key 'a' already defined")); } - SECTION("Can iterate items") { + CATCH_SECTION("Can iterate items") { OrderedDict dict = {{"a", 1}, {"b", 2}}; auto iterator = dict.begin(); - REQUIRE(iterator != dict.end()); - REQUIRE(iterator->key == "a"); - REQUIRE(iterator->value == 1); + CATCH_REQUIRE(iterator != dict.end()); + CATCH_REQUIRE(iterator->key == "a"); + CATCH_REQUIRE(iterator->value == 1); ++iterator; - 
REQUIRE(iterator != dict.end()); - REQUIRE(iterator->key == "b"); - REQUIRE(iterator->value == 2); + CATCH_REQUIRE(iterator != dict.end()); + CATCH_REQUIRE(iterator->key == "b"); + CATCH_REQUIRE(iterator->value == 2); ++iterator; - REQUIRE(iterator == dict.end()); + CATCH_REQUIRE(iterator == dict.end()); } - SECTION("clear makes the dict empty") { + CATCH_SECTION("clear makes the dict empty") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(!dict.is_empty()); + CATCH_REQUIRE(!dict.is_empty()); dict.clear(); - REQUIRE(dict.is_empty()); + CATCH_REQUIRE(dict.is_empty()); } - SECTION("can copy construct") { + CATCH_SECTION("can copy construct") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = dict; - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); } - SECTION("can copy assign") { + CATCH_SECTION("can copy assign") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = {{"c", 1}}; - REQUIRE(copy.find("c") != nullptr); + CATCH_REQUIRE(copy.find("c") != nullptr); copy = dict; - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); - REQUIRE(copy.find("c") == nullptr); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.find("c") == nullptr); } - SECTION("can move construct") { + CATCH_SECTION("can move construct") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = std::move(dict); - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); } - SECTION("can move assign") { + CATCH_SECTION("can move assign") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = {{"c", 1}}; - REQUIRE(copy.find("c") != nullptr); + CATCH_REQUIRE(copy.find("c") != nullptr); copy = std::move(dict); - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); - REQUIRE(copy.find("c") == nullptr); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.find("c") == nullptr); } - SECTION("can insert with braces") { + CATCH_SECTION("can insert with braces") { OrderedDict> dict; dict.insert("a", {1, 2}); - REQUIRE(!dict.is_empty()); - REQUIRE(dict["a"].first == 1); - REQUIRE(dict["a"].second == 2); + CATCH_REQUIRE(!dict.is_empty()); + CATCH_REQUIRE(dict["a"].first == 1); + CATCH_REQUIRE(dict["a"].second == 2); } - SECTION("Error messages include the what") { + CATCH_SECTION("Error messages include the what") { OrderedDict dict("Penguin"); - REQUIRE(dict.subject() == "Penguin"); + CATCH_REQUIRE(dict.subject() == "Penguin"); dict.insert("a", 1); - REQUIRE(!dict.is_empty()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE(!dict.is_empty()); + CATCH_REQUIRE_THROWS_WITH( dict.get("b"), StartsWith("Penguin 'b' is not defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.insert("a", 1), StartsWith("Penguin 'a' already defined")); } } diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp index d4049d6b270b26..2b9d0ad99c3477 100644 --- a/test/cpp/api/module.cpp +++ b/test/cpp/api/module.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -22,21 +22,21 @@ struct AGIUnit2 : torch::nn::Module { }; } // namespace test -TEST_CASE("module/training-mode") { +CATCH_TEST_CASE("module/training-mode") { torch::manual_seed(0); Linear module(3, 
4); - REQUIRE(module->is_training()); - SECTION("Enable eval mode") { + CATCH_REQUIRE(module->is_training()); + CATCH_SECTION("Enable eval mode") { module->eval(); - REQUIRE(!module->is_training()); + CATCH_REQUIRE(!module->is_training()); } - SECTION("Enable train mode") { + CATCH_SECTION("Enable train mode") { module->train(); - REQUIRE(module->is_training()); + CATCH_REQUIRE(module->is_training()); } } -TEST_CASE("module/zero-grad") { +CATCH_TEST_CASE("module/zero-grad") { torch::manual_seed(0); Linear module(3, 4); auto weight = torch::ones({8, 3}, torch::requires_grad()); @@ -44,18 +44,18 @@ TEST_CASE("module/zero-grad") { loss.backward(); for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); - REQUIRE(grad.defined()); - REQUIRE(grad.sum().toCFloat() != 0); + CATCH_REQUIRE(grad.defined()); + CATCH_REQUIRE(grad.sum().toCFloat() != 0); } module->zero_grad(); for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); - REQUIRE(grad.defined()); - REQUIRE(grad.sum().toCFloat() == 0); + CATCH_REQUIRE(grad.defined()); + CATCH_REQUIRE(grad.sum().toCFloat() == 0); } } -TEST_CASE("module/zero-grad-with-undefined") { +CATCH_TEST_CASE("module/zero-grad-with-undefined") { struct TestModule : torch::nn::Module { TestModule() { x = register_parameter("x", torch::ones(5, at::requires_grad())); @@ -68,120 +68,120 @@ TEST_CASE("module/zero-grad-with-undefined") { auto z = module.x * 2; z.sum().backward(); - REQUIRE(module.x.grad().defined()); - REQUIRE(!module.y.grad().defined()); + CATCH_REQUIRE(module.x.grad().defined()); + CATCH_REQUIRE(!module.y.grad().defined()); module.zero_grad(); - REQUIRE(module.x.grad().defined()); - REQUIRE(!module.y.grad().defined()); + CATCH_REQUIRE(module.x.grad().defined()); + CATCH_REQUIRE(!module.y.grad().defined()); - REQUIRE(module.x.grad().sum().toCFloat() == 0); + CATCH_REQUIRE(module.x.grad().sum().toCFloat() == 0); } -TEST_CASE("module/name") { +CATCH_TEST_CASE("module/name") { // CHECK instead of REQUIRE because demangling may fail. AGIUnit agi; // Call it twice just to make sure there are no bugs in the lazy // initialization semantics. 
- CHECK(agi.name() == "AGIUnit"); - CHECK(agi.name() == "AGIUnit"); - SECTION("correctly demangled") { - CHECK(test::AGIUnit().name() == "test::AGIUnit"); - CHECK(test::AGIUnit2().name() == "Foo"); + CATCH_CHECK(agi.name() == "AGIUnit"); + CATCH_CHECK(agi.name() == "AGIUnit"); + CATCH_SECTION("correctly demangled") { + CATCH_CHECK(test::AGIUnit().name() == "test::AGIUnit"); + CATCH_CHECK(test::AGIUnit2().name() == "Foo"); } } -TEST_CASE("module/as") { +CATCH_TEST_CASE("module/as") { Linear module(3, 4); - REQUIRE(module->as() == module.get()); - REQUIRE(module->as() == module.get()); - REQUIRE(module->as() == module.get()); - REQUIRE(module->as() == nullptr); + CATCH_REQUIRE(module->as() == module.get()); + CATCH_REQUIRE(module->as() == module.get()); + CATCH_REQUIRE(module->as() == module.get()); + CATCH_REQUIRE(module->as() == nullptr); std::shared_ptr raw = module.ptr(); - REQUIRE(raw->as() == module.get()); - REQUIRE(raw->as() == module.get()); - REQUIRE(raw->as() == module.get()); - REQUIRE(raw->as() == nullptr); + CATCH_REQUIRE(raw->as() == module.get()); + CATCH_REQUIRE(raw->as() == module.get()); + CATCH_REQUIRE(raw->as() == module.get()); + CATCH_REQUIRE(raw->as() == nullptr); Module& raw_ref = *raw.get(); - REQUIRE(raw_ref.as() == module.get()); - REQUIRE(raw_ref.as() == module.get()); - REQUIRE(raw_ref.as() == module.get()); - REQUIRE(raw_ref.as() == nullptr); + CATCH_REQUIRE(raw_ref.as() == module.get()); + CATCH_REQUIRE(raw_ref.as() == module.get()); + CATCH_REQUIRE(raw_ref.as() == module.get()); + CATCH_REQUIRE(raw_ref.as() == nullptr); if (auto* linear = raw_ref.as()) { - REQUIRE(linear->weight.ndimension() == 2); + CATCH_REQUIRE(linear->weight.ndimension() == 2); } AGIUnit unit; - REQUIRE(unit.as() == nullptr); - REQUIRE(unit.as() == nullptr); - REQUIRE(unit.as() == &unit); + CATCH_REQUIRE(unit.as() == nullptr); + CATCH_REQUIRE(unit.as() == nullptr); + CATCH_REQUIRE(unit.as() == &unit); } -TEST_CASE("module/conversions", "[multi-cuda]") { +CATCH_TEST_CASE("module/conversions", "[multi-cuda]") { torch::manual_seed(0); Linear module(128, 64); - SECTION("starts as float on CPU") { + CATCH_SECTION("starts as float on CPU") { for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device() == torch::Device(torch::kCPU)); - REQUIRE(parameter->dtype() == torch::kFloat32); + CATCH_REQUIRE(parameter->device() == torch::Device(torch::kCPU)); + CATCH_REQUIRE(parameter->dtype() == torch::kFloat32); } } - SECTION("to(CUDA)") { + CATCH_SECTION("to(CUDA)") { module->to({torch::kCUDA, 0}); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); - REQUIRE(parameter->device().index() == 0); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); + CATCH_REQUIRE(parameter->device().index() == 0); } module->to({at::kCUDA, 1}); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); - REQUIRE(parameter->device().index() == 1); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); + CATCH_REQUIRE(parameter->device().index() == 1); } } - SECTION("to(CPU)") { + CATCH_SECTION("to(CPU)") { module->to(torch::Device(torch::kCPU)); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CPU); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CPU); } } - SECTION("to(Int32)") { + CATCH_SECTION("to(Int32)") { module->to(torch::kInt32); for (auto& parameter : 
module->parameters()) { - REQUIRE(parameter->dtype() == torch::kInt32); + CATCH_REQUIRE(parameter->dtype() == torch::kInt32); } } - SECTION("to(Float64)") { + CATCH_SECTION("to(Float64)") { module->to(torch::kFloat64); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->dtype() == torch::kFloat64); + CATCH_REQUIRE(parameter->dtype() == torch::kFloat64); } } - SECTION("to(CUDA, Byte)") { + CATCH_SECTION("to(CUDA, Byte)") { module->to(torch::Device(torch::kCUDA, 1), torch::kUInt8); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); - REQUIRE(parameter->device().index() == 1); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); + CATCH_REQUIRE(parameter->device().index() == 1); } for (auto& parameter : module->parameters()) { - REQUIRE(parameter->dtype() == torch::kUInt8); + CATCH_REQUIRE(parameter->dtype() == torch::kUInt8); } } } -TEST_CASE("module/clone") { +CATCH_TEST_CASE("module/clone") { torch::manual_seed(0); - SECTION( + CATCH_SECTION( "a module that does not override clone() throws when clone() is called") { struct UnCloneable : Module {}; UnCloneable module; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( module.clone(), StartsWith("clone() has not been implemented")); } - SECTION( + CATCH_SECTION( "a module that overrides clone() does not throw when clone() is called ") { struct Cloneable : Module { std::shared_ptr clone( @@ -190,10 +190,10 @@ TEST_CASE("module/clone") { } }; Cloneable module; - REQUIRE_NOTHROW(module.clone()); + CATCH_REQUIRE_NOTHROW(module.clone()); } - SECTION("Cloning creates distinct parameters") { + CATCH_SECTION("Cloning creates distinct parameters") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -216,32 +216,32 @@ TEST_CASE("module/clone") { auto module2 = module->clone(); auto params1 = module->parameters(); auto params2 = module2->parameters(); - REQUIRE(params1.size() == 6); - REQUIRE(params2.size() == 6); + CATCH_REQUIRE(params1.size() == 6); + CATCH_REQUIRE(params2.size() == 6); for (auto& param : params1) { - REQUIRE(!pointer_equal(param.value, params2[param.key])); - REQUIRE(param->allclose(params2[param.key])); + CATCH_REQUIRE(!pointer_equal(param.value, params2[param.key])); + CATCH_REQUIRE(param->allclose(params2[param.key])); param->add_(2); } for (auto& param : params1) { - REQUIRE(!param->allclose(params2[param.key])); + CATCH_REQUIRE(!param->allclose(params2[param.key])); } auto buffers1 = module->buffers(); auto buffers2 = module2->buffers(); - REQUIRE(buffers1.size() == 1); - REQUIRE(buffers2.size() == 1); + CATCH_REQUIRE(buffers1.size() == 1); + CATCH_REQUIRE(buffers2.size() == 1); for (auto& buffer : buffers1) { - REQUIRE(!pointer_equal(buffer.value, buffers2[buffer.key])); - REQUIRE(buffer->allclose(buffers2[buffer.key])); + CATCH_REQUIRE(!pointer_equal(buffer.value, buffers2[buffer.key])); + CATCH_REQUIRE(buffer->allclose(buffers2[buffer.key])); buffer->add_(2); } for (auto& buffer : buffers1) { - REQUIRE(!buffer->allclose(buffers2[buffer.key])); + CATCH_REQUIRE(!buffer->allclose(buffers2[buffer.key])); } } - SECTION("Cloning preserves external references") { + CATCH_SECTION("Cloning preserves external references") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -256,19 +256,19 @@ TEST_CASE("module/clone") { torch::NoGradGuard no_grad; module->weight += 1; } - REQUIRE(pointer_equal(module->weight, module->parameters()["weight"])); - 
REQUIRE(module->weight.allclose(module->parameters()["weight"])); + CATCH_REQUIRE(pointer_equal(module->weight, module->parameters()["weight"])); + CATCH_REQUIRE(module->weight.allclose(module->parameters()["weight"])); auto module2 = std::dynamic_pointer_cast( std::shared_ptr(module->clone())); - REQUIRE(!pointer_equal(module2->weight, module->weight)); - REQUIRE(pointer_equal(module2->weight, module2->parameters()["weight"])); - REQUIRE(module2->weight.allclose(module2->parameters()["weight"])); - REQUIRE(module2->weight.allclose(module->weight)); - REQUIRE(!pointer_equal(module2->weight, module->parameters()["weight"])); + CATCH_REQUIRE(!pointer_equal(module2->weight, module->weight)); + CATCH_REQUIRE(pointer_equal(module2->weight, module2->parameters()["weight"])); + CATCH_REQUIRE(module2->weight.allclose(module2->parameters()["weight"])); + CATCH_REQUIRE(module2->weight.allclose(module->weight)); + CATCH_REQUIRE(!pointer_equal(module2->weight, module->parameters()["weight"])); } - SECTION("Cloning copies the values of variables of submodules") { + CATCH_SECTION("Cloning copies the values of variables of submodules") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -299,16 +299,16 @@ TEST_CASE("module/clone") { auto b = std::dynamic_pointer_cast(a->clone()); - REQUIRE(!pointer_equal(b->module->weight, a->module->weight)); - REQUIRE( + CATCH_REQUIRE(!pointer_equal(b->module->weight, a->module->weight)); + CATCH_REQUIRE( pointer_equal(b->module->weight, b->module->parameters()["weight"])); - REQUIRE(b->module->parameters()["weight"].allclose(a->module->weight)); - REQUIRE(b->module->weight.allclose(a->module->weight)); - REQUIRE(b->module->value == a->module->value); + CATCH_REQUIRE(b->module->parameters()["weight"].allclose(a->module->weight)); + CATCH_REQUIRE(b->module->weight.allclose(a->module->weight)); + CATCH_REQUIRE(b->module->value == a->module->value); } } -TEST_CASE("module/clone-to-device", "[cuda]") { +CATCH_TEST_CASE("module/clone-to-device", "[cuda]") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -324,7 +324,7 @@ TEST_CASE("module/clone-to-device", "[cuda]") { torch::Tensor buffer; }; - SECTION("Cloning preserves the device of parameters/buffers") { + CATCH_SECTION("Cloning preserves the device of parameters/buffers") { TestModule m; torch::Device device(torch::kCUDA, 0); @@ -332,33 +332,33 @@ TEST_CASE("module/clone-to-device", "[cuda]") { auto clone = m.clone(); for (const auto& parameter : clone->parameters()) { - REQUIRE(parameter->device().type() == device.type()); - REQUIRE(parameter->device().index() == device.index()); + CATCH_REQUIRE(parameter->device().type() == device.type()); + CATCH_REQUIRE(parameter->device().index() == device.index()); } for (const auto& buffer : clone->buffers()) { - REQUIRE(buffer->device().type() == device.type()); - REQUIRE(buffer->device().index() == device.index()); + CATCH_REQUIRE(buffer->device().type() == device.type()); + CATCH_REQUIRE(buffer->device().index() == device.index()); } } - SECTION( + CATCH_SECTION( "Cloning to a particular device places all parameters/buffers there") { TestModule m; torch::Device device(torch::kCUDA, 1); // everything is on CPU here auto clone = m.clone(device); for (const auto& parameter : clone->parameters()) { - REQUIRE(parameter->device().type() == device.type()); - REQUIRE(parameter->device().index() == device.index()); + CATCH_REQUIRE(parameter->device().type() == device.type()); + CATCH_REQUIRE(parameter->device().index() == device.index()); } for (const 
auto& buffer : clone->buffers()) { - REQUIRE(buffer->device().type() == device.type()); - REQUIRE(buffer->device().index() == device.index()); + CATCH_REQUIRE(buffer->device().type() == device.type()); + CATCH_REQUIRE(buffer->device().index() == device.index()); } } } -TEST_CASE("module/parameters") { +CATCH_TEST_CASE("module/parameters") { torch::manual_seed(0); struct TestModule : Module { TestModule() { @@ -372,19 +372,19 @@ TEST_CASE("module/parameters") { TestModule module; - SECTION("has correct number of parameters") { - REQUIRE(module.parameters().size() == 3); + CATCH_SECTION("has correct number of parameters") { + CATCH_REQUIRE(module.parameters().size() == 3); } - SECTION("contains parameters with the correct name") { + CATCH_SECTION("contains parameters with the correct name") { auto parameters = module.parameters(); - REQUIRE(parameters.contains("a")); - REQUIRE(parameters.contains("b")); - REQUIRE(parameters.contains("c")); + CATCH_REQUIRE(parameters.contains("a")); + CATCH_REQUIRE(parameters.contains("b")); + CATCH_REQUIRE(parameters.contains("c")); } } -TEST_CASE("module/buffers") { +CATCH_TEST_CASE("module/buffers") { torch::manual_seed(0); struct TestModule : Module { TestModule() { @@ -398,19 +398,19 @@ TEST_CASE("module/buffers") { TestModule module; - SECTION("has correct number of buffers") { - REQUIRE(module.buffers().size() == 3); + CATCH_SECTION("has correct number of buffers") { + CATCH_REQUIRE(module.buffers().size() == 3); } - SECTION("contains buffers with the correct name") { + CATCH_SECTION("contains buffers with the correct name") { auto buffers = module.buffers(); - REQUIRE(buffers.contains("a")); - REQUIRE(buffers.contains("b")); - REQUIRE(buffers.contains("c")); + CATCH_REQUIRE(buffers.contains("a")); + CATCH_REQUIRE(buffers.contains("b")); + CATCH_REQUIRE(buffers.contains("c")); } } -TEST_CASE("module/default-constructor") { +CATCH_TEST_CASE("module/default-constructor") { struct AImpl : torch::nn::Module { AImpl() : x_(123) {} AImpl(int x) : x_(x) {} @@ -420,20 +420,20 @@ TEST_CASE("module/default-constructor") { { A a; - REQUIRE(a); - REQUIRE(!a.is_empty()); - REQUIRE(a->x_ == 123); + CATCH_REQUIRE(a); + CATCH_REQUIRE(!a.is_empty()); + CATCH_REQUIRE(a->x_ == 123); } { A a(5); - REQUIRE(a); - REQUIRE(!a.is_empty()); - REQUIRE(a->x_ == 5); + CATCH_REQUIRE(a); + CATCH_REQUIRE(!a.is_empty()); + CATCH_REQUIRE(a->x_ == 5); } { A a = nullptr; - REQUIRE(!a); - REQUIRE(a.is_empty()); - REQUIRE_THROWS_WITH(a->x_, StartsWith("Accessing empty ModuleHolder")); + CATCH_REQUIRE(!a); + CATCH_REQUIRE(a.is_empty()); + CATCH_REQUIRE_THROWS_WITH(a->x_, StartsWith("Accessing empty ModuleHolder")); } } diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 928a39f7d8b44a..7d4f9ab05fe93e 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -39,92 +39,92 @@ class NestedModel : public torch::nn::Module { std::shared_ptr t; }; -TEST_CASE("modules") { +CATCH_TEST_CASE("modules") { torch::manual_seed(0); - SECTION("conv") { - SECTION("1d") { + CATCH_SECTION("conv") { + CATCH_SECTION("1d") { Conv1d model(Conv1dOptions(3, 2, 3).stride(2)); auto x = torch::randn({2, 3, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 3); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 3); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 3; i++) { - REQUIRE(y.size(i) == 2); + 
CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3); } - SECTION("2d") { - SECTION("even") { + CATCH_SECTION("2d") { + CATCH_SECTION("even") { Conv2d model(Conv2dOptions(3, 2, 3).stride(2)); auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 4); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 4); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 4; i++) { - REQUIRE(y.size(i) == 2); + CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 3); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 3); } - SECTION("uneven") { + CATCH_SECTION("uneven") { Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2})); auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 4); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 4); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 4; i++) { - REQUIRE(y.size(i) == 2); + CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 2); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 2); } } - SECTION("3d") { + CATCH_SECTION("3d") { Conv3d model(Conv3dOptions(3, 2, 3).stride(2)); auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 5); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 5); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 5; i++) { - REQUIRE(y.size(i) == 2); + CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE( + CATCH_REQUIRE( model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 3 * 3); } } - SECTION("linear") { - SECTION("basic1") { + CATCH_SECTION("linear") { + CATCH_SECTION("basic1") { Linear model(5, 2); auto x = torch::randn({10, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); } } - SECTION("simple") { + CATCH_SECTION("simple") { auto model = std::make_shared(); auto l1 = model->add(Linear(10, 3), "l1"); auto l2 = model->add(Linear(3, 5), "l2"); @@ -136,20 +136,20 @@ TEST_CASE("modules") { x = l3->forward(x).clamp_min(0); x.backward(); - REQUIRE(x.ndimension() == 2); - REQUIRE(x.size(0) == 1000); - REQUIRE(x.size(1) == 100); - REQUIRE(x.min().toCFloat() == 0); + CATCH_REQUIRE(x.ndimension() == 2); + CATCH_REQUIRE(x.size(0) == 1000); + CATCH_REQUIRE(x.size(1) == 100); + CATCH_REQUIRE(x.min().toCFloat() == 0); } - SECTION("embedding") { - SECTION("basic") { + CATCH_SECTION("embedding") { + CATCH_SECTION("basic") { const int64_t dict_size = 10; Embedding model(dict_size, 2); - REQUIRE(model->parameters().contains("weight")); - REQUIRE(model->weight.ndimension() == 2); - REQUIRE(model->weight.size(0) == dict_size); - REQUIRE(model->weight.size(1) == 
2); + CATCH_REQUIRE(model->parameters().contains("weight")); + CATCH_REQUIRE(model->weight.ndimension() == 2); + CATCH_REQUIRE(model->weight.size(0) == dict_size); + CATCH_REQUIRE(model->weight.size(1) == 2); // Cannot get gradients to change indices (input) - only for embedding // params @@ -158,65 +158,65 @@ TEST_CASE("modules") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * dict_size); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * dict_size); } - SECTION("list") { + CATCH_SECTION("list") { Embedding model(6, 4); auto x = torch::full({2, 3}, 5, torch::kInt64); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 3); - REQUIRE(y.size(0) == 2); - REQUIRE(y.size(1) == 3); - REQUIRE(y.size(2) == 4); + CATCH_REQUIRE(y.ndimension() == 3); + CATCH_REQUIRE(y.size(0) == 2); + CATCH_REQUIRE(y.size(1) == 3); + CATCH_REQUIRE(y.size(2) == 4); } } - SECTION("dropout") { + CATCH_SECTION("dropout") { Dropout dropout(0.5); torch::Tensor x = torch::ones(100, torch::requires_grad()); torch::Tensor y = dropout->forward(x); y.backward(); - REQUIRE(y.ndimension() == 1); - REQUIRE(y.size(0) == 100); - REQUIRE(y.sum().toCFloat() < 130); // Probably - REQUIRE(y.sum().toCFloat() > 70); // Probably + CATCH_REQUIRE(y.ndimension() == 1); + CATCH_REQUIRE(y.size(0) == 100); + CATCH_REQUIRE(y.sum().toCFloat() < 130); // Probably + CATCH_REQUIRE(y.sum().toCFloat() > 70); // Probably dropout->eval(); y = dropout->forward(x); - REQUIRE(y.sum().toCFloat() == 100); + CATCH_REQUIRE(y.sum().toCFloat() == 100); } - SECTION("param") { + CATCH_SECTION("param") { auto model = std::make_shared(); auto parameters = model->parameters(); - REQUIRE(parameters["param"].size(0) == 3); - REQUIRE(parameters["param"].size(1) == 2); - REQUIRE(parameters["param"].size(2) == 21); - REQUIRE(parameters["l1.bias"].size(0) == 20); - REQUIRE(parameters["l1.weight"].size(0) == 20); - REQUIRE(parameters["l1.weight"].size(1) == 5); - REQUIRE(parameters["test.l1.bias"].size(0) == 3); - REQUIRE(parameters["test.l1.weight"].size(0) == 3); - REQUIRE(parameters["test.l1.weight"].size(1) == 10); - REQUIRE(parameters["test.l2.bias"].size(0) == 5); - REQUIRE(parameters["test.l2.weight"].size(0) == 5); - REQUIRE(parameters["test.l2.weight"].size(1) == 3); - REQUIRE(parameters["test.l3.bias"].size(0) == 100); - REQUIRE(parameters["test.l3.weight"].size(0) == 100); - REQUIRE(parameters["test.l3.weight"].size(1) == 5); + CATCH_REQUIRE(parameters["param"].size(0) == 3); + CATCH_REQUIRE(parameters["param"].size(1) == 2); + CATCH_REQUIRE(parameters["param"].size(2) == 21); + CATCH_REQUIRE(parameters["l1.bias"].size(0) == 20); + CATCH_REQUIRE(parameters["l1.weight"].size(0) == 20); + CATCH_REQUIRE(parameters["l1.weight"].size(1) == 5); + CATCH_REQUIRE(parameters["test.l1.bias"].size(0) == 3); + CATCH_REQUIRE(parameters["test.l1.weight"].size(0) == 3); + CATCH_REQUIRE(parameters["test.l1.weight"].size(1) == 10); + CATCH_REQUIRE(parameters["test.l2.bias"].size(0) == 5); + CATCH_REQUIRE(parameters["test.l2.weight"].size(0) == 5); + CATCH_REQUIRE(parameters["test.l2.weight"].size(1) == 3); + CATCH_REQUIRE(parameters["test.l3.bias"].size(0) == 100); + 
CATCH_REQUIRE(parameters["test.l3.weight"].size(0) == 100); + CATCH_REQUIRE(parameters["test.l3.weight"].size(1) == 5); } - SECTION("functional") { + CATCH_SECTION("functional") { { bool was_called = false; auto functional = Functional([&was_called](torch::Tensor input) { @@ -224,63 +224,63 @@ TEST_CASE("modules") { return input; }); auto output = functional->forward(torch::ones(5, torch::requires_grad())); - REQUIRE(was_called); - REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); + CATCH_REQUIRE(was_called); + CATCH_REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); was_called = false; // Use the call operator overload here. output = functional(torch::ones(5, torch::requires_grad())); - REQUIRE(was_called); - REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); + CATCH_REQUIRE(was_called); + CATCH_REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); } { auto functional = Functional(torch::relu); - REQUIRE(functional(torch::ones({})).toCFloat() == 1); - REQUIRE(functional(torch::ones({})).toCFloat() == 1); - REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); + CATCH_REQUIRE(functional(torch::ones({})).toCFloat() == 1); + CATCH_REQUIRE(functional(torch::ones({})).toCFloat() == 1); + CATCH_REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); - REQUIRE(functional(torch::ones({})).toCFloat() == 0); + CATCH_REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } - SECTION("batchnorm") { + CATCH_SECTION("batchnorm") { { BatchNorm bn(5); // Is stateful by default. - REQUIRE(bn->options.stateful()); + CATCH_REQUIRE(bn->options.stateful()); - REQUIRE(bn->running_mean.defined()); - REQUIRE(bn->running_mean.dim() == 1); - REQUIRE(bn->running_mean.size(0) == 5); + CATCH_REQUIRE(bn->running_mean.defined()); + CATCH_REQUIRE(bn->running_mean.dim() == 1); + CATCH_REQUIRE(bn->running_mean.size(0) == 5); - REQUIRE(bn->running_variance.defined()); - REQUIRE(bn->running_variance.dim() == 1); - REQUIRE(bn->running_variance.size(0) == 5); + CATCH_REQUIRE(bn->running_variance.defined()); + CATCH_REQUIRE(bn->running_variance.dim() == 1); + CATCH_REQUIRE(bn->running_variance.size(0) == 5); // Is affine by default. - REQUIRE(bn->options.affine()); + CATCH_REQUIRE(bn->options.affine()); - REQUIRE(bn->weight.defined()); - REQUIRE(bn->weight.dim() == 1); - REQUIRE(bn->weight.size(0) == 5); + CATCH_REQUIRE(bn->weight.defined()); + CATCH_REQUIRE(bn->weight.dim() == 1); + CATCH_REQUIRE(bn->weight.size(0) == 5); - REQUIRE(bn->bias.defined()); - REQUIRE(bn->bias.dim() == 1); - REQUIRE(bn->bias.size(0) == 5); + CATCH_REQUIRE(bn->bias.defined()); + CATCH_REQUIRE(bn->bias.dim() == 1); + CATCH_REQUIRE(bn->bias.size(0) == 5); } { BatchNorm bn(BatchNormOptions(5).stateful(false).affine(false)); - REQUIRE(!bn->running_mean.defined()); - REQUIRE(!bn->running_variance.defined()); - REQUIRE(!bn->weight.defined()); - REQUIRE(!bn->bias.defined()); + CATCH_REQUIRE(!bn->running_mean.defined()); + CATCH_REQUIRE(!bn->running_variance.defined()); + CATCH_REQUIRE(!bn->weight.defined()); + CATCH_REQUIRE(!bn->bias.defined()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( bn->forward(torch::ones({2, 5})), StartsWith("Calling BatchNorm::forward is only permitted " "when the 'stateful' option is true (was false). 
" @@ -298,14 +298,14 @@ TEST_CASE("modules") { auto output = bn->pure_forward(input, mean, variance); auto expected = (input - mean) / torch::sqrt(variance + bn->options.eps()); - REQUIRE(output.allclose(expected)); + CATCH_REQUIRE(output.allclose(expected)); } } } -TEST_CASE("modules_cuda", "[cuda]") { +CATCH_TEST_CASE("modules_cuda", "[cuda]") { torch::manual_seed(0); - SECTION("1") { + CATCH_SECTION("1") { Linear model(5, 2); model->to(torch::kCUDA); auto x = @@ -314,15 +314,15 @@ TEST_CASE("modules_cuda", "[cuda]") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); } - SECTION("2") { + CATCH_SECTION("2") { Linear model(5, 2); model->to(torch::kCUDA); model->to(torch::kCPU); @@ -331,11 +331,11 @@ TEST_CASE("modules_cuda", "[cuda]") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); } } diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp index ab278180b12b34..4cb398dd89dc82 100644 --- a/test/cpp/api/optim.cpp +++ b/test/cpp/api/optim.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -118,24 +118,24 @@ void check_exact_values( optimizer.step(); if (i % kSampleEvery == 0) { - REQUIRE( + CATCH_REQUIRE( expected_parameters.at(i / kSampleEvery).size() == parameters.size()); for (size_t p = 0; p < parameters.size(); ++p) { - REQUIRE(parameters.at(p)->defined()); + CATCH_REQUIRE(parameters.at(p)->defined()); auto computed = parameters.at(p)->flatten(); auto expected = expected_parameters.at(i / kSampleEvery).at(p); if (!computed.allclose(expected, /*rtol=*/1e-3, /*atol=*/5e-4)) { std::cout << "Iteration " << i << ": " << computed << " != " << expected << " (parameter " << p << ")" << std::endl; - REQUIRE(false); + CATCH_REQUIRE(false); } } } } } -TEST_CASE("Optim/BasicInterface") { +CATCH_TEST_CASE("Optim/BasicInterface") { struct MyOptimizer : Optimizer { using Optimizer::Optimizer; void step() override {} @@ -144,139 +144,139 @@ TEST_CASE("Optim/BasicInterface") { torch::ones({2, 3}), torch::zeros({2, 3}), torch::rand({2, 3})}; { MyOptimizer optimizer(parameters); - REQUIRE(optimizer.size() == parameters.size()); + CATCH_REQUIRE(optimizer.size() == parameters.size()); } { MyOptimizer optimizer; - REQUIRE(optimizer.size() == 0); + CATCH_REQUIRE(optimizer.size() == 0); optimizer.add_parameters(parameters); - REQUIRE(optimizer.size() == parameters.size()); + CATCH_REQUIRE(optimizer.size() == parameters.size()); for (size_t p = 0; p < parameters.size(); ++p) { - REQUIRE(optimizer.parameters()[p].allclose(parameters[p])); + CATCH_REQUIRE(optimizer.parameters()[p].allclose(parameters[p])); } } { Linear linear(3, 4); MyOptimizer optimizer(linear->parameters()); - REQUIRE(optimizer.size() == linear->parameters().size()); + CATCH_REQUIRE(optimizer.size() == linear->parameters().size()); } } 
-TEST_CASE("Optim/XORConvergence/SGD") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/SGD") { + CATCH_REQUIRE(test_optimizer_xor( SGDOptions(0.1).momentum(0.9).nesterov(true).weight_decay(1e-6))); } -TEST_CASE("Optim/XORConvergence/Adagrad") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/Adagrad") { + CATCH_REQUIRE(test_optimizer_xor( AdagradOptions(1.0).weight_decay(1e-6).lr_decay(1e-3))); } -TEST_CASE("Optim/XORConvergence/RMSprop") { - REQUIRE(test_optimizer_xor(RMSpropOptions(0.1).centered(true))); +CATCH_TEST_CASE("Optim/XORConvergence/RMSprop") { + CATCH_REQUIRE(test_optimizer_xor(RMSpropOptions(0.1).centered(true))); } -TEST_CASE("Optim/XORConvergence/RMSpropWithMomentum") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/RMSpropWithMomentum") { + CATCH_REQUIRE(test_optimizer_xor( RMSpropOptions(0.1).momentum(0.9).weight_decay(1e-6))); } -TEST_CASE("Optim/XORConvergence/Adam") { - REQUIRE(test_optimizer_xor(AdamOptions(0.1).weight_decay(1e-6))); +CATCH_TEST_CASE("Optim/XORConvergence/Adam") { + CATCH_REQUIRE(test_optimizer_xor(AdamOptions(0.1).weight_decay(1e-6))); } -TEST_CASE("Optim/XORConvergence/AdamWithAmsgrad") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/AdamWithAmsgrad") { + CATCH_REQUIRE(test_optimizer_xor( AdamOptions(0.1).weight_decay(1e-6).amsgrad(true))); } -TEST_CASE("Optim/ProducesPyTorchValues/Adam") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/Adam") { check_exact_values(AdamOptions(1.0), expected_parameters::Adam); } -TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecay") { check_exact_values( AdamOptions(1.0).weight_decay(1e-2), expected_parameters::Adam_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecayAndAMSGrad") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecayAndAMSGrad") { check_exact_values( AdamOptions(1.0).weight_decay(1e-6).amsgrad(true), expected_parameters::Adam_with_weight_decay_and_amsgrad); } -TEST_CASE("Optim/ProducesPyTorchValues/Adagrad") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/Adagrad") { check_exact_values( AdagradOptions(1.0), expected_parameters::Adagrad); } -TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecay") { check_exact_values( AdagradOptions(1.0).weight_decay(1e-2), expected_parameters::Adagrad_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecayAndLRDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecayAndLRDecay") { check_exact_values( AdagradOptions(1.0).weight_decay(1e-6).lr_decay(1e-3), expected_parameters::Adagrad_with_weight_decay_and_lr_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/RMSprop") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/RMSprop") { check_exact_values( RMSpropOptions(0.1), expected_parameters::RMSprop); } -TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecay") { check_exact_values( RMSpropOptions(0.1).weight_decay(1e-2), expected_parameters::RMSprop_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecayAndCentered") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecayAndCentered") { check_exact_values( RMSpropOptions(0.1).weight_decay(1e-6).centered(true), 
expected_parameters::RMSprop_with_weight_decay_and_centered); } -TEST_CASE( +CATCH_TEST_CASE( "Optim/ProducesPyTorchValues/RMSpropWithWeightDecayAndCenteredAndMomentum") { check_exact_values( RMSpropOptions(0.1).weight_decay(1e-6).centered(true).momentum(0.9), expected_parameters::RMSprop_with_weight_decay_and_centered_and_momentum); } -TEST_CASE("Optim/ProducesPyTorchValues/SGD") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGD") { check_exact_values(SGDOptions(0.1), expected_parameters::SGD); } -TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecay") { check_exact_values( SGDOptions(0.1).weight_decay(1e-2), expected_parameters::SGD_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndMomentum") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndMomentum") { check_exact_values( SGDOptions(0.1).weight_decay(1e-2).momentum(0.9), expected_parameters::SGD_with_weight_decay_and_momentum); } -TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndNesterovMomentum") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndNesterovMomentum") { check_exact_values( SGDOptions(0.1).weight_decay(1e-6).momentum(0.9).nesterov(true), expected_parameters::SGD_with_weight_decay_and_nesterov_momentum); } -TEST_CASE("Optim/ZeroGrad") { +CATCH_TEST_CASE("Optim/ZeroGrad") { torch::manual_seed(0); Linear model(2, 8); SGD optimizer(model->parameters(), 0.1); for (const auto& parameter : model->parameters()) { - REQUIRE(!parameter->grad().defined()); + CATCH_REQUIRE(!parameter->grad().defined()); } auto output = model->forward(torch::ones({5, 2})); @@ -284,19 +284,19 @@ TEST_CASE("Optim/ZeroGrad") { loss.backward(); for (const auto& parameter : model->parameters()) { - REQUIRE(parameter->grad().defined()); - REQUIRE(parameter->grad().sum().toCFloat() > 0); + CATCH_REQUIRE(parameter->grad().defined()); + CATCH_REQUIRE(parameter->grad().sum().toCFloat() > 0); } optimizer.zero_grad(); for (const auto& parameter : model->parameters()) { - REQUIRE(parameter->grad().defined()); - REQUIRE(parameter->grad().sum().toCFloat() == 0); + CATCH_REQUIRE(parameter->grad().defined()); + CATCH_REQUIRE(parameter->grad().sum().toCFloat() == 0); } } -TEST_CASE("Optim/ExternalVectorOfParameters") { +CATCH_TEST_CASE("Optim/ExternalVectorOfParameters") { torch::manual_seed(0); std::vector parameters = { @@ -313,12 +313,12 @@ TEST_CASE("Optim/ExternalVectorOfParameters") { optimizer.step(); - REQUIRE(parameters[0].allclose(original_parameters[0] - 1.0)); - REQUIRE(parameters[1].allclose(original_parameters[1] - 1.0)); - REQUIRE(parameters[2].allclose(original_parameters[2] - 1.0)); + CATCH_REQUIRE(parameters[0].allclose(original_parameters[0] - 1.0)); + CATCH_REQUIRE(parameters[1].allclose(original_parameters[1] - 1.0)); + CATCH_REQUIRE(parameters[2].allclose(original_parameters[2] - 1.0)); } -TEST_CASE("Optim/AddParameter/LBFGS") { +CATCH_TEST_CASE("Optim/AddParameter/LBFGS") { torch::manual_seed(0); std::vector parameters = {torch::randn({5, 5})}; diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp index a1517586fc9718..33e3a16d9a8040 100644 --- a/test/cpp/api/parallel.cpp +++ b/test/cpp/api/parallel.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -19,92 +19,92 @@ using namespace torch::nn; #ifdef USE_CUDA -TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { Scatter 
scatter( {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); auto input = torch::ones(10, torch::requires_grad(true)); auto output = scatter.apply({input}); - REQUIRE(output.size() == 2); - REQUIRE(output[0].size(0) == 5); - REQUIRE(output[1].size(0) == 5); + CATCH_REQUIRE(output.size() == 2); + CATCH_REQUIRE(output[0].size(0) == 5); + CATCH_REQUIRE(output[1].size(0) == 5); - REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)}) + CATCH_REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)}) .allclose(input)); auto sum = output[0].to({torch::kCUDA, 1}) + output[1]; sum.backward(); - REQUIRE(input.grad().defined()); - REQUIRE(input.grad().device().is_cpu()); - REQUIRE(input.grad().sum().toCInt() == 10); + CATCH_REQUIRE(input.grad().defined()); + CATCH_REQUIRE(input.grad().device().is_cpu()); + CATCH_REQUIRE(input.grad().sum().toCInt() == 10); } -TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") { Gather gather(torch::Device(torch::kCUDA, 1)); auto a = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 0})); auto b = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 1})); auto outputs = gather.apply({a, b}); - REQUIRE(outputs.size() == 1); + CATCH_REQUIRE(outputs.size() == 1); auto& output = outputs.front(); - REQUIRE(output.size(0) == 10); - REQUIRE(output.device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(output.size(0) == 10); + CATCH_REQUIRE(output.device() == torch::Device(torch::kCUDA, 1)); auto chunks = output.chunk(2); - REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a)); - REQUIRE(chunks[1].allclose(b)); + CATCH_REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a)); + CATCH_REQUIRE(chunks[1].allclose(b)); output.backward(); - REQUIRE(a.grad().defined()); - REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0)); - REQUIRE(a.grad().sum().toCInt() == 5); + CATCH_REQUIRE(a.grad().defined()); + CATCH_REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(a.grad().sum().toCInt() == 5); - REQUIRE(b.grad().defined()); - REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1)); - REQUIRE(b.grad().sum().toCInt() == 5); + CATCH_REQUIRE(b.grad().defined()); + CATCH_REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(b.grad().sum().toCInt() == 5); } -TEST_CASE("Parallel/Replicate", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/Replicate", "[multi-cuda]") { Linear linear(3, 4); auto replicas = parallel::replicate( linear, {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); - REQUIRE(replicas.size() == 2); + CATCH_REQUIRE(replicas.size() == 2); auto original_parameters = linear->parameters(); auto replica1_parameters = replicas[0]->parameters(); for (auto& parameter : replica1_parameters) { - REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0)); } replicas[0]->to(torch::kCPU); - REQUIRE(replica1_parameters.size() == original_parameters.size()); + CATCH_REQUIRE(replica1_parameters.size() == original_parameters.size()); for (size_t i = 0; i < original_parameters.size(); ++i) { - REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i])); - REQUIRE( + CATCH_REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i])); + CATCH_REQUIRE( replica1_parameters[i].data() != original_parameters[i].data()); } auto replica2_parameters = replicas[1]->parameters(); for (auto& 
parameter : replica2_parameters) { - REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1)); } replicas[1]->to(torch::kCPU); - REQUIRE(replica2_parameters.size() == original_parameters.size()); + CATCH_REQUIRE(replica2_parameters.size() == original_parameters.size()); for (size_t i = 0; i < original_parameters.size(); ++i) { - REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i])); - REQUIRE( + CATCH_REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i])); + CATCH_REQUIRE( replica2_parameters[i].data() != original_parameters[i].data()); } } -TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { Linear a(3, 4); Linear b(std::static_pointer_cast(a->clone())); @@ -121,17 +121,17 @@ TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { auto outputs = parallel::parallel_apply(modules, inputs); - REQUIRE(outputs.size() == 3); - REQUIRE(outputs[0].device().is_cpu()); + CATCH_REQUIRE(outputs.size() == 3); + CATCH_REQUIRE(outputs[0].device().is_cpu()); - REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); - REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0])); + CATCH_REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0])); - REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1)); - REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0])); + CATCH_REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0])); } -TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { struct M : torch::nn::Module { torch::Tensor forward(torch::Tensor input) { return torch::ones({5}, torch::dtype(torch::kInt32)); @@ -147,17 +147,17 @@ TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { auto outputs = parallel::parallel_apply(modules, inputs, devices); - REQUIRE(outputs.size() == 3); - REQUIRE(outputs[0].device().is_cuda()); - REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(outputs.size() == 3); + CATCH_REQUIRE(outputs[0].device().is_cuda()); + CATCH_REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1)); - REQUIRE(outputs[1].device().is_cuda()); - REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(outputs[1].device().is_cuda()); + CATCH_REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); - REQUIRE(outputs[2].device().is_cpu()); + CATCH_REQUIRE(outputs[2].device().is_cpu()); } -TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { struct M : torch::nn::Cloneable { void reset() override {} torch::Tensor forward(torch::Tensor input) { @@ -167,11 +167,11 @@ TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { auto m = std::make_shared(); auto input = torch::ones({10, 3}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( parallel::data_parallel(m, input), StartsWith("Badness!")); } -TEST_CASE( +CATCH_TEST_CASE( "Parallel/DataParallelPlacesTheOutputOnTheRequestedDevice", "[multi-cuda]") { struct M : torch::nn::Cloneable { @@ -192,9 +192,9 @@ TEST_CASE( input, /*devices=*/at::nullopt, /*output_device=*/torch::Device(torch::kCUDA, 1)); - REQUIRE(output.defined()); - 
-    REQUIRE(output.device().is_cuda());
-    REQUIRE(output.device().index() == 1);
+    CATCH_REQUIRE(output.defined());
+    CATCH_REQUIRE(output.device().is_cuda());
+    CATCH_REQUIRE(output.device().index() == 1);
   }
   {
     // Verify for the single-device case (where we don't scatter/gather).
@@ -203,16 +203,16 @@ TEST_CASE(
         input,
         /*devices=*/std::vector{torch::Device(torch::kCUDA, 0)},
         /*output_device=*/torch::Device(torch::kCUDA, 1));
-    REQUIRE(m->intermediate_tensor.defined());
-    REQUIRE(m->intermediate_tensor.device().is_cuda());
-    REQUIRE(m->intermediate_tensor.device().index() == 0);
-    REQUIRE(output.defined());
-    REQUIRE(output.device().is_cuda());
-    REQUIRE(output.device().index() == 1);
+    CATCH_REQUIRE(m->intermediate_tensor.defined());
+    CATCH_REQUIRE(m->intermediate_tensor.device().is_cuda());
+    CATCH_REQUIRE(m->intermediate_tensor.device().index() == 0);
+    CATCH_REQUIRE(output.defined());
+    CATCH_REQUIRE(output.device().is_cuda());
+    CATCH_REQUIRE(output.device().index() == 1);
   }
 }
-TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
+CATCH_TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
   struct M : torch::nn::Cloneable {
     void reset() override {}
     torch::Tensor forward(torch::Tensor input) {
@@ -225,9 +225,9 @@ TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
   auto output = parallel::data_parallel(m, input);
   const auto device_count = torch::cuda::device_count();
-  REQUIRE(output.numel() == device_count);
+  CATCH_REQUIRE(output.numel() == device_count);
   for (size_t i = 0; i < device_count; ++i) {
-    REQUIRE(output[i].toCInt() == i);
+    CATCH_REQUIRE(output[i].toCInt() == i);
   }
 }
diff --git a/test/cpp/api/rnn.cpp b/test/cpp/api/rnn.cpp
index 96685728484a39..a307851afbae46 100644
--- a/test/cpp/api/rnn.cpp
+++ b/test/cpp/api/rnn.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
@@ -71,22 +71,22 @@ void check_lstm_sizes(RNNOutput output) {
   // Expect the LSTM to have 64 outputs and 3 layers, with an input of batch
   // 10 and 16 time steps (10 x 16 x n)
-  REQUIRE(output.output.ndimension() == 3);
-  REQUIRE(output.output.size(0) == 10);
-  REQUIRE(output.output.size(1) == 16);
-  REQUIRE(output.output.size(2) == 64);
+  CATCH_REQUIRE(output.output.ndimension() == 3);
+  CATCH_REQUIRE(output.output.size(0) == 10);
+  CATCH_REQUIRE(output.output.size(1) == 16);
+  CATCH_REQUIRE(output.output.size(2) == 64);
-  REQUIRE(output.state.ndimension() == 4);
-  REQUIRE(output.state.size(0) == 2); // (hx, cx)
-  REQUIRE(output.state.size(1) == 3); // layers
-  REQUIRE(output.state.size(2) == 16); // Batchsize
-  REQUIRE(output.state.size(3) == 64); // 64 hidden dims
+  CATCH_REQUIRE(output.state.ndimension() == 4);
+  CATCH_REQUIRE(output.state.size(0) == 2); // (hx, cx)
+  CATCH_REQUIRE(output.state.size(1) == 3); // layers
+  CATCH_REQUIRE(output.state.size(2) == 16); // Batchsize
+  CATCH_REQUIRE(output.state.size(3) == 64); // 64 hidden dims
   // Something is in the hiddens
-  REQUIRE(output.state.norm().toCFloat() > 0);
+  CATCH_REQUIRE(output.state.norm().toCFloat() > 0);
 }
-TEST_CASE("RNN/CheckOutputSizes") {
+CATCH_TEST_CASE("RNN/CheckOutputSizes") {
   torch::manual_seed(0);
   LSTM model(LSTMOptions(128, 64).layers(3).dropout(0.2));
   // Input size is: sequence length, batch size, input size
@@ -104,10 +104,10 @@ TEST_CASE("RNN/CheckOutputSizes") {
   torch::Tensor diff = next.state - output.state;
   // Hiddens changed
-  REQUIRE(diff.abs().sum().toCFloat() > 1e-3);
+  CATCH_REQUIRE(diff.abs().sum().toCFloat() > 1e-3);
 }
-TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { +CATCH_TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { torch::manual_seed(0); // Make sure the outputs match pytorch outputs LSTM model(2, 2); @@ -127,10 +127,10 @@ TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { } auto out = model->forward(x); - REQUIRE(out.output.ndimension() == 3); - REQUIRE(out.output.size(0) == 3); - REQUIRE(out.output.size(1) == 4); - REQUIRE(out.output.size(2) == 2); + CATCH_REQUIRE(out.output.ndimension() == 3); + CATCH_REQUIRE(out.output.size(0) == 3); + CATCH_REQUIRE(out.output.size(1) == 4); + CATCH_REQUIRE(out.output.size(2) == 2); auto flat = out.output.view(3 * 4 * 2); float c_out[] = {0.4391, 0.5402, 0.4330, 0.5324, 0.4261, 0.5239, @@ -138,14 +138,14 @@ TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { 0.6620, 0.7860, 0.6501, 0.7741, 0.7889, 0.9003, 0.7769, 0.8905, 0.7635, 0.8794, 0.7484, 0.8666}; for (size_t i = 0; i < 3 * 4 * 2; i++) { - REQUIRE(std::abs(flat[i].toCFloat() - c_out[i]) < 1e-3); + CATCH_REQUIRE(std::abs(flat[i].toCFloat() - c_out[i]) < 1e-3); } - REQUIRE(out.state.ndimension() == 4); // (hx, cx) x layers x B x 2 - REQUIRE(out.state.size(0) == 2); - REQUIRE(out.state.size(1) == 1); - REQUIRE(out.state.size(2) == 4); - REQUIRE(out.state.size(3) == 2); + CATCH_REQUIRE(out.state.ndimension() == 4); // (hx, cx) x layers x B x 2 + CATCH_REQUIRE(out.state.size(0) == 2); + CATCH_REQUIRE(out.state.size(1) == 1); + CATCH_REQUIRE(out.state.size(2) == 4); + CATCH_REQUIRE(out.state.size(3) == 2); flat = out.state.view(16); float h_out[] = {0.7889, 0.9003, @@ -164,33 +164,33 @@ TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { 1.0931, 1.4911}; for (size_t i = 0; i < 16; i++) { - REQUIRE(std::abs(flat[i].toCFloat() - h_out[i]) < 1e-3); + CATCH_REQUIRE(std::abs(flat[i].toCFloat() - h_out[i]) < 1e-3); } } -TEST_CASE("RNN/integration/LSTM") { - REQUIRE(test_RNN_xor( +CATCH_TEST_CASE("RNN/integration/LSTM") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return LSTM(LSTMOptions(s, s).layers(2)); })); } -TEST_CASE("RNN/integration/GRU") { - REQUIRE( +CATCH_TEST_CASE("RNN/integration/GRU") { + CATCH_REQUIRE( test_RNN_xor([](int s) { return GRU(GRUOptions(s, s).layers(2)); })); } -TEST_CASE("RNN/integration/RNN") { - SECTION("relu") { - REQUIRE(test_RNN_xor( +CATCH_TEST_CASE("RNN/integration/RNN") { + CATCH_SECTION("relu") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return RNN(RNNOptions(s, s).relu().layers(2)); })); } - SECTION("tanh") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("tanh") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return RNN(RNNOptions(s, s).tanh().layers(2)); })); } } -TEST_CASE("rnn_cuda", "[cuda]") { - SECTION("sizes") { +CATCH_TEST_CASE("rnn_cuda", "[cuda]") { + CATCH_SECTION("sizes") { torch::manual_seed(0); LSTM model(LSTMOptions(128, 64).layers(3).dropout(0.2)); model->to(torch::kCUDA); @@ -209,26 +209,26 @@ TEST_CASE("rnn_cuda", "[cuda]") { torch::Tensor diff = next.state - output.state; // Hiddens changed - REQUIRE(diff.abs().sum().toCFloat() > 1e-3); + CATCH_REQUIRE(diff.abs().sum().toCFloat() > 1e-3); } - SECTION("lstm") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("lstm") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return LSTM(LSTMOptions(s, s).layers(2)); }, true)); } - SECTION("gru") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("gru") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return GRU(GRUOptions(s, s).layers(2)); }, true)); } - SECTION("rnn") { - SECTION("relu") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("rnn") { + CATCH_SECTION("relu") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return 
          [](int s) { return RNN(RNNOptions(s, s).relu().layers(2)); }, true));
    }
-    SECTION("tanh") {
-      REQUIRE(test_RNN_xor(
+    CATCH_SECTION("tanh") {
+      CATCH_REQUIRE(test_RNN_xor(
          [](int s) { return RNN(RNNOptions(s, s).tanh().layers(2)); }, true));
    }
  }
diff --git a/test/cpp/api/sequential.cpp b/test/cpp/api/sequential.cpp
index aef1332720e4a4..777d6e28024b9b 100644
--- a/test/cpp/api/sequential.cpp
+++ b/test/cpp/api/sequential.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
@@ -21,7 +21,7 @@ using namespace torch::test;
 using Catch::StartsWith;
-TEST_CASE("Sequential/ConstructsFromSharedPointer") {
+CATCH_TEST_CASE("Sequential/ConstructsFromSharedPointer") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int value;
@@ -31,10 +31,10 @@ TEST_CASE("Sequential/ConstructsFromSharedPointer") {
   };
   Sequential sequential(
       std::make_shared(1), std::make_shared(2), std::make_shared(3));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/ConstructsFromConcreteType") {
+CATCH_TEST_CASE("Sequential/ConstructsFromConcreteType") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int value;
@@ -44,9 +44,9 @@ TEST_CASE("Sequential/ConstructsFromConcreteType") {
   };
   Sequential sequential(M(1), M(2), M(3));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/ConstructsFromModuleHolder") {
+CATCH_TEST_CASE("Sequential/ConstructsFromModuleHolder") {
   struct MImpl : torch::nn::Module {
     explicit MImpl(int value_) : value(value_) {}
     int forward() {
@@ -61,10 +61,10 @@ TEST_CASE("Sequential/ConstructsFromModuleHolder") {
   };
   Sequential sequential(M(1), M(2), M(3));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/PushBackAddsAnElement") {
+CATCH_TEST_CASE("Sequential/PushBackAddsAnElement") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int forward() {
@@ -73,17 +73,17 @@ TEST_CASE("Sequential/PushBackAddsAnElement") {
     int value;
   };
   Sequential sequential;
-  REQUIRE(sequential->size() == 0);
-  REQUIRE(sequential->is_empty());
+  CATCH_REQUIRE(sequential->size() == 0);
+  CATCH_REQUIRE(sequential->is_empty());
   sequential->push_back(Linear(3, 4));
-  REQUIRE(sequential->size() == 1);
+  CATCH_REQUIRE(sequential->size() == 1);
   sequential->push_back(std::make_shared(1));
-  REQUIRE(sequential->size() == 2);
+  CATCH_REQUIRE(sequential->size() == 2);
   sequential->push_back(M(2));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/AccessWithAt") {
+CATCH_TEST_CASE("Sequential/AccessWithAt") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int forward() {
@@ -98,22 +98,22 @@ TEST_CASE("Sequential/AccessWithAt") {
   for (auto& module : modules) {
     sequential->push_back(module);
   }
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
   // returns the correct module for a given index
   for (size_t i = 0; i < modules.size(); ++i) {
-    REQUIRE(&sequential->at(i) == modules[i].get());
+    CATCH_REQUIRE(&sequential->at(i) == modules[i].get());
   }
   // throws for a bad index
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
       sequential->at(modules.size() + 1), StartsWith("Index out of range"));
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
       sequential->at(modules.size() + 1000000),
       StartsWith("Index out of range"));
 }
-TEST_CASE("Sequential/AccessWithPtr") {
+CATCH_TEST_CASE("Sequential/AccessWithPtr") {
   struct M : torch::nn::Module {
    explicit M(int value_) : value(value_) {}
    int forward() {
@@ -128,46 +128,46 @@ TEST_CASE("Sequential/AccessWithPtr") {
   for (auto& module : modules) {
     sequential->push_back(module);
   }
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
   // returns the correct module for a given index
   for (size_t i = 0; i < modules.size(); ++i) {
-    REQUIRE(sequential->ptr(i).get() == modules[i].get());
-    REQUIRE(sequential[i].get() == modules[i].get());
-    REQUIRE(sequential->ptr(i).get() == modules[i].get());
+    CATCH_REQUIRE(sequential->ptr(i).get() == modules[i].get());
+    CATCH_REQUIRE(sequential[i].get() == modules[i].get());
+    CATCH_REQUIRE(sequential->ptr(i).get() == modules[i].get());
   }
   // throws for a bad index
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
      sequential->ptr(modules.size() + 1), StartsWith("Index out of range"));
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
      sequential->ptr(modules.size() + 1000000),
      StartsWith("Index out of range"));
 }
-TEST_CASE("Sequential/CallingForwardOnEmptySequentialIsDisallowed") {
+CATCH_TEST_CASE("Sequential/CallingForwardOnEmptySequentialIsDisallowed") {
   Sequential empty;
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
      empty->forward(),
      StartsWith("Cannot call forward() on an empty Sequential"));
 }
-TEST_CASE("Sequential/CallingForwardChainsCorrectly") {
+CATCH_TEST_CASE("Sequential/CallingForwardChainsCorrectly") {
   struct MockModule : torch::nn::Module {
     explicit MockModule(int value) : expected(value) {}
     int expected;
     int forward(int value) {
-      REQUIRE(value == expected);
+      CATCH_REQUIRE(value == expected);
       return value + 1;
     }
   };
   Sequential sequential(MockModule{1}, MockModule{2}, MockModule{3});
-  REQUIRE(sequential->forward(1) == 4);
+  CATCH_REQUIRE(sequential->forward(1) == 4);
 }
-TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") {
+CATCH_TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") {
   struct M : public torch::nn::Module {
     int forward() {
       return 5;
@@ -175,14 +175,14 @@ TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") {
   };
   Sequential sequential(M{});
-  REQUIRE(sequential->forward() == 5);
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE(sequential->forward() == 5);
+  CATCH_REQUIRE_THROWS_WITH(
      sequential->forward(),
      StartsWith("The type of the return value "
                 "is int, but you asked for type float"));
 }
-TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") {
+CATCH_TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") {
   struct M : public torch::nn::Module {
     torch::Tensor forward(torch::Tensor v) {
       return v;
@@ -191,21 +191,21 @@ TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") {
   Sequential sequential(M{});
   auto variable = torch::ones({3, 3}, torch::requires_grad());
-  REQUIRE(sequential->forward(variable).equal(variable));
+  CATCH_REQUIRE(sequential->forward(variable).equal(variable));
 }
-TEST_CASE("Sequential/ForwardReturnsTheLastValue") {
+CATCH_TEST_CASE("Sequential/ForwardReturnsTheLastValue") {
   torch::manual_seed(0);
   Sequential sequential(Linear(10, 3), Linear(3, 5), Linear(5, 100));
   auto x = torch::randn({1000, 10}, torch::requires_grad());
   auto y = sequential->forward(x);
-  REQUIRE(y.ndimension() == 2);
-  REQUIRE(y.size(0) == 1000);
-  REQUIRE(y.size(1) == 100);
+  CATCH_REQUIRE(y.ndimension() == 2);
+  CATCH_REQUIRE(y.size(0) == 1000);
+  CATCH_REQUIRE(y.size(1) == 100);
 }
-TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") {
+CATCH_TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") {
   Sequential sequential(
      Linear(10, 3),
      Conv2d(1, 2, 3),
@@ -215,7 +215,7 @@ TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") {
      LSTM(4, 5));
 }
-TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") {
+CATCH_TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") {
   struct A : torch::nn::Module {
     int forward(int x) {
       return x;
@@ -240,34 +240,34 @@ TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") {
   Sequential b(C{}, D{});
   a->extend(*b);
-  REQUIRE(a->size() == 4);
-  REQUIRE(a[0]->as());
-  REQUIRE(a[1]->as());
-  REQUIRE(a[2]->as());
-  REQUIRE(a[3]->as());
+  CATCH_REQUIRE(a->size() == 4);
+  CATCH_REQUIRE(a[0]->as());
+  CATCH_REQUIRE(a[1]->as());
+  CATCH_REQUIRE(a[2]->as());
+  CATCH_REQUIRE(a[3]->as());
-  REQUIRE(b->size() == 2);
-  REQUIRE(b[0]->as());
-  REQUIRE(b[1]->as());
+  CATCH_REQUIRE(b->size() == 2);
+  CATCH_REQUIRE(b[0]->as());
+  CATCH_REQUIRE(b[1]->as());
   std::vector> c = {std::make_shared(), std::make_shared()};
   b->extend(c);
-  REQUIRE(b->size() == 4);
-  REQUIRE(b[0]->as());
-  REQUIRE(b[1]->as());
-  REQUIRE(b[2]->as());
-  REQUIRE(b[3]->as());
+  CATCH_REQUIRE(b->size() == 4);
+  CATCH_REQUIRE(b[0]->as());
+  CATCH_REQUIRE(b[1]->as());
+  CATCH_REQUIRE(b[2]->as());
+  CATCH_REQUIRE(b[3]->as());
 }
-TEST_CASE("Sequential/HasReferenceSemantics") {
+CATCH_TEST_CASE("Sequential/HasReferenceSemantics") {
   Sequential first(Linear(2, 3), Linear(4, 4), Linear(4, 5));
   Sequential second(first);
-  REQUIRE(first.get() == second.get());
-  REQUIRE(first->size() == second->size());
-  REQUIRE(std::equal(
+  CATCH_REQUIRE(first.get() == second.get());
+  CATCH_REQUIRE(first->size() == second->size());
+  CATCH_REQUIRE(std::equal(
      first->begin(),
      first->end(),
     second->begin(),
@@ -276,17 +276,17 @@ TEST_CASE("Sequential/HasReferenceSemantics") {
      }));
 }
-TEST_CASE("Sequential/IsCloneable") {
+CATCH_TEST_CASE("Sequential/IsCloneable") {
   Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3));
   Sequential clone =
      std::dynamic_pointer_cast(sequential->clone());
-  REQUIRE(sequential->size() == clone->size());
+  CATCH_REQUIRE(sequential->size() == clone->size());
   for (size_t i = 0; i < sequential->size(); ++i) {
     // The modules should be the same kind (type).
-    REQUIRE(sequential[i]->name() == clone[i]->name());
+    CATCH_REQUIRE(sequential[i]->name() == clone[i]->name());
     // But not pointer-equal (distinct objects).
-    REQUIRE(sequential[i] != clone[i]);
+    CATCH_REQUIRE(sequential[i] != clone[i]);
   }
   // Verify that the clone is deep, i.e. parameters of modules are cloned too.
@@ -295,38 +295,38 @@ TEST_CASE("Sequential/IsCloneable") {
   auto params1 = sequential->parameters();
   auto params2 = clone->parameters();
-  REQUIRE(params1.size() == params2.size());
+  CATCH_REQUIRE(params1.size() == params2.size());
   for (auto& param : params1) {
-    REQUIRE(!pointer_equal(param.value, params2[param.key]));
-    REQUIRE(param->device() == params2[param.key].device());
-    REQUIRE(param->allclose(params2[param.key]));
+    CATCH_REQUIRE(!pointer_equal(param.value, params2[param.key]));
+    CATCH_REQUIRE(param->device() == params2[param.key].device());
+    CATCH_REQUIRE(param->allclose(params2[param.key]));
     param->add_(2);
   }
   for (auto& param : params1) {
-    REQUIRE(!param->allclose(params2[param.key]));
+    CATCH_REQUIRE(!param->allclose(params2[param.key]));
   }
 }
-TEST_CASE("Sequential/RegistersElementsAsSubmodules") {
+CATCH_TEST_CASE("Sequential/RegistersElementsAsSubmodules") {
   Sequential sequential(Linear(10, 3), Conv2d(1, 2, 3), FeatureDropout(0.5));
   auto modules = sequential->modules();
-  REQUIRE(modules.size() == sequential->children().size());
+  CATCH_REQUIRE(modules.size() == sequential->children().size());
-  REQUIRE(modules[0]->as());
-  REQUIRE(modules[1]->as());
-  REQUIRE(modules[2]->as());
+  CATCH_REQUIRE(modules[0]->as());
+  CATCH_REQUIRE(modules[1]->as());
+  CATCH_REQUIRE(modules[2]->as());
 }
-TEST_CASE("Sequential/CloneToDevice", "[cuda]") {
+CATCH_TEST_CASE("Sequential/CloneToDevice", "[cuda]") {
   Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3));
   torch::Device device(torch::kCUDA, 0);
   Sequential clone =
      std::dynamic_pointer_cast(sequential->clone(device));
   for (const auto& p : clone->parameters()) {
-    REQUIRE(p->device() == device);
+    CATCH_REQUIRE(p->device() == device);
   }
   for (const auto& b : clone->buffers()) {
-    REQUIRE(b->device() == device);
+    CATCH_REQUIRE(b->device() == device);
   }
 }
diff --git a/test/cpp/api/serialization.cpp b/test/cpp/api/serialization.cpp
index 354108991f42cd..fda133b8d9bf00 100644
--- a/test/cpp/api/serialization.cpp
+++ b/test/cpp/api/serialization.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
@@ -30,12 +30,12 @@ Sequential xor_model() {
 }
 } // namespace
-TEST_CASE("serialization") {
+CATCH_TEST_CASE("serialization") {
   torch::manual_seed(0);
-  SECTION("undefined") {
+  CATCH_SECTION("undefined") {
    auto x = torch::Tensor();
-    REQUIRE(!x.defined());
+    CATCH_REQUIRE(!x.defined());
    auto y = torch::randn({5});
@@ -43,10 +43,10 @@ TEST_CASE("serialization") {
    torch::save(ss, &x);
    torch::load(ss, &y);
-    REQUIRE(!y.defined());
+    CATCH_REQUIRE(!y.defined());
  }
-  SECTION("cputypes") {
+  CATCH_SECTION("cputypes") {
    for (int i = 0; i < static_cast(torch::Dtype::NumOptions); i++) {
      if (i == static_cast(torch::Dtype::Half)) {
        // XXX can't serialize half tensors at the moment since contiguous() is
@@ -69,17 +69,17 @@ TEST_CASE("serialization") {
      torch::save(ss, &x);
      torch::load(ss, &y);
-      REQUIRE(y.defined());
-      REQUIRE(x.sizes().vec() == y.sizes().vec());
+      CATCH_REQUIRE(y.defined());
+      CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
      if (torch::isIntegralType(static_cast(i))) {
-        REQUIRE(x.equal(y));
+        CATCH_REQUIRE(x.equal(y));
      } else {
-        REQUIRE(x.allclose(y));
+        CATCH_REQUIRE(x.allclose(y));
      }
    }
  }
-  SECTION("binary") {
+  CATCH_SECTION("binary") {
    auto x = torch::randn({5, 5});
    auto y = torch::Tensor();
@@ -93,11 +93,11 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
+    CATCH_REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("portable_binary") {
+  CATCH_SECTION("portable_binary") {
    auto x = torch::randn({5, 5});
    auto y = torch::Tensor();
@@ -111,12 +111,12 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("resized") {
+  CATCH_SECTION("resized") {
    auto x = torch::randn({11, 5});
    x.resize_({5, 5});
    auto y = torch::Tensor();
@@ -131,11 +131,11 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("sliced") {
+  CATCH_SECTION("sliced") {
    auto x = torch::randn({11, 5});
    x = x.slice(0, 1, 3);
    auto y = torch::Tensor();
@@ -150,12 +150,12 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("noncontig") {
+  CATCH_SECTION("noncontig") {
    auto x = torch::randn({11, 5});
    x = x.slice(1, 1, 4);
    auto y = torch::Tensor();
@@ -170,12 +170,12 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("xor") {
+  CATCH_SECTION("xor") {
    // We better be able to save and load a XOR model!
    auto getLoss = [](Sequential model, uint32_t batch_size) {
      auto inputs = torch::empty({batch_size, 2});
@@ -207,7 +207,7 @@ TEST_CASE("serialization") {
      optimizer.step();
      running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01;
-      REQUIRE(epoch < 3000);
+      CATCH_REQUIRE(epoch < 3000);
      epoch++;
    }
@@ -216,10 +216,10 @@ TEST_CASE("serialization") {
    torch::load(ss, model2);
    auto loss = getLoss(model2, 100);
-    REQUIRE(loss.toCFloat() < 0.1);
+    CATCH_REQUIRE(loss.toCFloat() < 0.1);
  }
-  SECTION("optim") {
+  CATCH_SECTION("optim") {
    auto model1 = Linear(5, 2);
    auto model2 = Linear(5, 2);
    auto model3 = Linear(5, 2);
@@ -235,8 +235,8 @@ TEST_CASE("serialization") {
    auto param2 = model2->parameters();
    auto param3 = model3->parameters();
    for (const auto& p : param1) {
-      REQUIRE(param1[p.key].allclose(param2[p.key]));
-      REQUIRE(param2[p.key].allclose(param3[p.key]));
+      CATCH_REQUIRE(param1[p.key].allclose(param2[p.key]));
+      CATCH_REQUIRE(param2[p.key].allclose(param3[p.key]));
    }
    // Make some optimizers with momentum (and thus state)
@@ -281,13 +281,13 @@ TEST_CASE("serialization") {
    for (const auto& p : param1) {
      const auto& name = p.key;
      // Model 1 and 3 should be the same
-      REQUIRE(param1[name].norm().toCFloat() == param3[name].norm().toCFloat());
-      REQUIRE(param1[name].norm().toCFloat() != param2[name].norm().toCFloat());
+      CATCH_REQUIRE(param1[name].norm().toCFloat() == param3[name].norm().toCFloat());
+      CATCH_REQUIRE(param1[name].norm().toCFloat() != param2[name].norm().toCFloat());
    }
  }
 }
-TEST_CASE("serialization_cuda", "[cuda]") {
+CATCH_TEST_CASE("serialization_cuda", "[cuda]") {
   torch::manual_seed(0);
   // We better be able to save and load a XOR model!
   auto getLoss = [](Sequential model, uint32_t batch_size) {
@@ -318,7 +318,7 @@ TEST_CASE("serialization_cuda", "[cuda]") {
    optimizer.step();
    running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01;
-    REQUIRE(epoch < 3000);
+    CATCH_REQUIRE(epoch < 3000);
    epoch++;
  }
@@ -327,7 +327,7 @@ TEST_CASE("serialization_cuda", "[cuda]") {
  torch::load(ss, model2);
  auto loss = getLoss(model2, 100);
-  REQUIRE(loss.toCFloat() < 0.1);
+  CATCH_REQUIRE(loss.toCFloat() < 0.1);
  model2->to(torch::kCUDA);
  ss.clear();
@@ -335,5 +335,5 @@ TEST_CASE("serialization_cuda", "[cuda]") {
  torch::load(ss, model3);
  loss = getLoss(model3, 100);
-  REQUIRE(loss.toCFloat() < 0.1);
+  CATCH_REQUIRE(loss.toCFloat() < 0.1);
 }
diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp
index f08a30d13c1e6e..57605562db1258 100644
--- a/test/cpp/api/tensor.cpp
+++ b/test/cpp/api/tensor.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
@@ -19,12 +19,12 @@ bool almost_equal(at::Tensor left, T right, T tolerance = 1e-4) {
 }
 #define REQUIRE_TENSOR_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(tensor.device().type() == at::Device((device_), (index_)).type()); \
-  REQUIRE(tensor.device().index() == at::Device((device_), (index_)).index()); \
-  REQUIRE(tensor.dtype() == (type_)); \
-  REQUIRE(tensor.layout() == (layout_))
+  CATCH_REQUIRE(tensor.device().type() == at::Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(tensor.device().index() == at::Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(tensor.dtype() == (type_)); \
+  CATCH_REQUIRE(tensor.layout() == (layout_))
-TEST_CASE("Tensor/ToDtype") {
+CATCH_TEST_CASE("Tensor/ToDtype") {
   auto tensor = at::empty({3, 4});
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
@@ -39,7 +39,7 @@ TEST_CASE("Tensor/ToDtype") {
 }
 // Not currently supported.
-// TEST_CASE("Tensor/ToLayout") {
+// CATCH_TEST_CASE("Tensor/ToLayout") {
 //   auto tensor = at::empty({3, 4});
 //   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
 //
@@ -50,7 +50,7 @@ TEST_CASE("Tensor/ToDtype") {
 //   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
 // }
-TEST_CASE("Tensor/ToDevice", "[cuda]") {
+CATCH_TEST_CASE("Tensor/ToDevice", "[cuda]") {
   auto tensor = at::empty({3, 4});
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
@@ -67,7 +67,7 @@ TEST_CASE("Tensor/ToDevice", "[cuda]") {
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
 }
-TEST_CASE("Tensor/ToDeviceAndDtype", "[cuda]") {
+CATCH_TEST_CASE("Tensor/ToDeviceAndDtype", "[cuda]") {
   auto tensor = at::empty({3, 4});
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
@@ -75,119 +75,119 @@ TEST_CASE("Tensor/ToDeviceAndDtype", "[cuda]") {
   REQUIRE_TENSOR_OPTIONS(at::kCUDA, 1, at::kInt, at::kStrided);
 }
-TEST_CASE("Tensor/ToOptionsRespectsRequiresGrad") {
+CATCH_TEST_CASE("Tensor/ToOptionsRespectsRequiresGrad") {
   {
     auto tensor = torch::empty({3, 4}, at::requires_grad());
-    REQUIRE(tensor.requires_grad());
+    CATCH_REQUIRE(tensor.requires_grad());
     tensor = tensor.to(at::kDouble);
-    REQUIRE(tensor.requires_grad());
+    CATCH_REQUIRE(tensor.requires_grad());
   }
   {
     auto tensor = torch::empty({3, 4});
-    REQUIRE(!tensor.requires_grad());
+    CATCH_REQUIRE(!tensor.requires_grad());
     tensor = tensor.to(at::kDouble);
-    REQUIRE(!tensor.requires_grad());
+    CATCH_REQUIRE(!tensor.requires_grad());
   }
 }
-TEST_CASE("Tensor/ToDoesNotCopyWhenOptionsAreAllTheSame") {
+CATCH_TEST_CASE("Tensor/ToDoesNotCopyWhenOptionsAreAllTheSame") {
   auto tensor = at::empty({3, 4}, at::kFloat);
   auto hopefully_not_copy = tensor.to(at::kFloat);
-  REQUIRE(hopefully_not_copy.data() == tensor.data());
+  CATCH_REQUIRE(hopefully_not_copy.data() == tensor.data());
 }
-TEST_CASE("Tensor/ContainsCorrectValueForSingleValue") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValueForSingleValue") {
   auto tensor = at::tensor(123);
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(tensor[0].toCInt() == 123);
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(tensor[0].toCInt() == 123);
   tensor = at::tensor(123.456f);
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kFloat);
-  REQUIRE(almost_equal(tensor[0], 123.456f));
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kFloat);
+  CATCH_REQUIRE(almost_equal(tensor[0], 123.456f));
   tensor = at::tensor(123.456);
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kDouble);
-  REQUIRE(almost_equal(tensor[0], 123.456));
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kDouble);
+  CATCH_REQUIRE(almost_equal(tensor[0], 123.456));
 }
-TEST_CASE("Tensor/ContainsCorrectValuesForManyValues") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValuesForManyValues") {
   auto tensor = at::tensor({1, 2, 3});
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(exactly_equal(tensor[0], 1));
-  REQUIRE(exactly_equal(tensor[1], 2));
-  REQUIRE(exactly_equal(tensor[2], 3));
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(exactly_equal(tensor[0], 1));
+  CATCH_REQUIRE(exactly_equal(tensor[1], 2));
+  CATCH_REQUIRE(exactly_equal(tensor[2], 3));
   tensor = at::tensor({1.5, 2.25, 3.125});
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kDouble);
-  REQUIRE(almost_equal(tensor[0], 1.5));
-  REQUIRE(almost_equal(tensor[1], 2.25));
-  REQUIRE(almost_equal(tensor[2], 3.125));
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kDouble);
+  CATCH_REQUIRE(almost_equal(tensor[0], 1.5));
+  CATCH_REQUIRE(almost_equal(tensor[1], 2.25));
+  CATCH_REQUIRE(almost_equal(tensor[2], 3.125));
 }
-TEST_CASE("Tensor/ContainsCorrectValuesForManyValuesVariable") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValuesForManyValuesVariable") {
   auto tensor = torch::tensor({1, 2, 3});
-  REQUIRE(tensor.is_variable());
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(exactly_equal(tensor[0], 1));
-  REQUIRE(exactly_equal(tensor[1], 2));
-  REQUIRE(exactly_equal(tensor[2], 3));
+  CATCH_REQUIRE(tensor.is_variable());
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(exactly_equal(tensor[0], 1));
+  CATCH_REQUIRE(exactly_equal(tensor[1], 2));
+  CATCH_REQUIRE(exactly_equal(tensor[2], 3));
   tensor = torch::tensor({1.5, 2.25, 3.125});
-  REQUIRE(tensor.is_variable());
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kDouble);
-  REQUIRE(almost_equal(tensor[0], 1.5));
-  REQUIRE(almost_equal(tensor[1], 2.25));
-  REQUIRE(almost_equal(tensor[2], 3.125));
+  CATCH_REQUIRE(tensor.is_variable());
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kDouble);
+  CATCH_REQUIRE(almost_equal(tensor[0], 1.5));
+  CATCH_REQUIRE(almost_equal(tensor[1], 2.25));
+  CATCH_REQUIRE(almost_equal(tensor[2], 3.125));
 }
-TEST_CASE("Tensor/ContainsCorrectValuesWhenConstructedFromVector") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValuesWhenConstructedFromVector") {
   std::vector v = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
   auto tensor = at::tensor(v);
-  REQUIRE(tensor.numel() == v.size());
-  REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(tensor.numel() == v.size());
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
   for (size_t i = 0; i < v.size(); ++i) {
-    REQUIRE(exactly_equal(tensor[i], v.at(i)));
+    CATCH_REQUIRE(exactly_equal(tensor[i], v.at(i)));
   }
   std::vector w = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.0};
   tensor = at::tensor(w);
-  REQUIRE(tensor.numel() == w.size());
-  REQUIRE(tensor.dtype() == at::kFloat);
+  CATCH_REQUIRE(tensor.numel() == w.size());
+  CATCH_REQUIRE(tensor.dtype() == at::kFloat);
   for (size_t i = 0; i < w.size(); ++i) {
-    REQUIRE(almost_equal(tensor[i], w.at(i)));
+    CATCH_REQUIRE(almost_equal(tensor[i], w.at(i)));
   }
 }
-TEST_CASE("Tensor/UsesOptionsThatAreSupplied") {
+CATCH_TEST_CASE("Tensor/UsesOptionsThatAreSupplied") {
   auto tensor = at::tensor(123, dtype(at::kFloat)) + 0.5;
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kFloat);
-  REQUIRE(almost_equal(tensor[0], 123.5));
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kFloat);
+  CATCH_REQUIRE(almost_equal(tensor[0], 123.5));
   tensor = at::tensor({1.1, 2.2, 3.3}, dtype(at::kInt));
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(tensor.layout() == at::kStrided);
-  REQUIRE(exactly_equal(tensor[0], 1));
-  REQUIRE(exactly_equal(tensor[1], 2));
-  REQUIRE(exactly_equal(tensor[2], 3));
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(tensor.layout() == at::kStrided);
+  CATCH_REQUIRE(exactly_equal(tensor[0], 1));
+  CATCH_REQUIRE(exactly_equal(tensor[1], 2));
+  CATCH_REQUIRE(exactly_equal(tensor[2], 3));
 }
-TEST_CASE("FromBlob") {
+CATCH_TEST_CASE("FromBlob") {
   std::vector v = {1, 2, 3};
   auto tensor = torch::from_blob(v.data(), v.size(), torch::kInt32);
-  REQUIRE(tensor.is_variable());
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor[0].toCInt() == 1);
-  REQUIRE(tensor[1].toCInt() == 2);
-  REQUIRE(tensor[2].toCInt() == 3);
+  CATCH_REQUIRE(tensor.is_variable());
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor[0].toCInt() == 1);
+  CATCH_REQUIRE(tensor[1].toCInt() == 2);
+  CATCH_REQUIRE(tensor[2].toCInt() == 3);
 }
diff --git a/test/cpp/api/tensor_cuda.cpp b/test/cpp/api/tensor_cuda.cpp
index 82d874e74b11b0..8f85014499d201 100644
--- a/test/cpp/api/tensor_cuda.cpp
+++ b/test/cpp/api/tensor_cuda.cpp
@@ -1,11 +1,11 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
-TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[multi-cuda]") {
+CATCH_TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[multi-cuda]") {
   auto tensor = at::tensor({1, 2, 3}, at::device({at::kCUDA, 1}));
-  REQUIRE(tensor.device().type() == at::Device::Type::CUDA);
-  REQUIRE(tensor.device().index() == 1);
+  CATCH_REQUIRE(tensor.device().type() == at::Device::Type::CUDA);
+  CATCH_REQUIRE(tensor.device().index() == 1);
 }
diff --git a/test/cpp/api/tensor_options.cpp b/test/cpp/api/tensor_options.cpp
index ab80c5f45ab396..7118a3599a5d1a 100644
--- a/test/cpp/api/tensor_options.cpp
+++ b/test/cpp/api/tensor_options.cpp
@@ -1,4 +1,4 @@
-#include "catch.hpp"
+#include "catch_utils.hpp"
 #include
@@ -14,28 +14,28 @@ using namespace at;
 // A macro so we don't lose location information when an assertion fails.
 #define REQUIRE_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(options.dtype() == (type_)); \
-  REQUIRE(options.layout() == (layout_))
+  CATCH_REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(options.dtype() == (type_)); \
+  CATCH_REQUIRE(options.layout() == (layout_))
 #define REQUIRE_TENSOR_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(tensor.type().scalarType() == (type_)); \
-  REQUIRE(tensor.type().layout() == (layout_))
+  CATCH_REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(tensor.type().scalarType() == (type_)); \
+  CATCH_REQUIRE(tensor.type().layout() == (layout_))
-TEST_CASE("TensorOptions/DefaultsToTheRightValues") {
+CATCH_TEST_CASE("TensorOptions/DefaultsToTheRightValues") {
   TensorOptions options;
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
 }
-TEST_CASE("TensorOptions/ReturnsTheCorrectType") {
+CATCH_TEST_CASE("TensorOptions/ReturnsTheCorrectType") {
   auto options = TensorOptions().device(kCPU).dtype(kInt).layout(kSparse);
-  REQUIRE(at::getType(options) == getNonVariableType(Backend::SparseCPU, kInt));
+  CATCH_REQUIRE(at::getType(options) == getNonVariableType(Backend::SparseCPU, kInt));
 }
-TEST_CASE("TensorOptions/UtilityFunctionsReturnTheRightTensorOptions") {
+CATCH_TEST_CASE("TensorOptions/UtilityFunctionsReturnTheRightTensorOptions") {
   auto options = dtype(kInt);
   REQUIRE_OPTIONS(kCPU, -1, kInt, kStrided);
@@ -52,7 +52,7 @@ TEST_CASE("TensorOptions/UtilityFunctionsReturnTheRightTensorOptions") {
   REQUIRE_OPTIONS(kCUDA, 3, kByte, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromCPUTypes") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCPUTypes") {
   TensorOptions options;
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
@@ -69,7 +69,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCPUTypes") {
   REQUIRE_OPTIONS(kCPU, -1, kByte, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromCPUTensors") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCPUTensors") {
   auto options = empty(5, kDouble).options();
   REQUIRE_OPTIONS(kCPU, -1, kDouble, kStrided);
@@ -77,37 +77,37 @@ TEST_CASE("TensorOptions/ConstructsWellFromCPUTensors") {
   REQUIRE_OPTIONS(kCPU, -1, kByte, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromVariables") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromVariables") {
   auto options = torch::empty(5).options();
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
-  REQUIRE(!options.requires_grad());
+  CATCH_REQUIRE(!options.requires_grad());
   options = torch::empty(5, at::requires_grad()).options();
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
-  REQUIRE(!options.requires_grad());
+  CATCH_REQUIRE(!options.requires_grad());
 }
-TEST_CASE("Device/ParsesCorrectlyFromString") {
+CATCH_TEST_CASE("Device/ParsesCorrectlyFromString") {
   Device device("cpu:0");
-  REQUIRE(device == Device(kCPU, 0));
+  CATCH_REQUIRE(device == Device(kCPU, 0));
   device = Device("cpu");
-  REQUIRE(device == Device(kCPU));
+  CATCH_REQUIRE(device == Device(kCPU));
   device = Device("cuda:123");
-  REQUIRE(device == Device(kCUDA, 123));
+  CATCH_REQUIRE(device == Device(kCUDA, 123));
   device = Device("cuda");
-  REQUIRE(device == Device(kCUDA));
+  CATCH_REQUIRE(device == Device(kCUDA));
   std::vector badnesses = {
      "", "cud:1", "cuda:", "cpu::1", ":1", "3", "tpu:4", "??"};
   for (const auto& badness : badnesses) {
-    REQUIRE_THROWS(Device(badness));
+    _CATCH_REQUIRE_THROWS(Device(badness));
   }
 }
-TEST_CASE("OptionsGuard") {
+CATCH_TEST_CASE("OptionsGuard") {
   Tensor tensor;
   {
     OptionsGuard guard(TensorOptions{});
@@ -132,5 +132,5 @@ TEST_CASE("OptionsGuard") {
     tensor = torch::empty({10});
   }
   REQUIRE_TENSOR_OPTIONS(kCPU, -1, kFloat, kStrided);
-  REQUIRE(tensor.requires_grad());
+  CATCH_REQUIRE(tensor.requires_grad());
 }
diff --git a/test/cpp/api/tensor_options_cuda.cpp b/test/cpp/api/tensor_options_cuda.cpp
index ea33321b08e2f7..edeede8423c392 100644
--- a/test/cpp/api/tensor_options_cuda.cpp
+++ b/test/cpp/api/tensor_options_cuda.cpp
@@ -1,4 +1,4 @@
-#include "catch.hpp"
+#include "catch_utils.hpp"
 #include
 #include
@@ -10,18 +10,18 @@ using namespace at;
 // A macro so we don't lose location information when an assertion fails.
 #define REQUIRE_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(options.dtype() == (type_)); \
-  REQUIRE(options.layout() == (layout_))
+  CATCH_REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(options.dtype() == (type_)); \
+  CATCH_REQUIRE(options.layout() == (layout_))
 #define REQUIRE_TENSOR_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(tensor.type().scalarType() == (type_)); \
-  REQUIRE(tensor.type().layout() == (layout_))
+  CATCH_REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(tensor.type().scalarType() == (type_)); \
+  CATCH_REQUIRE(tensor.type().layout() == (layout_))
-TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") {
   auto options = CUDA(kFloat).options();
   REQUIRE_OPTIONS(kCUDA, -1, kFloat, kStrided);
@@ -41,7 +41,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") {
   REQUIRE_OPTIONS(kCUDA, 5, kFloat, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") {
   auto options = empty(5, device(kCUDA).dtype(kDouble)).options();
   REQUIRE_OPTIONS(kCUDA, 0, kDouble, kStrided);
@@ -66,7 +66,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") {
   }
 }
-TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") {
+CATCH_TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") {
   Tensor tensor;
   {
     OptionsGuard guard(device(kCUDA));
@@ -87,7 +87,7 @@ TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") {
   REQUIRE_TENSOR_OPTIONS(kCUDA, 0, kInt, kStrided);
 }
-TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") {
+CATCH_TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") {
   Tensor tensor;
   {
     // Check that OptionsGuard respects any active device before construction.
@@ -112,17 +112,17 @@ TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") {
   }
 }
-TEST_CASE("DeviceGuardIsMovable", "[cuda]") {
+CATCH_TEST_CASE("DeviceGuardIsMovable", "[cuda]") {
   DeviceGuard first(1);
-  REQUIRE(first.original_index() == 0);
-  REQUIRE(first.last_index() == 1);
+  CATCH_REQUIRE(first.original_index() == 0);
+  CATCH_REQUIRE(first.last_index() == 1);
   DeviceGuard second(std::move(first));
-  REQUIRE(second.original_index() == 0);
-  REQUIRE(second.last_index() == 1);
-  REQUIRE(first.original_index() == -1);
+  CATCH_REQUIRE(second.original_index() == 0);
+  CATCH_REQUIRE(second.last_index() == 1);
+  CATCH_REQUIRE(first.original_index() == -1);
   DeviceGuard third;
   third = std::move(second);
-  REQUIRE(third.original_index() == 0);
-  REQUIRE(third.last_index() == 1);
-  REQUIRE(second.original_index() == -1);
+  CATCH_REQUIRE(third.original_index() == 0);
+  CATCH_REQUIRE(third.last_index() == 1);
+  CATCH_REQUIRE(second.original_index() == -1);
 }
diff --git a/torch/csrc/jit/catch_utils.hpp b/torch/csrc/jit/catch_utils.hpp
new file mode 100644
index 00000000000000..b9b0a87990a9ce
--- /dev/null
+++ b/torch/csrc/jit/catch_utils.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+#define CATCH_CONFIG_PREFIX_ALL
+#include
+
+// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning;
+// define our own version that doesn't warn.
+#define _CATCH_REQUIRE_THROWS( ... ) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ )
diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp
index 28bf95847d634f..3110fb2c360a35 100644
--- a/torch/csrc/jit/test_jit.cpp
+++ b/torch/csrc/jit/test_jit.cpp
@@ -1,13 +1,13 @@
 #ifdef USE_CATCH
 #define CATCH_CONFIG_MAIN
-#include "catch.hpp"
+#include "catch_utils.hpp"
 using Catch::StartsWith;
 #else
-#define REQUIRE JIT_ASSERT
+#define CATCH_REQUIRE JIT_ASSERT
 #endif
@@ -110,9 +110,9 @@ static void codeTemplateTest() {
    e.v("what",{"is","this"});
    TemplateEnv c(e);
    c.s("hi","foo2");
-    REQUIRE(e.s("hi") == "foo");
-    REQUIRE(c.s("hi") == "foo2");
-    REQUIRE(e.v("what")[0] == "is");
+    CATCH_REQUIRE(e.s("hi") == "foo");
+    CATCH_REQUIRE(c.s("hi") == "foo2");
+    CATCH_REQUIRE(e.v("what")[0] == "is");
  }
  {
@@ -126,7 +126,7 @@ static void codeTemplateTest() {
    auto s = ct.format(e);
    //std::cout << "'" << s << "'\n";
    //std::cout << "'" << ct_expect << "'\n";
-    REQUIRE(s == ct_expect);
+    CATCH_REQUIRE(s == ct_expect);
  }
 }
@@ -146,11 +146,11 @@ static void fusionTests() {
    auto b = at::rand({4,3}, at::kCUDA).transpose(0,1);
    auto o = at::zeros({3,4}, at::kCUDA);
    auto outputs = debugLaunchGraph(graph, 0, {a,b});
-    REQUIRE(outputs.size() == 1);
+    CATCH_REQUIRE(outputs.size() == 1);
    auto o2 = a*b;
    float max_diff = (o2 - outputs[0]).abs().max().toCDouble();
    //std::cout << "max diff: " << max_diff << "\n";
-    REQUIRE(max_diff == 0);
+    CATCH_REQUIRE(max_diff == 0);
  };
  testSimple();
@@ -200,10 +200,10 @@ static void fusionTests() {
    auto out0 = t16*t5;
    auto outputs = debugLaunchGraph(graph, 0, inputs);
-    REQUIRE(outputs.size() == graph.outputs().size());
-    REQUIRE(out0.is_same_size(outputs.front()));
+    CATCH_REQUIRE(outputs.size() == graph.outputs().size());
+    CATCH_REQUIRE(out0.is_same_size(outputs.front()));
    float max_diff = (outputs.front() - out0).abs().max().toCDouble();
-    REQUIRE(max_diff < 1e-6);
+    CATCH_REQUIRE(max_diff < 1e-6);
  };
  testOne(0,0,0,0);
@@ -234,12 +234,12 @@ static void fusionTests() {
    auto o_r = a*b;
    auto o2_r = at::cat({a, o_r}, dim);
    auto outputs = debugLaunchGraph(graph, 0, {a,b});
-    REQUIRE(outputs.size() == 2);
+    CATCH_REQUIRE(outputs.size() == 2);
    float max_diff = (o_r - outputs[0]).abs().max().toCDouble();
-    REQUIRE(max_diff == 0);
+    CATCH_REQUIRE(max_diff == 0);
    float max_diff2 = (o2_r - outputs[1]).abs().max().toCDouble();
-    REQUIRE(max_diff2 == 0);
+    CATCH_REQUIRE(max_diff2 == 0);
  };
  testConcat(0);
  testConcat(1);
@@ -255,58 +255,58 @@ void attributesTest() {
  auto four = attr::perm;
  Attr attr;
  attr.f_(one,3.4)->i_(two,5)->s_(three,"what");
-  REQUIRE(attr.f(one) == 3.4);
-  REQUIRE(attr.s(three) == "what");
-  REQUIRE(attr.i(two) == 5);
+  CATCH_REQUIRE(attr.f(one) == 3.4);
+  CATCH_REQUIRE(attr.s(three) == "what");
+  CATCH_REQUIRE(attr.i(two) == 5);
  attr.s_(one,"no");
-  REQUIRE(attr.s(one) == "no");
-  REQUIRE(attr.hasAttribute(three));
-  REQUIRE(!attr.hasAttribute(four));
+  CATCH_REQUIRE(attr.s(one) == "no");
+  CATCH_REQUIRE(attr.hasAttribute(three));
+  CATCH_REQUIRE(!attr.hasAttribute(four));
  attr.ss_(two, {"hi", "now"});
-  REQUIRE(attr.ss(two).at(1) == "now");
+  CATCH_REQUIRE(attr.ss(two).at(1) == "now");
  Attr attr2;
  attr2.copyAttributes(attr);
-  REQUIRE(attr2.s(one) == "no");
+  CATCH_REQUIRE(attr2.s(one) == "no");
  attr2.f_(one,5);
-  REQUIRE(attr.s(one) == "no");
-  REQUIRE(attr2.f(one) == 5);
+  CATCH_REQUIRE(attr.s(one) == "no");
+  CATCH_REQUIRE(attr2.f(one) == 5);
 }
 void internedStringsTests () {
-  REQUIRE(prim::Param == Symbol::prim("Param"));
-  REQUIRE(prim::Return == Symbol::prim("Return"));
-  REQUIRE(prim::Return.toUnqualString() == std::string("Return"));
-  REQUIRE(prim::Return.toQualString() == std::string("prim::Return"));
+  CATCH_REQUIRE(prim::Param == Symbol::prim("Param"));
+  CATCH_REQUIRE(prim::Return == Symbol::prim("Return"));
+  CATCH_REQUIRE(prim::Return.toUnqualString() == std::string("Return"));
+  CATCH_REQUIRE(prim::Return.toQualString() == std::string("prim::Return"));
  Symbol newsym = Symbol::aten("__NEW_SYMBOL");
  size_t symstart = newsym;
-  REQUIRE(newsym.toQualString() == std::string("aten::__NEW_SYMBOL"));
+  CATCH_REQUIRE(newsym.toQualString() == std::string("aten::__NEW_SYMBOL"));
  // TODO: This test is a bit too close to the implementation details.
-  REQUIRE(Symbol::aten("What") == symstart+1);
-  REQUIRE(Symbol::aten("What2") == symstart+2);
-  REQUIRE(Symbol::aten("What") == symstart+1);
-  REQUIRE(Symbol::aten("What2") == symstart+2);
-  REQUIRE(Symbol(symstart+2).toUnqualString() == std::string("What2"));
+  CATCH_REQUIRE(Symbol::aten("What") == symstart+1);
+  CATCH_REQUIRE(Symbol::aten("What2") == symstart+2);
+  CATCH_REQUIRE(Symbol::aten("What") == symstart+1);
+  CATCH_REQUIRE(Symbol::aten("What2") == symstart+2);
+  CATCH_REQUIRE(Symbol(symstart+2).toUnqualString() == std::string("What2"));
 }
 void fromQualStringTests() {
-  REQUIRE(Symbol::fromQualString("prim::Param") == Symbol::prim("Param"));
-  REQUIRE(Symbol::fromQualString("aten::mm") == Symbol::aten("mm"));
-  REQUIRE(Symbol::fromQualString("onnx::LSTM") == Symbol::onnx("LSTM"));
-  REQUIRE(Symbol::fromQualString("attr::value") == Symbol::attr("value"));
-  REQUIRE(Symbol::fromQualString("scope::") == Symbol::scope(""));
-  REQUIRE(Symbol::fromQualString("::").toUnqualString() == std::string(""));
-  REQUIRE(Symbol::fromQualString("::").ns().toQualString() == std::string("namespaces::"));
-  REQUIRE(Symbol::fromQualString("new_ns::param").toUnqualString() == std::string("param"));
-  REQUIRE(Symbol::fromQualString("new_ns::param").ns().toUnqualString() == std::string("new_ns"));
-  REQUIRE(Symbol::fromQualString("new_ns::param").ns() == Symbol::fromQualString("namespaces::new_ns"));
+  CATCH_REQUIRE(Symbol::fromQualString("prim::Param") == Symbol::prim("Param"));
+  CATCH_REQUIRE(Symbol::fromQualString("aten::mm") == Symbol::aten("mm"));
+  CATCH_REQUIRE(Symbol::fromQualString("onnx::LSTM") == Symbol::onnx("LSTM"));
+  CATCH_REQUIRE(Symbol::fromQualString("attr::value") == Symbol::attr("value"));
+  CATCH_REQUIRE(Symbol::fromQualString("scope::") == Symbol::scope(""));
+  CATCH_REQUIRE(Symbol::fromQualString("::").toUnqualString() == std::string(""));
+  CATCH_REQUIRE(Symbol::fromQualString("::").ns().toQualString() == std::string("namespaces::"));
+  CATCH_REQUIRE(Symbol::fromQualString("new_ns::param").toUnqualString() == std::string("param"));
+  CATCH_REQUIRE(Symbol::fromQualString("new_ns::param").ns().toUnqualString() == std::string("new_ns"));
+  CATCH_REQUIRE(Symbol::fromQualString("new_ns::param").ns() == Symbol::fromQualString("namespaces::new_ns"));
  auto bad_inputs = {"scope", ":", ""};
  for (auto input : bad_inputs) {
    try {
      Symbol::fromQualString(input);
-      REQUIRE(0);
+      CATCH_REQUIRE(0);
    } catch (std::runtime_error c) {
    }
  }
@@ -467,8 +467,8 @@ void interpTest() {
  std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh);
  //std::cout << almostEqual(outputs[0],hx) << "\n";
-  REQUIRE(exactlyEqual(outputs[0],hx));
-  REQUIRE(exactlyEqual(outputs[1],cx));
+  CATCH_REQUIRE(exactlyEqual(outputs[0],hx));
+  CATCH_REQUIRE(exactlyEqual(outputs[1],cx));
 }
 void interpStageTest() {
@@ -500,8 +500,8 @@ void interpStageTest() {
  std::tie(hx, cx) = lstm(input[0], hx, cx1, w_ih, w_hh);
  //std::cout << almostEqual(outputs[0],hx) << "\n";
-  REQUIRE(exactlyEqual(outputs[0],hx));
-  REQUIRE(exactlyEqual(outputs[1],cx));
+  CATCH_REQUIRE(exactlyEqual(outputs[0],hx));
+  CATCH_REQUIRE(exactlyEqual(outputs[1],cx));
 }
 using var_meta_type = std::vector;
@@ -554,10 +554,10 @@ variable_list grad(const variable_list& outputs, const variable_list& inputs, co
 }
 void assertAllClose(const tensor_list& a, const tensor_list& b) {
-  REQUIRE(a.size() == b.size());
+  CATCH_REQUIRE(a.size() == b.size());
  for (size_t i = 0; i < a.size(); ++i) {
-    REQUIRE(a[i].is_same_size(b[i]));
-    REQUIRE(a[i].allclose(b[i]));
+    CATCH_REQUIRE(a[i].is_same_size(b[i]));
+    CATCH_REQUIRE(a[i].allclose(b[i]));
  }
 }
@@ -654,11 +654,11 @@ void testDifferentiate(std::ostream & out) {
  std::vector expected_captured_outputs = {1};
  std::vector expected_input_vjps = {0, 1};
  std::vector expected_output_vjps = {0, 1};
-  REQUIRE(grad_spec.f_real_outputs == 1);
-  REQUIRE(grad_spec.df_input_captured_inputs == expected_captured_inputs);
-  REQUIRE(grad_spec.df_input_captured_outputs == expected_captured_outputs);
-  REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
-  REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
+  CATCH_REQUIRE(grad_spec.f_real_outputs == 1);
+  CATCH_REQUIRE(grad_spec.df_input_captured_inputs == expected_captured_inputs);
+  CATCH_REQUIRE(grad_spec.df_input_captured_outputs == expected_captured_outputs);
+  CATCH_REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
+  CATCH_REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
  out << "testDifferentiate\n";
  out << *grad_spec.f;
  out << *grad_spec.df;
@@ -684,11 +684,11 @@ void testDifferentiateWithRequiresGrad(std::ostream & out) {
  auto grad_spec = differentiate(graph);
  std::vector expected_input_vjps = {1, 2}; // for e and %4 = (d + a)
  std::vector expected_output_vjps = {0}; // only a requires grad
-  REQUIRE(grad_spec.f_real_outputs == 2); // we need one temporary %4 = (d + a)
-  REQUIRE(grad_spec.df_input_captured_inputs == std::vector({0}));
-  REQUIRE(grad_spec.df_input_captured_outputs == std::vector({2}));
-  REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
-  REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
+  CATCH_REQUIRE(grad_spec.f_real_outputs == 2); // we need one temporary %4 = (d + a)
+  CATCH_REQUIRE(grad_spec.df_input_captured_inputs == std::vector({0}));
+  CATCH_REQUIRE(grad_spec.df_input_captured_outputs == std::vector({2}));
+  CATCH_REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
+  CATCH_REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
  out << "testDifferentiateWithRequiresGrad\n";
  out << *grad_spec.f;
  out << *grad_spec.df;
@@ -718,7 +718,7 @@ bool isEqual(at::IntList lhs, at::IntList rhs) {
 }
 bool isEqual(const CompleteArgumentInfo & ti, const autograd::Variable & v) {
-  REQUIRE(ti.isTensor());
+  CATCH_REQUIRE(ti.isTensor());
  if(!ti.defined())
    return ti.defined() == v.defined();
  return
@@ -754,34 +754,34 @@ void argumentSpecTest() {
  CompleteArgumentSpec a(true, list);
  CompleteArgumentSpec b(true, list);
-  REQUIRE(a.hashCode() == b.hashCode());
+  CATCH_REQUIRE(a.hashCode() == b.hashCode());
-  REQUIRE(a == b);
+  CATCH_REQUIRE(a == b);
  CompleteArgumentSpec d(true, list2);
-  REQUIRE(d == a);
-  REQUIRE(d.hashCode() == a.hashCode());
+  CATCH_REQUIRE(d == a);
+  CATCH_REQUIRE(d.hashCode() == a.hashCode());
  for(size_t i = 0; i < list.size(); ++i) {
-    REQUIRE(isEqual(a.at(i), list[i].toTensor()));
+    CATCH_REQUIRE(isEqual(a.at(i), list[i].toTensor()));
  }
  CompleteArgumentSpec no_grad(/*with_grad=*/false, list);
-  REQUIRE(no_grad != a);
+  CATCH_REQUIRE(no_grad != a);
  std::unordered_set spec;
  spec.insert(std::move(a));
-  REQUIRE(spec.count(b) > 0);
-  REQUIRE(spec.count(no_grad) == 0);
+  CATCH_REQUIRE(spec.count(b) > 0);
+  CATCH_REQUIRE(spec.count(no_grad) == 0);
  spec.insert(std::move(no_grad));
-  REQUIRE(spec.count(CompleteArgumentSpec(true,list)) == 1);
+  CATCH_REQUIRE(spec.count(CompleteArgumentSpec(true,list)) == 1);
  list2[1].toTensor().transpose_(0,1);
  CompleteArgumentSpec c(true, list2); // same as list, except for one stride
-  REQUIRE(!(c == a));
-  REQUIRE(spec.count(c) == 0);
+  CATCH_REQUIRE(!(c == a));
+  CATCH_REQUIRE(spec.count(c) == 0);
  Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) };
  CompleteArgumentSpec with_const(true, stack);
-  REQUIRE(with_const.at(2).sizes().size() == 2);
+  CATCH_REQUIRE(with_const.at(2).sizes().size() == 2);
 }
 void testGraphExecutor() {
@@ -802,11 +802,11 @@ void testGraphExecutor() {
  GraphExecutor executor(g);
  auto stack = createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh)});
  executor.run(stack);
-  REQUIRE(stack.size() == 2);
+  CATCH_REQUIRE(stack.size() == 2);
  at::Tensor r0, r1;
  std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh);
-  REQUIRE(almostEqual(Variable(stack[0].toTensor()).data(), r0));
-  REQUIRE(almostEqual(Variable(stack[1].toTensor()).data(), r1));
+  CATCH_REQUIRE(almostEqual(Variable(stack[0].toTensor()).data(), r0));
+  CATCH_REQUIRE(almostEqual(Variable(stack[1].toTensor()).data(), r1));
 }
 void testBlocks(std::ostream & out) {
@@ -877,11 +877,11 @@ void testControlFlow() {
  auto run_binary = [&](const std::string & name, int64_t a, int64_t b) {
    return V(run(name, {L(a), L(b)})[0]);
  };
-  REQUIRE(2 == run_binary("if_test", 1, 2));
-  REQUIRE(3 == run_binary("if_test", 3, 2));
-  REQUIRE(2 == run_binary("if_one", 2, 3));
-  REQUIRE(2 == run_binary("if_one", 3, 2));
-  REQUIRE(256 == run_binary("while_test",2,0));
+  CATCH_REQUIRE(2 == run_binary("if_test", 1, 2));
+  CATCH_REQUIRE(3 == run_binary("if_test", 3, 2));
+  CATCH_REQUIRE(2 == run_binary("if_one", 2, 3));
+  CATCH_REQUIRE(2 == run_binary("if_one", 3, 2));
+  CATCH_REQUIRE(256 == run_binary("while_test",2,0));
 }
 void testIValue() {
@@ -939,18 +939,18 @@ void testCustomOperators() {
    RegisterOperators reg({createOperator(
        "foo::bar", [](double a, at::Tensor b) { return a + b; })});
    auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
-    REQUIRE(ops.size() == 1);
+    CATCH_REQUIRE(ops.size() == 1);
    auto& op = ops.front();
-    REQUIRE(op->schema().name == "foo::bar");
+    CATCH_REQUIRE(op->schema().name == "foo::bar");
-    REQUIRE(op->schema().arguments.size() == 2);
-    REQUIRE(op->schema().arguments[0].name == "_0");
-    REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType);
-    REQUIRE(op->schema().arguments[1].name == "_1");
-    REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType);
+    CATCH_REQUIRE(op->schema().arguments.size() == 2);
+    CATCH_REQUIRE(op->schema().arguments[0].name == "_0");
+    CATCH_REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType);
+    CATCH_REQUIRE(op->schema().arguments[1].name == "_1");
+    CATCH_REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType);
-    REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType);
+    CATCH_REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType);
    Stack stack;
    push(stack, 2.0f, autograd::make_variable(at::ones(5)));
@@ -958,7 +958,7 @@ void testCustomOperators() {
    at::Tensor output;
    pop(stack, output);
-    REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f))));
+    CATCH_REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f))));
  }
  {
    RegisterOperators reg({createOperator(
@@ -967,19 +967,19 @@ void testCustomOperators() {
    auto& ops =
        getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
-    REQUIRE(ops.size() == 1);
+    CATCH_REQUIRE(ops.size() == 1);
    auto& op = ops.front();
-    REQUIRE(op->schema().name == "foo::bar_with_schema");
+    CATCH_REQUIRE(op->schema().name == "foo::bar_with_schema");
-    REQUIRE(op->schema().arguments.size() == 2);
-    REQUIRE(op->schema().arguments[0].name == "a");
REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType); - REQUIRE(op->schema().arguments[1].name == "b"); - REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType); + CATCH_REQUIRE(op->schema().arguments.size() == 2); + CATCH_REQUIRE(op->schema().arguments[0].name == "a"); + CATCH_REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType); + CATCH_REQUIRE(op->schema().arguments[1].name == "b"); + CATCH_REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType); - REQUIRE(op->schema().returns.size() == 1); - REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType); + CATCH_REQUIRE(op->schema().returns.size() == 1); + CATCH_REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType); Stack stack; push(stack, 2.0f, autograd::make_variable(at::ones(5))); @@ -987,7 +987,7 @@ void testCustomOperators() { at::Tensor output; pop(stack, output); - REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f)))); + CATCH_REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f)))); } { // Check that lists work well. @@ -999,21 +999,21 @@ void testCustomOperators() { auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - REQUIRE(ops.size() == 1); + CATCH_REQUIRE(ops.size() == 1); auto& op = ops.front(); - REQUIRE(op->schema().name == "foo::lists"); + CATCH_REQUIRE(op->schema().name == "foo::lists"); - REQUIRE(op->schema().arguments.size() == 3); - REQUIRE(op->schema().arguments[0].name == "ints"); - REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofInts())); - REQUIRE(op->schema().arguments[1].name == "floats"); - REQUIRE(op->schema().arguments[1].type->isSubtypeOf(ListType::ofFloats())); - REQUIRE(op->schema().arguments[2].name == "tensors"); - REQUIRE(op->schema().arguments[2].type->isSubtypeOf(ListType::ofTensors())); + CATCH_REQUIRE(op->schema().arguments.size() == 3); + CATCH_REQUIRE(op->schema().arguments[0].name == "ints"); + CATCH_REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofInts())); + CATCH_REQUIRE(op->schema().arguments[1].name == "floats"); + CATCH_REQUIRE(op->schema().arguments[1].type->isSubtypeOf(ListType::ofFloats())); + CATCH_REQUIRE(op->schema().arguments[2].name == "tensors"); + CATCH_REQUIRE(op->schema().arguments[2].type->isSubtypeOf(ListType::ofTensors())); - REQUIRE(op->schema().returns.size() == 1); - REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofFloats())); + CATCH_REQUIRE(op->schema().returns.size() == 1); + CATCH_REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofFloats())); Stack stack; push(stack, std::vector{1, 2}); @@ -1023,9 +1023,9 @@ void testCustomOperators() { std::vector output; pop(stack, output); - REQUIRE(output.size() == 2); - REQUIRE(output[0] == 1.0); - REQUIRE(output[1] == 2.0); + CATCH_REQUIRE(output.size() == 2); + CATCH_REQUIRE(output[0] == 1.0); + CATCH_REQUIRE(output[1] == 2.0); } { RegisterOperators reg( @@ -1034,17 +1034,17 @@ void testCustomOperators() { auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - REQUIRE(ops.size() == 1); + CATCH_REQUIRE(ops.size() == 1); auto& op = ops.front(); - REQUIRE(op->schema().name == "foo::lists2"); + CATCH_REQUIRE(op->schema().name == "foo::lists2"); - REQUIRE(op->schema().arguments.size() == 1); - REQUIRE(op->schema().arguments[0].name == "tensors"); - REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofTensors())); + CATCH_REQUIRE(op->schema().arguments.size() == 1); + 
CATCH_REQUIRE(op->schema().arguments[0].name == "tensors"); + CATCH_REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofTensors())); - REQUIRE(op->schema().returns.size() == 1); - REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofTensors())); + CATCH_REQUIRE(op->schema().returns.size() == 1); + CATCH_REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofTensors())); Stack stack; push(stack, std::vector{autograd::make_variable(at::ones(5))}); @@ -1052,31 +1052,31 @@ void testCustomOperators() { std::vector output; pop(stack, output); - REQUIRE(output.size() == 1); - REQUIRE(output[0].allclose(autograd::make_variable(at::ones(5)))); + CATCH_REQUIRE(output.size() == 1); + CATCH_REQUIRE(output[0].allclose(autograd::make_variable(at::ones(5)))); } { #ifdef USE_CATCH - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(Tensor a) -> Tensor", [](double a, at::Tensor b) { return a + b; }), StartsWith("Inferred 2 argument(s) for operator implementation, " "but the provided schema specified 1 argument(s).")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(Tensor a) -> Tensor", [](double a) { return a; }), StartsWith("Inferred type for argument #0 was float, " "but the provided schema specified type Dynamic " "for the argument in that position")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(float a) -> (float, float)", [](double a) { return a; }), StartsWith("Inferred 1 return value(s) for operator implementation, " "but the provided schema specified 2 return value(s).")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(float a) -> Tensor", [](double a) { return a; }), @@ -1109,7 +1109,7 @@ void testCustomOperators() { break; } } - REQUIRE(contains_traced_op); + CATCH_REQUIRE(contains_traced_op); } { #ifdef USE_CATCH @@ -1124,7 +1124,7 @@ void testCustomOperators() { Stack stack; push(stack, std::vector{1.0}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( op.getOperation()(stack), StartsWith("Tracing float lists currently not supported!")); #endif @@ -1156,42 +1156,42 @@ TORCH_API std::string runJITCPPTests() { #ifdef USE_CATCH -TEST_CASE( "jit test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "jit test CPU", "[cpu]" ) { std::stringstream out; - SECTION( "control flow" ) + CATCH_SECTION( "control flow" ) testControlFlow(); - SECTION( "blocks" ) + CATCH_SECTION( "blocks" ) testBlocks(out); - SECTION( "create autodiff subgraphs" ) + CATCH_SECTION( "create autodiff subgraphs" ) testCreateAutodiffSubgraphs(out); - SECTION( "differentiate" ) + CATCH_SECTION( "differentiate" ) testDifferentiate(out); - SECTION( "differentiate with requires grad" ) + CATCH_SECTION( "differentiate with requires grad" ) testDifferentiateWithRequiresGrad(out); - SECTION( "AD formulas" ) + CATCH_SECTION( "AD formulas" ) testADFormulas(); - SECTION( "code template" ) + CATCH_SECTION( "code template" ) codeTemplateTest(); - SECTION( "attributes" ) + CATCH_SECTION( "attributes" ) attributesTest(); - SECTION( "interned strings" ) + CATCH_SECTION( "interned strings" ) internedStringsTests(); - SECTION( "custom operators" ) + CATCH_SECTION( "custom operators" ) testCustomOperators(); } -TEST_CASE( "jit test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "jit test CUDA", "[cuda]" ) { - SECTION( "graph executor" ) + CATCH_SECTION( "graph executor" ) testGraphExecutor(); - SECTION( "fusion" ) + CATCH_SECTION( "fusion" ) fusionTests(); - 
SECTION( "interp" ) + CATCH_SECTION( "interp" ) interpTest(); - SECTION( "interp stage" ) + CATCH_SECTION( "interp stage" ) interpStageTest(); - SECTION( "argument spec" ) + CATCH_SECTION( "argument spec" ) argumentSpecTest(); } From c8fbeb3aa2068bb509de40067aa4f69fe25f22f2 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 18 Sep 2018 10:14:40 -0700 Subject: [PATCH 222/237] Add empty tensor tests to test_sparse (#11228) Summary: This PR adds empty sparse tensor tests to `test_sparse.py`, and also fix various places in internal code to make the tests pass. **[NOTE] API CHANGE:** - `coalesce` on sparse tensor will always be performed out-of-place now (meaning the original tensor will never be affected) Pull Request resolved: https://github.com/pytorch/pytorch/pull/11228 Differential Revision: D9755189 Pulled By: yf225 fbshipit-source-id: e9d36f437db1a132c423d3a282ff405a084ae7cc --- aten/src/ATen/native/sparse/SparseTensor.cpp | 22 +- .../ATen/native/sparse/SparseTensorMath.cpp | 18 +- .../native/sparse/cuda/SparseCUDATensor.cpp | 3 + .../native/sparse/cuda/SparseCUDATensor.cu | 10 +- .../sparse/cuda/SparseCUDATensorMath.cu | 66 +- ...arse.test_add_dense_sparse_mismatch.expect | 1 - .../TestCudaSparse.test_log1p-backward.expect | 1 - ...stCudaSparse.test_log1p-uncoalesced.expect | 1 - ...arse.test_add_dense_sparse_mismatch.expect | 1 - ...coalescedSparse.test_log1p-backward.expect | 1 - ...lescedSparse.test_log1p-uncoalesced.expect | 1 - ...arse.test_add_dense_sparse_mismatch.expect | 1 - .../TestSparse.test_log1p-backward.expect | 1 - .../TestSparse.test_log1p-uncoalesced.expect | 1 - ...TestSparseOneOff.test_cuda_from_cpu.expect | 1 - ...eOff.test_cuda_sparse_cpu_dense_add.expect | 1 - ...arse.test_add_dense_sparse_mismatch.expect | 1 - ...coalescedSparse.test_log1p-backward.expect | 1 - ...lescedSparse.test_log1p-uncoalesced.expect | 1 - test/test_sparse.py | 1125 +++++++++++------ 20 files changed, 780 insertions(+), 478 deletions(-) delete mode 100644 test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestCudaSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestCudaSparse.test_log1p-uncoalesced.expect delete mode 100644 test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect delete mode 100644 test/expect/TestSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestSparse.test_log1p-uncoalesced.expect delete mode 100644 test/expect/TestSparseOneOff.test_cuda_from_cpu.expect delete mode 100644 test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect delete mode 100644 test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestUncoalescedSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 25fd4fc5df4326..49efed2a1e066b 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -257,12 +257,16 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { AT_ASSERT(!self.is_variable()); AT_ASSERT(self.is_sparse()); - if (self._nnz() < 2) { - 
_get_sparse_impl(self)->set_coalesced(true); - } if (self.is_coalesced()) { return self; } + // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, + // we should keep the original tensor intact and do coalesce on a copy of the tensor + if (self._nnz() < 2) { + SparseTensor dst = self.clone(); + _get_sparse_impl(dst)->set_coalesced(true); + return dst; + } LongTensor indices = self._indices(); Tensor values = self._values().contiguous(); @@ -306,13 +310,17 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { int64_t pos = indicesPermutationAccessor[j]; int64_t curr = indicesBufferAccessor[j]; if (curr == prev) { - THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy + THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } } else { ++i; for (int64_t d = 0; d < sparseDims; d++) { newIndicesAccessor[d][i] = indicesAccessor[d][pos]; } - THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy + THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } } prev = curr; } @@ -345,6 +353,10 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); _get_sparse_impl(r)->set_nnz_and_narrow(r_nnz); + if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements + return r; + } + // NB: Relies on mask._nnz() == 0 test above auto mask_indices_accessor = mask_indices.accessor(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index afd8001734a9a8..ec074b5a6c8a88 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -250,18 +250,22 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; } - THBlas_axpy(blockSize, 1, - t_values_ptr + t_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); + if (t_values.numel() > 0) { // We add all elements from t_values to r_values only if t_values is not an empty tensor + THBlas_axpy(blockSize, 1, + t_values_ptr + t_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + } t_i++; } if (cmp <= 0) { for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; } - THBlas_axpy(blockSize, cast_value, - s_values_ptr + s_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); + if (s_values.numel() > 0) { // We add all elements from s_values to r_values only if s_values is not an empty tensor + THBlas_axpy(blockSize, cast_value, + s_values_ptr + s_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + } s_i++; } r_i++; @@ -368,6 +372,7 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); if (src_._nnz() == 0 || t_._nnz() == 0) { + r.resize_as_(src_); return r.zero_(); } @@ -519,7 +524,6 @@ Tensor& s_addmm_out_sparse_dense_cpu( AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D 
tensor"); AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); - AT_CHECK(dense.numel() != 0, "addmm: matrices expected, got empty tensor"); AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index fc53c41b56f727..107a30f51c2a97 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -25,6 +25,9 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); + if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements + return r; + } LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index ff4b0e0c57736c..ef2da61460eb99 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -27,12 +27,16 @@ namespace at { namespace native { SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { #ifndef __HIP_PLATFORM_HCC__ int64_t nnz = self._nnz(); - if (nnz < 2) { - _get_sparse_impl(self)->set_coalesced(true); - } if (self.is_coalesced()) { return self; } + // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, + // we should keep the original tensor intact and do coalesce on a copy of the tensor + if (nnz < 2) { + SparseTensor dst = self.clone(); + _get_sparse_impl(dst)->set_coalesced(true); + return dst; + } cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index d7701479e9b243..712c28817bf6b1 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -106,37 +106,38 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT r__.transpose_(0, 1); } - /* dense */ - Tensor dense_; - char transpose_dense; - if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { - transpose_dense = 'n'; - dense_ = dense; - } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { - transpose_dense = 't'; - dense_ = dense; - } else { - transpose_dense = 't'; - dense_ = dense.contiguous(); - } - - sparse::cuda::csrmm2( - 'n', - transpose_dense, - m, - n, - k, - nnz, - cast_alpha, - values.data(), - csr.data(), - colIndicesInt.data(), - dense_.data(), - (transpose_dense == 'n' ? 
dense_.stride(1) : dense_.stride(0)), - cast_beta, - r__.data(), - r__.stride(1)); + if (nnz > 0) { + /* dense */ + Tensor dense_; + char transpose_dense; + if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); + } + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data(), + csr.data(), + colIndicesInt.data(), + dense_.data(), + (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data(), + r__.stride(1)); + } }); r_.copy_(r__); @@ -281,6 +282,10 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR int64_t nDim = dense.dim(); int64_t nDimI = sparse._sparseDims(); + if (sparse._values().numel() == 0) { + return r_; + } + if (sparse.is_coalesced()) { // TODO benchmark to decide whether to remove this special case const dim3 block = cuda::getApplyBlock(); @@ -423,6 +428,7 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons SparseTensor src = src_.coalesce(); if (src_._nnz() == 0 || t_._nnz() == 0) { + r_.resize_as_(src_); return r_.zero_(); } diff --git a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-backward.expect b/test/expect/TestCudaSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestCudaSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-backward.expect b/test/expect/TestSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-uncoalesced.expect b/test/expect/TestSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect b/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect deleted file mode 100644 index fab1614da93d4a..00000000000000 --- a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect +++ /dev/null @@ -1 +0,0 @@ -backend of indices (CUDA) must match backend of values (CPU) \ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect b/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect deleted file mode 100644 index 77b0b500f3b692..00000000000000 --- a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'other' to be a CPU tensor, but got a CUDA tensor \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestUncoalescedSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/test_sparse.py b/test/test_sparse.py index 82de45d6d17384..760879485e2b6d 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -44,7 +44,7 @@ def setUp(self): self.SparseTensor = torch.sparse.DoubleTensor super(TestSparse, self).setUp() - def _gen_sparse(self, d, nnz, with_size): + def _gen_sparse(self, sparse_dims, nnz, with_size): # TODO: Consider implementing this in the CUDA case by directly # performing the operations on the GPU. 
You won't be able to # use torch.rand/torch.randn in this case because they are @@ -54,28 +54,30 @@ def _gen_sparse(self, d, nnz, with_size): # If you do this, be sure to update assert_uncoalesced too if isinstance(with_size, Number): - with_size = [with_size] * d + with_size = [with_size] * sparse_dims if self.is_uncoalesced: # We want to generate a tensor with a lot of uncoalesced # entries to stress test whether or not we handle this # (subtle) case correctly - v_size = [nnz * 2] + list(with_size[d:]) + v_size = [nnz * 2] + list(with_size[sparse_dims:]) v = torch.randn(*v_size) - r = torch.rand(d, nnz) + r = torch.rand(sparse_dims, nnz) # Repeat the indexes, so every position shows up twice - i = torch.cat([r, r], dim=1) * \ - torch.Tensor(with_size[:d]).repeat(nnz * 2, 1).transpose(0, 1) + i = torch.cat([r, r], dim=1) + if nnz > 0: + i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz * 2, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) self.assert_uncoalesced(x) else: - # Generate a sparse tensor with d sparse dimensions; the - # rest the dimensions with_size[d:] are dense. - v_size = [nnz] + list(with_size[d:]) + # Generate a sparse tensor with sparse_dims sparse dimensions; the + # rest the dimensions with_size[sparse_dims:] are dense. + v_size = [nnz] + list(with_size[sparse_dims:]) v = torch.randn(*v_size) - i = torch.rand(d, nnz) * \ - torch.Tensor(with_size[:d]).repeat(nnz, 1).transpose(0, 1) + i = torch.rand(sparse_dims, nnz) + if nnz > 0: + i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) @@ -90,15 +92,13 @@ def assert_uncoalesced(self, x): correctness of the uncoalesced tensor generation algorithm. """ assert not x.is_coalesced() - # Strategy: construct a new sparse tensor with the raw value - # field overwritten to a tensor of ones, coalesce it, and then - # check if any value entries are > 1 (which indicates that the - # original was uncoalesced.) 
- i = x._indices().clone() - v = x._values().clone().fill_(1) - y = torch.sparse.DoubleTensor(i, v, x.size()) - z = self.safeCoalesce(y) - assert (z._values() > 1).sum() > 0 + existing_indices = set() + for i in range(x._nnz()): + index = str(x._indices()[:, i]) + if index in existing_indices: + return True + else: + existing_indices.add(index) def randn(self, *args, **kwargs): """ @@ -164,18 +164,20 @@ def test_print(self): @skipIfRocm def test_basic(self): - x, i, v = self._gen_sparse(3, 10, 100) - - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - - x, i, v = self._gen_sparse(3, 10, [100, 100, 100]) - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - self.assertEqual(x.ndimension(), 3) - self.assertEqual(self.safeCoalesce(x)._nnz(), 10) - for i in range(3): - self.assertEqual(x.size(i), 100) + def test_shape(sparse_dims, nnz, with_size): + if isinstance(with_size, Number): + with_size = [with_size] * sparse_dims + x, i, v = self._gen_sparse(sparse_dims, nnz, with_size) + self.assertEqual(i, x._indices()) + self.assertEqual(v, x._values()) + self.assertEqual(x.ndimension(), len(with_size)) + self.assertEqual(self.safeCoalesce(x)._nnz(), nnz) + self.assertEqual(list(x.size()), with_size) + + test_shape(3, 10, 100) + test_shape(3, 10, [100, 100, 100]) + test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) # Make sure that coalesce handles duplicate indices correctly i = self.IndexTensor([[9, 0, 0, 0, 8, 1, 1, 1, 2, 7, 2, 2, 3, 4, 6, 9]]) @@ -213,6 +215,13 @@ def test_ctor_size_checks(self): @skipIfRocm def test_to_dense(self): + def test_tensor(x, res): + x.to_dense() # Tests triple to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -234,12 +243,17 @@ def test_to_dense(self): [0, 0, 0, 0, 0], [0, 0, 0, 0, 4]], ]) + test_tensor(x, res) - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + res = self.ValueTensor(3, 4, 5, 0) + test_tensor(x, res) @skipIfRocm def test_shared(self): @@ -251,8 +265,21 @@ def test_shared(self): i[0][0] = 0 self.assertEqual(self.ValueTensor([6, 0, 0]), self.safeToDense(x)) + i = self.IndexTensor([[2]]) + v = self.ValueTensor(1, 0) + x = self.SparseTensor(i, v, torch.Size([3, 0])) + i[0][0] = 0 + self.assertEqual(self.ValueTensor(3, 0), self.safeToDense(x)) + @skipIfRocm def test_to_dense_hybrid(self): + def test_tensor(x, res): + x.to_dense() # Tests double to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -273,15 +300,24 @@ def test_to_dense_hybrid(self): [0, 0], [4, 5]], ]) + test_tensor(x, res) - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + ]) + v = self.ValueTensor(4, 2, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 2, 0])) + res = self.ValueTensor(3, 4, 2, 0) + test_tensor(x, res) @skipIfRocm def test_contig(self): + 
def test_tensor(x, exp_i, exp_v): + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -293,9 +329,7 @@ def test_contig(self): [31, 92, 65, 50, 34, 62, 22, 56, 74, 89], ]) exp_v = self.ValueTensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7]) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + test_tensor(x, exp_i, exp_v) i = self.IndexTensor([ [2, 0, 2, 1], @@ -310,10 +344,22 @@ def test_contig(self): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([2, 1, 3, 4]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [2, 0, 2, 1], + [0, 0, 3, 0], + [1, 0, 4, 0], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + exp_i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + exp_v = self.ValueTensor(4, 0) + test_tensor(x, exp_i, exp_v) # Duplicate indices i = self.IndexTensor([ @@ -329,13 +375,30 @@ def test_contig(self): [0, 4], ]) exp_v = self.ValueTensor([6, 4]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [0, 0, 2, 0], + [0, 0, 3, 0], + [0, 0, 4, 0], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + exp_i = self.IndexTensor([ + [0, 2], + [0, 3], + [0, 4], + ]) + exp_v = self.ValueTensor(2, 0) + test_tensor(x, exp_i, exp_v) @skipIfRocm def test_contig_hybrid(self): + def test_tensor(x, exp_i, exp_v): + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -353,9 +416,7 @@ def test_contig_hybrid(self): [2, 3], [1, 2], [6, 7], [4, 5], [10, 11], [3, 4], [5, 6], [9, 10], [8, 9], [7, 8], ]) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + test_tensor(x, exp_i, exp_v) i = self.IndexTensor([ [2, 0, 2, 1], @@ -370,10 +431,22 @@ def test_contig_hybrid(self): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [2, 0, 2, 1], + [0, 0, 3, 0], + [1, 0, 4, 0], + ]) + v = self.ValueTensor(4, 3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) + exp_i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + exp_v = self.ValueTensor(4, 3, 0) + test_tensor(x, exp_i, exp_v) # Duplicate indices i = self.IndexTensor([ @@ -389,51 +462,79 @@ def test_contig_hybrid(self): [0, 4], ]) exp_v = self.ValueTensor([[6, 4, 5], [4, 3, 4]]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [0, 0, 2, 0], + [0, 0, 3, 0], + [0, 0, 4, 0], + ]) + v = self.ValueTensor(4, 3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) + exp_i = self.IndexTensor([ + [0, 2], + [0, 3], + [0, 4], + ]) + exp_v = self.ValueTensor(2, 3, 0) + test_tensor(x, exp_i, exp_v) @skipIfRocm def test_clone(self): - x, _, _ = self._gen_sparse(4, 20, 5) - if 
self.is_uncoalesced: - self.assertFalse(x.is_coalesced()) + def test_shape(sparse_dims, nnz, with_size): + x = self._gen_sparse(sparse_dims, nnz, with_size)[0] + if self.is_uncoalesced: + self.assertFalse(x.is_coalesced()) + y = x.clone() + self.assertFalse(y.is_coalesced()) + x = x.coalesce() + self.assertTrue(x.is_coalesced()) y = x.clone() - self.assertFalse(y.is_coalesced()) - x = x.coalesce() - self.assertTrue(x.is_coalesced()) - y = x.clone() - self.assertTrue(y.is_coalesced()) + self.assertTrue(y.is_coalesced()) + + test_shape(4, 20, 5) + test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) @cuda_only def test_cuda_empty(self): + def test_tensor(x): + y = x.cuda(0) + self.assertEqual(x._sparseDims(), y._sparseDims()) + self.assertEqual(x._denseDims(), y._denseDims()) + x = y.cpu() + self.assertEqual(y._sparseDims(), x._sparseDims()) + self.assertEqual(y._denseDims(), x._denseDims()) + x = torch.sparse.FloatTensor(2, 3, 4) - y = x.cuda(0) - self.assertEqual(x._sparseDims(), y._sparseDims()) - self.assertEqual(x._denseDims(), y._denseDims()) - x = y.cpu() - self.assertEqual(y._sparseDims(), x._sparseDims()) - self.assertEqual(y._denseDims(), x._denseDims()) + test_tensor(x) + + x = torch.sparse.FloatTensor(2, 3, 4, 0) + test_tensor(x) @skipIfRocm def test_transpose(self): - x = self._gen_sparse(4, 20, 5)[0] - y = self.safeToDense(x) + def test_shape(sparse_dims, nnz, with_size): + x = self._gen_sparse(sparse_dims, nnz, with_size)[0] + y = self.safeToDense(x) + + for i, j in itertools.combinations(range(4), 2): + x = x.transpose_(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) - for i, j in itertools.combinations(range(4), 2): - x = x.transpose_(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + x = x.transpose(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) - x = x.transpose(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + test_shape(4, 20, 5) + test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) @cpu_only def test_coalesce_transpose_mm(self): - def test_shape(di, dj, dk): - x, _, _ = self._gen_sparse(2, 20, [dj, di]) + def test_shape(di, dj, dk, nnz): + x, _, _ = self._gen_sparse(2, nnz, [dj, di]) y = torch.randn(dj, dk) x_coalesced = x.coalesce() @@ -446,43 +547,58 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x_coalesced_t), y) self.assertEqual(res, expected) - test_shape(10, 20, 30) + test_shape(10, 20, 30, 20) + test_shape(0, 20, 30, 0) + test_shape(10, 0, 30, 0) + test_shape(10, 20, 0, 0) + test_shape(10, 20, 0, 20) def test_t_empty(self): - x = self.SparseTensor(2, 3) - x.t_() - self.assertEqual(torch.Size([3, 2]), x.size()) - self.assertEqual(0, x._indices().numel()) - self.assertEqual(0, x._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + def test_in_place(x): + shape_original = x.shape + x.t_() + self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), x.size()) + self.assertEqual(0, x._indices().numel()) + self.assertEqual(0, x._values().numel()) + self.assertEqual(x._sparseDims(), 2) + self.assertEqual(x._denseDims(), 0) + + def test_not_in_place(x): + shape_original = x.shape + y = x.t() + self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), y.size()) + self.assertEqual(0, y._indices().numel()) + self.assertEqual(0, y._values().numel()) + self.assertEqual(x._sparseDims(), 2) + 
self.assertEqual(x._denseDims(), 0) x = self.SparseTensor(2, 3) - y = x.t() - self.assertEqual(torch.Size([3, 2]), y.size()) - self.assertEqual(0, y._indices().numel()) - self.assertEqual(0, y._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + test_in_place(x) + test_not_in_place(x) + + x = self.SparseTensor(2, 0) + test_in_place(x) + test_not_in_place(x) @skipIfRocm def test_add_zeros(self): - def test_shape(sparse_dims, sizes): - x, _, _ = self._gen_sparse(sparse_dims, 20, sizes) + def test_shape(sparse_dims, nnz, sizes): + x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) zeros = torch.zeros(sizes, layout=torch.sparse_coo).to(x.device) r1 = zeros + x r2 = x + zeros self.assertEqual(r1, x) self.assertEqual(r2, x) - test_shape(1, [1]) - test_shape(4, [3, 17, 19, 5]) - test_shape(2, [3, 17, 19, 5]) + test_shape(1, 20, [1]) + test_shape(4, 20, [3, 17, 19, 5]) + test_shape(2, 20, [3, 17, 19, 5]) + test_shape(2, 20, [3, 17, 19, 0]) @cpu_only def test_mm(self): - def test_shape(di, dj, dk): - x, _, _ = self._gen_sparse(2, 20, [di, dj]) + def test_shape(di, dj, dk, nnz): + x, _, _ = self._gen_sparse(2, nnz, [di, dj]) t = torch.randn(di, dk) y = torch.randn(dj, dk) alpha = random.random() @@ -500,15 +616,19 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(10, 100, 100) - test_shape(100, 1000, 200) - test_shape(64, 10000, 300) + test_shape(10, 100, 100, 20) + test_shape(100, 1000, 200, 20) + test_shape(64, 10000, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(10, 0, 100, 0) + test_shape(10, 100, 0, 0) + test_shape(10, 100, 0, 20) @cpu_only def test_saddmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] - t = self._gen_sparse(2, 20, [di, dk])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] + t = self._gen_sparse(2, nnz, [di, dk])[0] y = torch.randn(dj, dk) alpha = random.random() beta = random.random() @@ -525,43 +645,52 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(self.safeToDense(res), expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) @skipIfRocm def test_dsmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] y = self.randn(dj, dk) res = torch.dsmm(x, y) expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) + test_shape(1000, 100, 0, 20) @skipIfRocm def test_hsmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] y = self.randn(dj, dk) res = torch.hsmm(x, y) - # TODO: use self.safeToDense(), but this triggers - # https://github.com/pytorch/pytorch/issues/3170 - expected = torch.mm(x.to_dense(), y) + expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res.to_dense(), expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 
300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) + test_shape(1000, 100, 0, 20) - def _test_spadd_shape(self, shape_i, shape_v=None): + def _test_spadd_shape(self, nnz, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x, _, _ = self._gen_sparse(len(shape_i), 10, shape) + x, _, _ = self._gen_sparse(len(shape_i), nnz, shape) y = self.randn(*shape) r = random.random() @@ -583,7 +712,7 @@ def _test_spadd_shape(self, shape_i, shape_v=None): self.assertEqual(res, expected) - x, i, v = self._gen_sparse(len(shape_i), 10, shape) + x, i, v = self._gen_sparse(len(shape_i), nnz, shape) nnz = i.size(1) # Non contiguous sparse indices tensor @@ -606,28 +735,40 @@ def _test_spadd_shape(self, shape_i, shape_v=None): @skipIfRocm def test_spadd(self): - self._test_spadd_shape([5, 6]) - self._test_spadd_shape([10, 10, 10]) - self._test_spadd_shape([50, 30, 20]) - self._test_spadd_shape([5, 5, 5, 5, 5, 5]) + self._test_spadd_shape(10, [5, 6]) + self._test_spadd_shape(10, [10, 10, 10]) + self._test_spadd_shape(10, [50, 30, 20]) + self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5]) + self._test_spadd_shape(0, [0, 30, 20]) + self._test_spadd_shape(0, [50, 0, 20]) + self._test_spadd_shape(0, [50, 30, 0]) @skipIfRocm def test_spadd_hybrid(self): - self._test_spadd_shape([5, 6], [2, 3]) - self._test_spadd_shape([10, 10, 10], [3]) - self._test_spadd_shape([50, 30, 20], [2]) - self._test_spadd_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_spadd_shape(10, [5, 6], [2, 3]) + self._test_spadd_shape(10, [10, 10, 10], [3]) + self._test_spadd_shape(10, [50, 30, 20], [2]) + self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5], [2]) + self._test_spadd_shape(0, [0, 30, 20], [2, 0]) + self._test_spadd_shape(0, [50, 0, 20], [2, 0]) + self._test_spadd_shape(0, [50, 30, 0], [2, 0]) + self._test_spadd_shape(10, [50, 30, 20], [2, 0]) @skipIfRocm def test_norm(self): - x, _, _ = self._gen_sparse(3, 10, 100) - y = x.coalesce() - self.assertEqual(x.norm(), y._values().norm()) + def test_shape(sparse_dims, nnz, with_size): + x, _, _ = self._gen_sparse(sparse_dims, nnz, with_size) + y = x.coalesce() + self.assertEqual(x.norm(), y._values().norm()) + + test_shape(3, 10, 100) + test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) - def _test_basic_ops_shape(self, shape_i, shape_v=None): + def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) - x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) + x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) + x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) y1 = x1 + x2 y2 = x1.clone() @@ -689,30 +830,49 @@ def _test_basic_ops_shape(self, shape_i, shape_v=None): @skipIfRocm def test_basic_ops(self): - self._test_basic_ops_shape([5, 6]) - self._test_basic_ops_shape([10, 10, 10]) - self._test_basic_ops_shape([50, 30, 20]) - self._test_basic_ops_shape([5, 5, 5, 5, 5, 5]) + self._test_basic_ops_shape(9, 12, [5, 6]) + self._test_basic_ops_shape(9, 12, [10, 10, 10]) + self._test_basic_ops_shape(9, 12, [50, 30, 20]) + self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5]) + self._test_basic_ops_shape(0, 12, [10, 10, 10]) + self._test_basic_ops_shape(9, 0, [10, 10, 10]) + self._test_basic_ops_shape(0, 0, [10, 10, 10]) + self._test_basic_ops_shape(0, 0, [10, 10, 0]) @skipIfRocm def test_basic_ops_hybrid(self): - 
self._test_basic_ops_shape([5, 6], [2, 3]) - self._test_basic_ops_shape([10, 10, 10], [3]) - self._test_basic_ops_shape([50, 30, 20], [2]) - self._test_basic_ops_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_basic_ops_shape(9, 12, [5, 6], [2, 3]) + self._test_basic_ops_shape(9, 12, [10, 10, 10], [3]) + self._test_basic_ops_shape(9, 12, [50, 30, 20], [2]) + self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) + self._test_basic_ops_shape(0, 12, [10, 10, 10], [2]) + self._test_basic_ops_shape(9, 0, [10, 10, 10], [2]) + self._test_basic_ops_shape(0, 0, [10, 10, 10], [2]) + self._test_basic_ops_shape(9, 12, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 12, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(9, 0, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 0, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 0, [10, 10, 0], [2, 0]) @skipIfRocm def test_add_dense_sparse_mismatch(self): - x = torch.zeros([3, 4], dtype=self.value_dtype, device=self.device) - sparse_y = self.SparseTensor(torch.zeros(1, 4, dtype=torch.int64, device=self.device), - torch.randn(4, 4, 4, dtype=self.value_dtype, device=self.device), - torch.Size([3, 4, 4])) - self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) - - def _test_sparse_mask_shape(self, shape_i, shape_v=None): + def test_shape(dense_size, sparse_dims_shape, dense_dims_shape, sparse_size): + x = torch.zeros(dense_size, dtype=self.value_dtype, device=self.device) + sparse_y = self.SparseTensor(torch.zeros(sparse_dims_shape, dtype=torch.int64, device=self.device), + torch.randn(dense_dims_shape, dtype=self.value_dtype, device=self.device), + torch.Size(sparse_size)) + with self.assertRaisesRegex( + RuntimeError, + "add: expected 'self' and 'other' to have same size"): + x + sparse_y + + test_shape([3, 4], [1, 4], [4, 4, 4], [3, 4, 4]) + test_shape([3, 4, 0], [1, 4], [4, 4, 4, 0], [3, 4, 4, 0]) + + def _test_sparse_mask_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) - x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) + x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) + x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) y1 = x1 + x2 y2 = x1.clone() @@ -740,87 +900,30 @@ def _test_sparse_mask_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4])) self.assertEqual(res, expected) + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([5, 4, 0])).coalesce() + dense = self.ValueTensor(5, 4, 0) + exp_v = self.ValueTensor(4, 0) + res = dense.sparse_mask(x) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 0])) + self.assertEqual(res, expected) + @skipIfRocm def test_sparse_mask(self): self._test_sparse_mask_fixed() - self._test_sparse_mask_shape([5, 6]) - self._test_sparse_mask_shape([10, 10, 10]) - self._test_sparse_mask_shape([50, 30, 20]) - self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5]) - - def _test_zeros(self, shape, out_shape_i, out_shape_v=None): - out_shape = out_shape_i + (out_shape_v or []) - for nnz in [9, 12]: - out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) - torch.zeros(*shape, out=out) - self.assertEqual(tuple(out.size()), tuple(shape)) - self.assertTrue(out._indices().numel() == out._values().numel() == 0) - self.assertEqual(out._nnz(), 0) - self.assertEqual(out._sparseDims(), len(shape)) - self.assertEqual(out._denseDims(), 0) - - @skipIfRocm - def test_log1p(self): - if self.is_cuda: - input 
= torch.cuda.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0).cuda(), - torch.FloatTensor([3, 4, 5]).cuda(), - torch.Size([3])) - else: - input = torch.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([3, 4, 5]), - torch.Size([3])) - - expected_output = torch.tensor([3., 4., 5.]).log1p_() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) - - # test in-place op on uncoalesced input - self.assertExpectedRaises(RuntimeError, lambda: input.log1p_(), subname="uncoalesced") - - input.requires_grad_() - self.assertTrue(input.requires_grad) - - # test autograd - x = input.clone() - y = input.log1p() - self.assertExpectedRaises(RuntimeError, lambda: y.backward(x), subname="backward") - - # test uncoalesced input - input_uncoalesced = torch.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3])) - self.assertEqual(expected_output, input_uncoalesced.log1p().to_dense()) - self.assertEqual(expected_output, input_uncoalesced.coalesce().log1p_().to_dense()) - - def test_zeros(self): - i_shapes = [2, 3, 4] - v_shapes = [3, 4, 5, 6] - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros([2, 3, 4], i_shapes[:i_dim], v_shapes[:v_dim]) - - def _test_zeros_like(self, template_shape_i, template_shape_v=None): - template_shape_v = template_shape_v or [] - template_shape = template_shape_i + template_shape_v - for nnz in [9, 12]: - t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) - res = torch.zeros_like(t) - self.assertEqual(tuple(res.size()), tuple(template_shape)) - self.assertTrue(res._indices().numel() == res._values().numel() == 0) - self.assertEqual(res._nnz(), 0) - self.assertEqual(res._sparseDims(), len(template_shape_i)) - self.assertEqual(res._denseDims(), len(template_shape_v)) - - def test_zeros_like(self): - i_shapes = [2, 3, 4] - v_shapes = [3, 4, 5, 6] - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros_like(i_shapes[:i_dim], v_shapes[:v_dim]) + self._test_sparse_mask_shape(9, 12, [5, 6]) + self._test_sparse_mask_shape(9, 12, [10, 10, 10]) + self._test_sparse_mask_shape(9, 12, [50, 30, 20]) + self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10]) + self._test_sparse_mask_shape(0, 0, [10, 10, 0]) def _test_sparse_mask_hybrid_fixed(self): i = self.IndexTensor([ @@ -844,110 +947,130 @@ def _test_sparse_mask_hybrid_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) self.assertEqual(res, expected) - @skipIfRocm - def test_sparse_variable_methods(self): - # TODO: delete when tensor/variable are merged - from torch.autograd import Variable - i = self.IndexTensor([[0, 1, 1], [2, 0, 2]]) - v = self.ValueTensor([3, 4, 5]) - sparse_mat = self.SparseTensor(i, v, torch.Size([2, 3])) - sparse_var = Variable(sparse_mat) - - to_test_one_arg = { - 'zeros_like': lambda x: torch.zeros_like(x), - 'transpose': lambda x: x.transpose(0, 1), - 'transpose_': lambda x: x.transpose_(0, 1), - 't': lambda x: x.t(), - 't_': lambda x: x.t_(), - 'div': lambda x: x.div(2), - 'div_': lambda x: x.div_(2), - 'pow': lambda x: x.pow(2), - '_nnz': lambda x: x._nnz(), - 'is_coalesced': lambda x: 
x.is_coalesced(), - 'coalesce': lambda x: x.coalesce(), - 'to_dense': lambda x: x.to_dense(), - '_sparseDims': lambda x: x._sparseDims(), - '_denseDims': lambda x: x._denseDims(), - 'norm': lambda x: x.norm(), - 'log1p': lambda x: x.log1p(), - } - - for test_name, test_fn in to_test_one_arg.items(): - var1 = sparse_var.clone() - tensor1 = sparse_mat.clone() - - out_var = test_fn(var1) - out_tensor = test_fn(tensor1) - - if isinstance(out_tensor, int) or isinstance(out_tensor, bool): - if not isinstance(out_var, int) and not isinstance(out_var, bool): - check_var = out_var.data[0] - else: - check_var = out_var - self.assertEqual(out_var, out_tensor) - continue - - # Assume output is variable / tensor - self.assertEqual(test_fn(var1).data, test_fn(tensor1), - test_name) - - i = self.IndexTensor([[0, 0, 1], [1, 2, 1]]) - v = self.ValueTensor([3, 3, 4]) - sparse_mat2 = self.SparseTensor(i, v, torch.Size([2, 3])) - sparse_var2 = Variable(sparse_mat2) - - to_test_two_arg = { - 'sub': lambda x, y: x.sub(y), - 'sub_': lambda x, y: x.sub_(y), - 'mul': lambda x, y: x.mul(y), - 'mul_': lambda x, y: x.mul_(y), - } - - for test_name, test_fn in to_test_two_arg.items(): - var1 = sparse_var.clone() - var2 = sparse_var2.clone() - tensor1 = sparse_mat.clone() - tensor2 = sparse_mat2.clone() - self.assertEqual(test_fn(var1, var2).data, - test_fn(tensor1, tensor2), test_name) - - to_test_mixed = [ - # test name, lambda expression, should_run_when_cuda - ('sspaddmm', lambda sp, de: sp.sspaddmm(sp, de), False), - ('sspaddmm_b', lambda sp, de: sp.sspaddmm(2, sp, de), False), - ('sspaddmm_b_a', lambda sp, de: sp.sspaddmm(3, 2, sp, de), False), - ('addmm', lambda sp, de: de.addmm(sp, de), True), - # TODO: This looks like a typo - ('addmm_', lambda sp, de: de.addmm(sp, de), True), - ('mm', lambda sp, de: torch.mm(sp, de), True), - ('mm_out', lambda sp, de: torch.mm(sp, de, out=de), True), - ] - - i = self.IndexTensor([[0, 0, 1, 2, 2], [1, 2, 1, 0, 1]]) - v = self.ValueTensor([3, 3, 4, 1, 2]) - sparse_mat = self.SparseTensor(i, v, torch.Size([3, 3])) - sparse_var = Variable(sparse_mat) - dense_mat = sparse_mat.to_dense().random_(0, 5) - dense_var = Variable(dense_mat) - - for test_name, test_fn, test_cuda in to_test_mixed: - if sparse_var.is_cuda and not test_cuda: - continue - sp_var = sparse_var.clone() - de_var = dense_var.clone() - sp_mat = sparse_mat.clone() - de_mat = dense_mat.clone() - self.assertEqual(test_fn(sp_var, de_var).data, - test_fn(sp_mat, de_mat), test_name) + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor(4, 2, 0) + x = self.SparseTensor(i, v, torch.Size([5, 4, 2, 0])).coalesce() + dense = self.ValueTensor(5, 4, 2, 0) + res = dense.sparse_mask(x) + exp_v = self.ValueTensor(4, 2, 0) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2, 0])) + self.assertEqual(res, expected) @skipIfRocm def test_sparse_mask_hybrid(self): self._test_sparse_mask_hybrid_fixed() - self._test_sparse_mask_shape([5, 6], [2, 3]) - self._test_sparse_mask_shape([10, 10, 10], [3]) - self._test_sparse_mask_shape([50, 30, 20], [2]) - self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_sparse_mask_shape(9, 12, [5, 6], [2, 3]) + self._test_sparse_mask_shape(9, 12, [10, 10, 10], [3]) + self._test_sparse_mask_shape(9, 12, [50, 30, 20], [2]) + self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2]) + 
self._test_sparse_mask_shape(9, 12, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 0, [10, 10, 0], [2, 0]) + + def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): + out_shape = out_shape_i + (out_shape_v or []) + for nnz in nnzs: + out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) + torch.zeros(*shape, out=out) + self.assertEqual(tuple(out.size()), tuple(shape)) + self.assertTrue(out._indices().numel() == out._values().numel() == 0) + self.assertEqual(out._nnz(), 0) + self.assertEqual(out._sparseDims(), len(shape)) + self.assertEqual(out._denseDims(), 0) + + def test_zeros(self): + def test_shape(i_shapes, v_shapes, shape, nnzs): + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros(nnzs, shape, i_shapes[:i_dim], v_shapes[:v_dim]) + test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 4], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 4], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 4], [9, 12]) + test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 0], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 0], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 0], [9, 12]) + + def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): + template_shape_v = template_shape_v or [] + template_shape = template_shape_i + template_shape_v + for nnz in nnzs: + t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) + res = torch.zeros_like(t) + self.assertEqual(tuple(res.size()), tuple(template_shape)) + self.assertTrue(res._indices().numel() == res._values().numel() == 0) + self.assertEqual(res._nnz(), 0) + self.assertEqual(res._sparseDims(), len(template_shape_i)) + self.assertEqual(res._denseDims(), len(template_shape_v)) + + def test_zeros_like(self): + def test_shape(i_shapes, v_shapes, nnzs): + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros_like(nnzs, i_shapes[:i_dim], v_shapes[:v_dim]) + test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + + def _test_log1p_tensor(self, input, dense_tensor): + expected_output = torch.tensor(dense_tensor).log1p_() + self.assertEqual(expected_output, input.log1p().to_dense()) + self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) + + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): + input.log1p_() + + input.requires_grad_() + self.assertTrue(input.requires_grad) + + # test autograd + x = input.clone() + y = input.log1p() + with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): + y.backward(x) + + @skipIfRocm + def test_log1p(self): + input = torch.sparse_coo_tensor( + torch.LongTensor([[0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([3, 4, 5]), + torch.Size([3]), + device=self.device) + self._test_log1p_tensor(input, [3., 4., 5.]) + + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([2, 3, 4, 1, 1, 1]), + torch.Size([3]), + 
device=self.device) + self._test_log1p_tensor(input_uncoalesced, [3., 4., 5.]) + + input = torch.sparse_coo_tensor( + torch.zeros([2, 0]), + torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), + device=self.device) + self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) + + input = torch.sparse_coo_tensor( + torch.zeros([1, 5]), + torch.zeros([5, 6, 0]), + torch.Size([5, 6, 0]), + device=self.device) + self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) @skipIfRocm def test_sparse_add_coalesce(self): @@ -959,30 +1082,55 @@ def test_sparse_add_coalesce(self): self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) + i = self.IndexTensor([[1, 2, 1]]) + v = self.ValueTensor(3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 0])) + y = self.SparseTensor(i, v, torch.Size([3, 0])) + z = x + y + + self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) + @cuda_only def test_storage_not_null(self): x = torch.cuda.sparse.FloatTensor(2) self.assertNotEqual(x.get_device(), -1) + x = torch.cuda.sparse.FloatTensor(2, 0) + self.assertNotEqual(x.get_device(), -1) + @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @skipIfRocm def test_same_gpu(self): + def check_device(x, device_id): + self.assertEqual(x.get_device(), device_id) + self.assertEqual(x._values().get_device(), device_id) + self.assertEqual(x._indices().get_device(), device_id) + i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(1) x = self.SparseTensor(i, v, torch.Size([3]), device=1) - self.assertEqual(x.get_device(), 1) - self.assertEqual(x._values().get_device(), 1) - self.assertEqual(x._indices().get_device(), 1) + check_device(x, 1) + + i = self.IndexTensor([[2]]).cuda(1) + v = self.ValueTensor(1, 0).cuda(1) + x = self.SparseTensor(i, v, torch.Size([3, 0]), device=1) + check_device(x, 1) x = self.SparseTensor(3, device=1) - self.assertEqual(x.get_device(), 1) - self.assertEqual(x._values().get_device(), 1) - self.assertEqual(x._indices().get_device(), 1) + check_device(x, 1) + + x = self.SparseTensor(3, 0, device=1) + check_device(x, 1) + i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(0) self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3]))) + i = self.IndexTensor([[2]]).cuda(1) + v = self.ValueTensor(1, 0).cuda(0) + self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3, 0]))) + def _test_new_device(self, size, device): with torch.cuda.device(device): x = torch.cuda.sparse.DoubleTensor(*size) @@ -997,6 +1145,7 @@ def test_new_device_single_gpu(self): self._test_new_device((), 0) self._test_new_device((30, 20), 0) self._test_new_device((30, 20, 10), 0) + self._test_new_device((30, 20, 10, 0), 0) @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @@ -1004,49 +1153,67 @@ def test_new_device_multi_gpu(self): self._test_new_device((), 1) self._test_new_device((30, 20), 1) self._test_new_device((30, 20, 10), 1) + self._test_new_device((30, 20, 10, 0), 1) @skipIfRocm def test_new(self): - x, indices, values = self._gen_sparse(3, 10, 100) - if not x.is_cuda: - # CUDA sparse tensors currently requires the size to be - # specified if nDimV > 0 - self.assertEqual(x.new(indices, values), x) - self.assertEqual(x.new(indices, values, x.size()), x) + def test_shape(sparse_dims, nnz, with_size): + x, indices, values = self._gen_sparse(sparse_dims, nnz, with_size) + if not x.is_cuda: + # CUDA sparse tensors currently requires the 
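(Aside, not part of the patch: what the `test_sparse_add_coalesce` assertions above are getting at, sketched with concrete values.)

```
import torch

# Adding two sparse tensors may simply concatenate their entries, so the result
# can be uncoalesced; the test only requires that such a result is not falsely
# flagged as coalesced.
i = torch.tensor([[1, 2, 1]])
v = torch.tensor([3., 4., 5.])
x = torch.sparse_coo_tensor(i, v, (3,))
y = torch.sparse_coo_tensor(i, v, (3,))

z = x + y
print(z.is_coalesced())               # may be False: entries are concatenated
print(z.coalesce().to_dense())        # tensor([ 0., 16.,  8.])
```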
size to be + # specified if nDimV > 0 + self.assertEqual(x.new(indices, values), x) + self.assertEqual(x.new(indices, values, x.size()), x) + + test_shape(3, 10, 100) + test_shape(3, 0, [100, 100, 0]) @cpu_only # not really, but we only really want to run this once @skipIfRocm def test_factory(self): - default_size = torch.Size([1, 3]) - size = torch.Size([3, 3]) - for include_size in [True, False]: - for use_tensor_idx in [True, False]: - for use_tensor_val in [True, False]: - for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): - # have to include size with cuda sparse tensors - include_size = include_size or use_cuda - dtype = torch.float64 - long_dtype = torch.int64 - device = torch.device('cpu') if not use_cuda else torch.device(torch.cuda.device_count() - 1) - indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) - values = torch.tensor([1.], dtype=dtype) if use_tensor_val else 1. - if include_size: - sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, - device=device, requires_grad=True) - else: - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, - device=device, requires_grad=True) - self.assertEqual(indices, sparse_tensor._indices()) - self.assertEqual(values, sparse_tensor._values()) - self.assertEqual(size if include_size else default_size, sparse_tensor.size()) - self.assertEqual(dtype, sparse_tensor.dtype) - if use_cuda: - self.assertEqual(device, sparse_tensor._values().device) - self.assertEqual(True, sparse_tensor.requires_grad) + for test_empty_tensor in [True, False]: + if test_empty_tensor: + default_size = torch.Size([1, 3, 0]) + size = torch.Size([3, 3, 0]) + else: + default_size = torch.Size([1, 3]) + size = torch.Size([3, 3]) + for include_size in [True, False]: + for use_tensor_idx in [True, False]: + for use_tensor_val in [True, False]: + for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): + # have to include size with cuda sparse tensors + include_size = include_size or use_cuda + dtype = torch.float64 + long_dtype = torch.int64 + device = torch.device('cpu') if not use_cuda else \ + torch.device(torch.cuda.device_count() - 1) + indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) + if test_empty_tensor: + values = self.ValueTensor(1, 0) + else: + if use_tensor_val: + values = torch.tensor([1.], dtype=dtype) + else: + values = 1. 
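(Aside, not part of the patch: where `default_size` in `test_factory` comes from. When no size is passed, the factory infers each sparse dimension as max(index) + 1 and appends the trailing dimensions of `values`.)

```
import torch

indices = torch.tensor([[0], [2]])                     # one entry at (0, 2)

t = torch.sparse_coo_tensor(indices, torch.tensor([1.]))
print(t.size())                                        # torch.Size([1, 3])

t = torch.sparse_coo_tensor(indices, torch.empty(1, 0))
print(t.size())                                        # torch.Size([1, 3, 0]), empty dense dim
```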
+ if include_size: + sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, + device=device, requires_grad=True) + else: + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, + device=device, requires_grad=True) + self.assertEqual(indices, sparse_tensor._indices()) + self.assertEqual(values, sparse_tensor._values()) + self.assertEqual(size if include_size else default_size, sparse_tensor.size()) + self.assertEqual(dtype, sparse_tensor.dtype) + if use_cuda: + self.assertEqual(device, sparse_tensor._values().device) + self.assertEqual(True, sparse_tensor.requires_grad) @skipIfRocm def test_factory_size_check(self): - indices = self.IndexTensor([[1, 2], [0, 2]]) + indices = self.IndexTensor([[1, 2], + [0, 2]]) values = self.ValueTensor([.5, .5]) sizes = torch.Size([2, 3]) with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): @@ -1056,12 +1223,34 @@ def test_factory_size_check(self): with self.assertRaisesRegex(RuntimeError, "found negative index"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[1, 2], [0, 2]]) + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 1, 0) + sizes = torch.Size([2, 3, 1, 0]) + with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + torch.sparse_coo_tensor(indices, values, sizes) + + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 2, 2) + sizes = torch.Size([0, 0, 2, 2]) + with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + torch.sparse_coo_tensor(indices, values, sizes) + + indices = self.IndexTensor([[1, 2], + [0, 2]]) values = self.ValueTensor([[1, 1, 1], [1, 1, 1]]) sizes = torch.Size([3, 3, 2]) with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 1, 0) + sizes = torch.Size([3, 3, 2, 0]) + with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): + torch.sparse_coo_tensor(indices, values, sizes) + def test_factory_default(self): tensor = self.SparseTensor() expected_indices = self.IndexTensor(1, 0) @@ -1095,25 +1284,31 @@ def test_factory_nnz(self): with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): torch.sparse_coo_tensor(indices, values, sizes) - def _test_factory_tensor_shape(self, i_shape, v_shape, size, expected_size): - device = 'cuda' if self.is_cuda else 'cpu' - if size: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) - else: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) - expected_indices = torch.empty(i_shape, device=device) - expected_values = torch.empty(v_shape, device=device) - expected_size = torch.Size(expected_size) - self.assertEqual(t._indices(), expected_indices) - self.assertEqual(t._values(), expected_values) - self.assertEqual(t.size(), expected_size) + indices = self.IndexTensor([[0]]) # (sparseDims, nnz): (1, 1) + values = self.ValueTensor(2, 0) # (nnz, ...): (2, 0) + sizes = torch.Size([2, 0]) + with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): + torch.sparse_coo_tensor(indices, values, sizes) def test_factory_nnz_zero(self): - self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) - 
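(Aside, not part of the patch: the two error paths `test_factory_size_check` exercises above, in plain form.)

```
import torch

indices = torch.tensor([[1, 2], [0, 2]])

# size too small for the given indices
try:
    torch.sparse_coo_tensor(indices, torch.tensor([.5, .5]), (2, 3))
except RuntimeError as e:
    print(e)        # "sizes is inconsistent with indices": dim 0 has index 2 but size 2

# trailing (dense) shape of values does not match the requested size
try:
    torch.sparse_coo_tensor(indices, torch.ones(2, 3), (3, 3, 2))
except RuntimeError as e:
    print(e)        # "values has incorrect size"
```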
self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) + def test_shape(i_shape, v_shape, size, expected_size): + device = 'cuda' if self.is_cuda else 'cpu' + if size: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) + else: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) + expected_indices = torch.empty(i_shape, device=device) + expected_values = torch.empty(v_shape, device=device) + expected_size = torch.Size(expected_size) + self.assertEqual(t._indices(), expected_indices) + self.assertEqual(t._values(), expected_values) + self.assertEqual(t.size(), expected_size) + + test_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) + test_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) @skipIfRocm def test_factory_dense_dims(self): @@ -1123,6 +1318,12 @@ def test_factory_dense_dims(self): with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) + indices = self.IndexTensor([[0]]) + values = self.ValueTensor(1, 2, 3, 0) + sizes = torch.Size([1, 3, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): + torch.sparse_coo_tensor(indices, values, sizes) + @cpu_only def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) @@ -1132,6 +1333,13 @@ def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1])) self.assertEqual(torch.int64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.FloatTensor(1, 0)) + self.assertEqual(torch.float32, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.DoubleTensor(1, 0)) + self.assertEqual(torch.float64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.LongTensor(1, 0)) + self.assertEqual(torch.int64, t.dtype) + @cuda_only @skipIfRocm def test_factory_device_type_inference(self): @@ -1140,41 +1348,66 @@ def test_factory_device_type_inference(self): for indices_device in ['cuda', 'cpu']: for values_device in ['cuda', 'cpu']: for sparse_device in ['cuda', 'cpu', None]: - t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), - torch.tensor([1.], device=values_device), - (1, 3), device=sparse_device) - should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') - self.assertEqual(should_be_cuda, t.is_cuda) + for test_empty_tensor in [True, False]: + if test_empty_tensor: + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + self.ValueTensor(1, 0).to(values_device), + (1, 3, 0), device=sparse_device) + else: + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + torch.tensor([1.], device=values_device), + (1, 3), device=sparse_device) + should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') + self.assertEqual(should_be_cuda, t.is_cuda) @cpu_only def test_factory_copy(self): + def test_tensor(indices, values, indices_equal, 
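(Aside, not part of the patch: `test_factory_type_inference` above boils down to the dtype being taken from `values` when it is not passed explicitly, including for empty value tensors.)

```
import torch

i = torch.tensor([[0], [2]])
print(torch.sparse_coo_tensor(i, torch.tensor([1.], dtype=torch.float32)).dtype)  # torch.float32
print(torch.sparse_coo_tensor(i, torch.tensor([1])).dtype)                        # torch.int64
print(torch.sparse_coo_tensor(i, torch.empty(1, 0, dtype=torch.float64)).dtype)   # torch.float64
```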
values_equal): + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + if indices_equal: + self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + else: + self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + if values_equal: + self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + else: + self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + # both correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float64) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, True, True) + + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.DoubleTensor(1, 0) + test_tensor(indices, values, True, True) # only indices correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float32) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, True, False) + + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.FloatTensor(1, 0) + test_tensor(indices, values, True, True) # An empty tensor's data_ptr is always equal to 0 # only values correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float64) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, False, True) + + indices = torch.tensor(([0], [2]), dtype=torch.int32) + values = torch.DoubleTensor(1, 0) + test_tensor(indices, values, False, True) # neither correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float32) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, False, False) + + indices = torch.tensor(([0], [2]), dtype=torch.int32) + values = torch.FloatTensor(1, 0) + test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 @cpu_only # just run once, we test both cpu and cuda def test_constructor_device_legacy(self): @@ -1225,9 +1458,15 @@ def test_is_sparse(self): x = torch.randn(3, 3) self.assertFalse(x.is_sparse) + x = torch.randn(3, 3, 0) + self.assertFalse(x.is_sparse) + x = self.SparseTensor() self.assertTrue(x.is_sparse) + x = self.SparseTensor(1, 0) + self.assertTrue(x.is_sparse) + @skipIfRocm def test_resize_as(self): def do_test(t): @@ -1238,6 +1477,8 @@ def do_test(t): self.assertEqual(t, t + y) do_test(self.SparseTensor()) + do_test(self.SparseTensor(3, 0)) + do_test(self.SparseTensor(3, 3)) @skipIfRocm def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @@ -1264,10 +1505,13 @@ def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @skipIfRocm def test_resize(self): - # 1. 
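(Aside, not part of the patch: the sharing behavior `test_factory_copy` asserts on CPU, sketched directly. Whether storage is reused depends on the input dtypes already matching what the factory was asked for.)

```
import torch

indices = torch.tensor(([0], [2]), dtype=torch.int64)
values = torch.tensor([1.], dtype=torch.float64)

t = torch.sparse_coo_tensor(indices, values, dtype=torch.float64)
print(indices.data_ptr() == t._indices().data_ptr())   # True: int64 indices are reused
print(values.data_ptr() == t._values().data_ptr())     # True: float64 values are reused

values32 = torch.tensor([1.], dtype=torch.float32)
t = torch.sparse_coo_tensor(indices, values32, dtype=torch.float64)
print(values32.data_ptr() == t._values().data_ptr())   # False: converted, hence copied
```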
Increase the size of some dense dimensions [Supported] + # 1. Expand the size of some dense dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 4], [2, 2, 4]) + self._test_resize_shape([1, 1], [1, 2, 0], [2, 2, 0], + [1, 1], [1, 2, 4], [2, 2, 4]) + # 2. Expand the size of some sparse dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3], [4, 2, 3]) @@ -1276,11 +1520,18 @@ def test_resize(self): self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], [2, 0], [0, 2, 4, 5], [1, 1, 2, 4, 5]) + self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], + [2, 0], [0, 2, 4, 0], [1, 1, 2, 4, 0]) + # 4. Add dims to dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3, 4], [2, 2, 3, 4]) + with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): + self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], + [1, 1], [1, 2, 3, 0], [2, 2, 3, 0]) + # 5. Remove dims from dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], @@ -1301,6 +1552,10 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 2], [2, 2, 2]) + with self.assertRaisesRegex(RuntimeError, "shrinking the size of dense dimensions"): + self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], + [1, 1], [1, 2, 0], [2, 2, 0]) + def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) @@ -1308,6 +1563,8 @@ def test_is_nonzero(self): self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (0., 0.), (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (-1., 1.), (1,)).is_nonzero()) self.assertTrue(torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, []).is_nonzero()) # scalar sparse tensor + with self.assertRaisesRegex(RuntimeError, "bool value of Tensor with no values is ambiguous"): + torch.sparse_coo_tensor(([0, 1],), self.ValueTensor(2, 0), (4, 0)).is_nonzero() class TestUncoalescedSparse(TestSparse): @@ -1338,11 +1595,26 @@ class TestSparseOneOff(TestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm def test_cuda_from_cpu(self): - self.assertExpectedRaises( - RuntimeError, - lambda: torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4), - [3, 4, 4])) + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4), + [3, 4, 4]) + + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4, 0), + [3, 4, 4, 0]) + + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), + torch.randn(0, 4, 4, 0), + [0, 4, 4, 0]) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm @@ -1351,7 +1623,22 @@ def test_cuda_sparse_cpu_dense_add(self): sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4).cuda(), [3, 4, 4]) - 
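(Aside, not part of the patch: the `test_is_nonzero` assertions above amount to the following, consistent with duplicate entries being summed before the check.)

```
import torch

print(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero())            # True
print(torch.sparse_coo_tensor(([0, 0],), (-1., 1.), (1,)).is_nonzero())  # False: entries cancel
```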
self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y + + x = torch.zeros(3, 4, 4, 0) + sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4, 0).cuda(), + [3, 4, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y + + x = torch.zeros(0, 4, 4, 0) + sparse_y = torch.cuda.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), + torch.randn(0, 4, 4, 0).cuda(), + [0, 4, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y if __name__ == '__main__': From 6073f3073e71d5e05bbe537a32b363917dcf5dfc Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 18 Sep 2018 10:15:51 -0700 Subject: [PATCH 223/237] Document torch::nn::init (#11778) Summary: Doc fixes and documentation for `torch::nn::init`. ebetica soumith ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11778 Differential Revision: D9886648 Pulled By: goldsborough fbshipit-source-id: 22eb78add1dc32b92cc32253683ab3d746505a64 --- torch/csrc/api/include/torch/nn/init.h | 50 ++++++++++++++++++- torch/csrc/api/include/torch/nn/module.h | 8 +-- torch/csrc/api/include/torch/nn/modules/any.h | 6 +++ torch/csrc/api/src/nn/init.cpp | 6 +-- 4 files changed, 62 insertions(+), 8 deletions(-) diff --git a/torch/csrc/api/include/torch/nn/init.h b/torch/csrc/api/include/torch/nn/init.h index 9054e8446f3e0c..0fd68df5bbd0c8 100644 --- a/torch/csrc/api/include/torch/nn/init.h +++ b/torch/csrc/api/include/torch/nn/init.h @@ -6,16 +6,64 @@ namespace torch { namespace nn { namespace init { +/// Fills the given `tensor` with the provided `value` in-place, and returns it. +/// No gradient will be recorded for this operation. Tensor constant_(Tensor tensor, Scalar value); + +/// Fills the given `tensor` with the Dirac delta function in-place, and returns +/// it. No gradient will be recorded for this operation. Tensor dirac_(Tensor tensor); -Tensor eye_(Tensor tensor); + +/// Fills the given 2-dimensional `matrix` with an identity matrix. +/// No gradient will be recorded for this operation. +Tensor eye_(Tensor matrix); + +/// Fills the given 2-dimensional `matrix` with values drawn from a normal +/// distribution parameterized by `mean` and `std`. +/// No gradient will be recorded for this operation. Tensor normal_(Tensor tensor, double mean = 0, double std = 1); + +/// Fills the given `tensor` with ones. +/// No gradient will be recorded for this operation. Tensor ones_(Tensor tensor); + +/// Fills the input `Tensor` with a (semi) orthogonal matrix, as described in +/// "Exact solutions to the nonlinear dynamics of learning in deep linear neural +/// networks" - Saxe, A. et al. (2013). The input tensor must have at least 2 +/// dimensions, and for tensors with more than 2 dimensions the trailing +/// dimensions are flattened. +/// No gradient will be recorded for this operation. Tensor orthogonal_(Tensor tensor, double gain = 1.0); + +/// Fills the 2D input `Tensor` as a sparse matrix, where the +/// non-zero elements will be drawn from a centered normal distribution +/// with the given standard deviation `std`, as described in "Deep learning via +/// Hessian-free optimization" - Martens, J. (2010). 
The `sparsity` is a real +/// value between 0 and 1 that controls the fraction of elements in each column +/// to be set to zero. +/// No gradient will be recorded for this operation. Tensor sparse_(Tensor tensor, double sparsity, double std = 0.01); + +/// Fills the given 2-dimensional `matrix` with values drawn from a uniform +/// distribution parameterized by `low` and `high`. +/// No gradient will be recorded for this operation. Tensor uniform_(Tensor tensor, double low = 0, double high = 1); + +/// Fills the input `Tensor` with values according to the method +/// described in "Understanding the difficulty of training deep feedforward +/// neural networks" - Glorot, X. & Bengio, Y. (2010). Values are scaled by the +/// `gain` parameter. No gradient will be recorded for this operation. Tensor xavier_normal_(Tensor tensor, double gain = 1.0); + +/// Fills the input `Tensor` with values according to the method +/// described in "Understanding the difficulty of training deep feedforward +/// neural networks" - Glorot, X. & Bengio, Y. (2010), using a uniform +/// distribution. Values are scaled by the `gain` parameter +/// No gradient will be recorded for this operation. Tensor xavier_uniform_(Tensor tensor, double gain = 1.0); + +/// Fills the given `tensor` with zeros. +/// No gradient will be recorded for this operation. Tensor zeros_(Tensor tensor); } // namespace init diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index 2cb71e3470574e..3dd730fcca5c68 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -260,7 +260,7 @@ class Module { /// methods such as `parameters()`, `clone()` or `to().` /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// weight_ = register_parameter("weight", torch::randn({A, B})); /// } @@ -277,7 +277,7 @@ class Module { /// to methods such as `buffers()`, `clone()` or `to(). /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// mean_ = register_buffer("mean", torch::empty({num_features_})); /// } @@ -290,7 +290,7 @@ class Module { /// `clone()` or `to()`. /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); /// } @@ -308,7 +308,7 @@ class Module { /// `clone()` or `to()`. /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); /// } diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 512e6d2cc1c5af..1544ce31a835f9 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -184,11 +184,13 @@ class AnyModule { bool is_empty() const noexcept; private: + /// \internal /// The static type of the object we store in the `AnyModule`, which erases /// the actual type, but allows us to call `forward()` on the underlying /// module. struct Placeholder; + /// \internal /// The dynamic type of the object stored in the `AnyModule`. It contains the /// concrete instance to which all calls are forwarded. 
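(Aside, not part of the patch: the functions documented above are the C++ counterparts of Python's `torch.nn.init`; for readers who know the Python API, the same initializers look like this.)

```
import torch
from torch import nn

w = torch.empty(3, 5)
nn.init.xavier_uniform_(w, gain=1.0)        # Glorot & Bengio (2010), uniform variant
nn.init.orthogonal_(torch.empty(4, 4))      # Saxe et al. (2013)
nn.init.sparse_(torch.empty(10, 10), sparsity=0.9, std=0.01)
nn.init.constant_(torch.empty(4), 0.5)
nn.init.eye_(torch.empty(3, 3))             # 2-D tensors only
```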
It is parameterized /// over the concrete type of the module, and the types of the arguments the @@ -289,6 +291,7 @@ class AnyModule::Value { explicit Value(autograd::Variable variable) : Value(Tensor(std::move(variable))) {} + /// \internal /// The static type of the object we store in the `Value`, which erases the /// actual object's type, allowing us only to check the `type_info` of the /// type stored in the dynamic type. @@ -299,6 +302,7 @@ class AnyModule::Value { const std::type_info& type_info; }; + /// \internal /// The dynamic type of the object we store in the `Value`, which hides the /// actual object we have erased in this `Value`. template @@ -337,6 +341,7 @@ struct AnyModule::Placeholder : public AnyModule::Value::Placeholder { template struct AnyModule::Holder : public AnyModule::Placeholder { + /// \internal struct CheckedGetter { template decay_t&& operator()(size_t index) { @@ -356,6 +361,7 @@ struct AnyModule::Holder : public AnyModule::Placeholder { std::vector& arguments_; }; + /// \internal struct InvokeForward { template Value operator()(Ts&&... ts) { diff --git a/torch/csrc/api/src/nn/init.cpp b/torch/csrc/api/src/nn/init.cpp index 1afc1858b2aa99..275e9639abf152 100644 --- a/torch/csrc/api/src/nn/init.cpp +++ b/torch/csrc/api/src/nn/init.cpp @@ -69,11 +69,11 @@ Tensor dirac_(Tensor tensor) { return tensor; } -Tensor eye_(Tensor tensor) { +Tensor eye_(Tensor matrix) { NoGradGuard guard; AT_CHECK( - tensor.ndimension() == 2, "Only tensors with 2 dimensions are supported"); - return torch::eye_out(tensor, tensor.size(0), tensor.size(1)); + matrix.ndimension() == 2, "Only tensors with 2 dimensions are supported"); + return torch::eye_out(matrix, matrix.size(0), matrix.size(1)); } Tensor normal_(Tensor tensor, double mean, double std) { From 98aebed88ea85bfb76e97cc0948a326f2a32ccba Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Tue, 18 Sep 2018 10:40:45 -0700 Subject: [PATCH 224/237] Refactor tests part 1 (#11350) Summary: Followup to [the serialized test framework](https://github.com/pytorch/pytorch/pull/10594) Round 1 for refactoring tests, starting alphabetically. I added some functionality, so I wanted to send out some of these initial changes sooner. I'm skipping all tests that don't explicitly call assertReferenceChecks. Some tests directly call np.allclose, and others are simply TestCase (rather than HypothesisTestCase). 1. Start alphabetically producing serialized outputs for test functions, annotating those we want to include with `serialized_test_util.given`. So far I've only added one test per operator, but this already does seem to add quite a few tests. 2. Add functionality to allow us to generate outputs using pytest by adding pytest argument options. This allows us to skip adding a `__main__` function to quite a few tests. 3. Catch any exceptions generating the gradient operator and skip serializing/reading it, since certain operators don't have gradients. 4. Add functionality to better handle jagged array inputs, which numpy doesn't handle very well. We simply explicitly do the conversion to dtype=object. 5. Make only one file per test function, rather than 4, to reduce the number of files in the github repo. I also noticed that there is some hypothesis handling that makes `serialized_test_util.given` not compatible with adding more hypothesis decorators on top. For example, there are tests that do ``` settings(...) given(...) def test_my_stuff(...) 
``` But there is a hypothesis handler that explicitly checks that `given` is called below `settings`, so we cannot refactor this to `serialized_test_util.given`. I've just avoided decorating these kinds of tests for now, I hope that's alright. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11350 Reviewed By: houseroad Differential Revision: D9693857 Pulled By: ajyu fbshipit-source-id: a9b4279afbe51c90cf2025c5ac6b2db2111f4af7 --- .../operator_test/activation_ops_test.py | 11 +- caffe2/python/operator_test/adadelta_test.py | 7 +- caffe2/python/operator_test/adagrad_test.py | 10 +- .../operator_test/affine_channel_op_test.py | 16 +- caffe2/python/operator_test/arg_ops_test.py | 13 +- .../operator_test/batch_box_cox_test.py | 5 +- .../operator_test/batch_bucketize_op_test.py | 7 +- .../operator_test/batch_moments_op_test.py | 15 +- .../batch_sparse_to_dense_op_test.py | 10 +- .../operator_test/bbox_transform_test.py | 6 +- .../python/operator_test/boolean_mask_test.py | 14 +- .../operator_test/boolean_unmask_test.py | 11 +- .../box_with_nms_limit_op_test.py | 14 +- caffe2/python/operator_test/ceil_op_test.py | 7 +- .../channel_backprop_stats_op_test.py | 10 +- .../operator_test/channel_shuffle_test.py | 9 +- .../operator_test/channel_stats_op_test.py | 10 +- caffe2/python/operator_test/clip_op_test.py | 5 +- .../operator_test/clip_tensor_op_test.py | 6 +- ...nd_distribute_fpn_rpn_proposals_op_test.py | 27 +-- .../operator_test/concat_split_op_test.py | 17 +- .../python/operator_test/conditional_test.py | 10 +- caffe2/python/operator_test/conftest.py | 39 ++++ caffe2/python/operator_test/conv_test.py | 21 ++- .../cosine_embedding_criterion_op_test.py | 5 +- .../ctc_beam_search_decoder_op_test.py | 6 +- .../ctc_greedy_decoder_op_test.py | 7 +- .../python/operator_test/distance_op_test.py | 13 +- .../python/operator_test/dropout_op_test.py | 5 +- .../elementwise_linear_op_test.py | 5 +- .../elementwise_logical_ops_test.py | 12 +- .../elementwise_op_broadcast_test.py | 5 +- caffe2/python/operator_test/expand_op_test.py | 5 +- .../python/operator_test/fc_operator_test.py | 5 +- .../python/operator_test/filler_ops_test.py | 18 +- caffe2/python/operator_test/find_op_test.py | 14 +- .../operator_test/flexible_top_k_test.py | 13 +- caffe2/python/operator_test/floor_op_test.py | 9 +- .../python/operator_test/gather_ops_test.py | 9 +- .../operator_test/gather_ranges_op_test.py | 9 +- caffe2/python/operator_test/glu_op_test.py | 7 +- .../operator_test/group_norm_op_test.py | 19 +- caffe2/python/operator_test/gru_test.py | 5 +- .../operator_test/hyperbolic_ops_test.py | 9 +- .../operator_test/index_hash_ops_test.py | 6 +- .../operator_test/instance_norm_test.py | 6 +- .../operator_test/integral_image_ops_test.py | 9 +- caffe2/python/operator_test/jsd_ops_test.py | 6 +- .../operator_test/layer_norm_op_test.py | 11 +- .../learning_rate_adaption_op_test.py | 9 +- .../operator_test/learning_rate_op_test.py | 5 +- .../operator_test/length_split_op_test.py | 5 +- .../operator_test/lengths_pad_op_test.py | 5 +- .../operator_test/lengths_tile_op_test.py | 5 +- .../operator_test/lengths_top_k_ops_test.py | 6 +- .../listwise_l2r_operator_test.py | 1 + .../locally_connected_op_test.py | 5 +- caffe2/python/operator_test/loss_ops_test.py | 5 +- .../margin_ranking_criterion_op_test.py | 11 +- caffe2/python/operator_test/math_ops_test.py | 5 +- caffe2/python/operator_test/matmul_op_test.py | 9 +- caffe2/python/operator_test/mean_op_test.py | 9 +- .../operator_test/merge_id_lists_op_test.py | 13 +- 
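(Aside, not part of the patch: what "conversion to dtype=object" for jagged inputs means in practice; the actual helper in `serialized_test_util` may differ in details.)

```
import numpy as np

# Rows of different lengths cannot form a rectangular ndarray, so they are
# stored as a 1-D object array whose elements are the individual rows.
rows = [[1, 2, 3], [4, 5]]
jagged = np.empty(len(rows), dtype=object)
for k, row in enumerate(rows):
    jagged[k] = np.asarray(row)
print(jagged.shape, jagged.dtype)            # (2,) object
```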
.../python/operator_test/moments_op_test.py | 7 +- .../python/operator_test/momentum_sgd_test.py | 7 +- .../operator_test/negate_gradient_op_test.py | 13 +- .../operator_test/numpy_tile_op_test.py | 5 +- .../python/operator_test/one_hot_ops_test.py | 11 +- .../python/operator_test/onnx_while_test.py | 15 +- caffe2/python/operator_test/pack_ops_test.py | 5 +- .../pack_rnn_sequence_op_test.py | 7 +- caffe2/python/operator_test/pad_test.py | 14 +- .../piecewise_linear_transform_test.py | 9 +- .../python/operator_test/weighted_sum_test.py | 2 +- caffe2/python/serialized_test/README.md | 14 +- .../activation_ops_test.test_elu.zip | Bin 0 -> 1192 bytes .../activation_ops_test.test_leaky_relu.zip | Bin 0 -> 1200 bytes .../activation_ops_test.test_relu.zip | Bin 0 -> 1166 bytes .../activation_ops_test.test_relu_n.zip | Bin 0 -> 1184 bytes .../adadelta_test.test_adadelta.zip | Bin 0 -> 1167 bytes ...adelta_test.test_sparse_adadelta_empty.zip | Bin 0 -> 1209 bytes .../adagrad_test.test_adagrad.zip | Bin 0 -> 1121 bytes ...est.test_row_wise_sparse_adagrad_empty.zip | Bin 0 -> 1161 bytes ...adagrad_test.test_sparse_adagrad_empty.zip | Bin 0 -> 1154 bytes ...channel_op_test.test_affine_channel_2d.zip | Bin 0 -> 1299 bytes .../arg_ops_test.test_argmax.zip | Bin 0 -> 1061 bytes .../arg_ops_test.test_argmin.zip | Bin 0 -> 1061 bytes .../batch_box_cox_test.test_batch_box_cox.zip | Bin 0 -> 1116 bytes ...e_op_test.test_batch_bucketize_example.zip | Bin 0 -> 1265 bytes ..._moments_op_test.test_batch_moments_2d.zip | Bin 0 -> 1246 bytes ...nse_op_test.test_batch_sparse_to_dense.zip | Bin 0 -> 1501 bytes ...box_transform_test.test_bbox_transform.zip | Bin 0 -> 1254 bytes .../boolean_mask_test.test_boolean_mask.zip | Bin 0 -> 1077 bytes ...k_test.test_sequence_mask_with_lengths.zip | Bin 0 -> 1337 bytes .../boolean_unmask_test.test.zip | Bin 0 -> 1092 bytes ...box_with_nms_limit_op_test.test_simple.zip | Bin 0 -> 1238 bytes .../operator_test/ceil_op_test.test_ceil.zip | Bin 0 -> 1029 bytes ...stats_op_test.testChannelBackpropStats.zip | Bin 0 -> 1568 bytes ...nnel_shuffle_test.test_channel_shuffle.zip | Bin 0 -> 1241 bytes ...channel_stats_op_test.testChannelStats.zip | Bin 0 -> 1275 bytes .../operator_test/clip_op_test.test_clip.zip | Bin 0 -> 1208 bytes ...or_op_test.test_clip_tensor_by_scaling.zip | Bin 0 -> 1174 bytes ...roposals_op_test.test_collect_and_dist.zip | Bin 0 -> 43168 bytes .../concat_split_op_test.test_concat.zip | Bin 0 -> 1246 bytes .../concat_split_op_test.test_split.zip | Bin 0 -> 1226 bytes ...at_split_op_test.test_split_by_lengths.zip | Bin 0 -> 1233 bytes .../conditional_test.test_conditional.zip | Bin 0 -> 3409 bytes .../operator_test/conv_test.test_1x1_conv.zip | Bin 0 -> 1632 bytes ...p_test.test_cosine_embedding_criterion.zip | Bin 0 -> 1355 bytes ...r_op_test.test_ctc_beam_search_decoder.zip | Bin 0 -> 1101 bytes ...ecoder_op_test.test_ctc_greedy_decoder.zip | Bin 0 -> 1133 bytes .../distance_op_test.test_dot_product.zip | Bin 0 -> 1199 bytes .../dropout_op_test.test_dropout_is_test.zip | Bin 0 -> 1215 bytes .../elementwise_linear_op_test.test.zip | Bin 0 -> 1280 bytes ...ise_logical_ops_test.test_is_member_of.zip | Bin 0 -> 1050 bytes ...lementwise_logical_ops_test.test_where.zip | Bin 0 -> 1072 bytes ..._op_broadcast_test.test_broadcast_powt.zip | Bin 0 -> 3427 bytes .../expand_op_test.test_expand_rand_shape.zip | Bin 0 -> 1191 bytes .../fc_operator_test.test_fc.zip | Bin 0 -> 1219 bytes ...r_ops_test.test_diagonal_fill_op_float.zip | Bin 0 -> 1057 bytes 
...iller_ops_test.test_lengths_range_fill.zip | Bin 0 -> 1054 bytes .../operator_test/find_op_test.test_find.zip | Bin 0 -> 1051 bytes ...lexible_top_k_test.test_flexible_top_k.zip | Bin 0 -> 1258 bytes .../floor_op_test.test_floor.zip | Bin 0 -> 1030 bytes .../gather_ops_test.test_batch_gather_ops.zip | Bin 0 -> 1234 bytes .../gather_ops_test.test_gather_ops.zip | Bin 0 -> 1853 bytes ...ther_ranges_op_test.test_gather_ranges.zip | Bin 0 -> 1103 bytes ...anges_op_test.test_gather_ranges_split.zip | Bin 0 -> 1106 bytes .../glu_op_test.test_glu_old.zip | Bin 0 -> 1080 bytes .../group_norm_op_test.test_group_norm_2d.zip | Bin 0 -> 1433 bytes .../gru_test.test_gru_unit_op.zip | Bin 0 -> 1628 bytes .../hyperbolic_ops_test.test_cosh.zip | Bin 0 -> 1163 bytes .../hyperbolic_ops_test.test_sinh.zip | Bin 0 -> 1162 bytes .../hyperbolic_ops_test.test_tanh.zip | Bin 0 -> 1162 bytes ...ndex_hash_ops_test.test_index_hash_ops.zip | Bin 0 -> 1075 bytes ...est.test_instance_norm_reference_check.zip | Bin 0 -> 3032 bytes ..._test.test_integral_image_gradient_ops.zip | Bin 0 -> 1747 bytes ...image_ops_test.test_integral_image_ops.zip | Bin 0 -> 1608 bytes .../jsd_ops_test.test_bernoulli_jsd.zip | Bin 0 -> 1314 bytes ...r_norm_op_test.test_layer_norm_grad_op.zip | Bin 0 -> 1128 bytes ...earning_rate_adaption_op_normalization.zip | Bin 0 -> 1107 bytes ...te_op_test.test_alter_learning_rate_op.zip | Bin 0 -> 1135 bytes ...h_split_op_test.test_length_split_edge.zip | Bin 0 -> 1093 bytes .../lengths_pad_op_test.test_lengths_pad.zip | Bin 0 -> 1114 bytes ...lengths_tile_op_test.test_lengths_tile.zip | Bin 0 -> 1207 bytes ...s_top_k_ops_test.test_lengths_top_k_op.zip | Bin 0 -> 1240 bytes .../locally_connected_op_test.test_lc_2d.zip | Bin 0 -> 1277 bytes .../loss_ops_test.test_averaged_loss.zip | Bin 0 -> 1179 bytes ..._op_test.test_margin_ranking_criterion.zip | Bin 0 -> 1398 bytes .../operator_test/math_ops_test.test_sign.zip | Bin 0 -> 1027 bytes .../matmul_op_test.test_batch_matmul.zip | Bin 0 -> 1393 bytes .../matmul_op_test.test_matmul.zip | Bin 0 -> 1396 bytes ...matmul_op_test.test_numpy_batch_matmul.zip | Bin 0 -> 1118 bytes .../operator_test/mean_op_test.test_mean.zip | Bin 0 -> 1165 bytes ...d_lists_op_test.test_merge_id_lists_op.zip | Bin 0 -> 1122 bytes .../moments_op_test.test_moments.zip | Bin 0 -> 1268 bytes .../momentum_sgd_test.test_momentum_sgd.zip | Bin 0 -> 1187 bytes ...ntum_sgd_test.test_sparse_momentum_sgd.zip | Bin 0 -> 1182 bytes .../negate_gradient_op_test.test_forward.zip | Bin 0 -> 1161 bytes .../numpy_tile_op_test.test_numpy_tile.zip | Bin 0 -> 1081 bytes ...ops_test.test_batch_bucketized_one_hot.zip | Bin 0 -> 1099 bytes .../one_hot_ops_test.test_batch_one_hot.zip | Bin 0 -> 1071 bytes .../one_hot_ops_test.test_one_hot.zip | Bin 0 -> 1070 bytes .../one_hot_ops_test.test_segment_one_hot.zip | Bin 0 -> 1090 bytes .../onnx_while_test.test_onnx_while_fibb.zip | Bin 0 -> 1472 bytes ...ops_test.test_pack_with_max_length_ops.zip | Bin 0 -> 1341 bytes ...sequence_op_test.test_pack_rnn_seqence.zip | Bin 0 -> 1218 bytes ...quence_op_test.test_unpack_rnn_seqence.zip | Bin 0 -> 1218 bytes .../data/operator_test/pad_test.test_crop.zip | Bin 0 -> 3245 bytes ...test_multi_predictions_params_from_arg.zip | Bin 0 -> 1139 bytes .../weighted_sum_test.test_weighted_sum.zip | Bin 0 -> 1212 bytes .../gradient_0.pb | Bin 67 -> 0 bytes .../inputs.npz | Bin 390 -> 0 bytes .../operator_0.pb | Bin 43 -> 0 bytes .../outputs.npz | Bin 234 -> 0 bytes .../serialized_test/serialized_test_util.py | 172 
+++++++++++------- 176 files changed, 523 insertions(+), 369 deletions(-) create mode 100644 caffe2/python/operator_test/conftest.py create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip create mode 100644 
caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip create mode 100644 
caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip 
create mode 100644 caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 0d35110740825b..5be8b689f115cb 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -11,12 +11,13 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest -class TestActivations(hu.HypothesisTestCase): - @given(X=hu.tensor(), in_place=st.booleans(), +class TestActivations(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): if gc == mu.mkl_do: @@ -74,7 +75,7 @@ def relu_grad_ref(g_out, outputs, fwd_inputs): output_to_grad="X" if in_place else "Y", grad_reference=relu_grad_ref) - @given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), + @serial.given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), n=st.floats(min_value=0.5, max_value=2.0), in_place=st.booleans(), **hu.gcs) def test_relu_n(self, X, n, in_place, gc, dc): @@ -100,7 +101,7 @@ def relu_n_ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.005) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) @@ -169,7 +170,7 @@ def prelu_ref(X, W): # Gradient check wrt W self.assertGradientChecks(gc, op, [X, W], 1, [0], stepsize=1e-2) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), inplace=st.booleans(), **hu.gcs) diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 9112d50f38df86..2976b06108ff51 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -12,9 +12,10 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestAdadelta(hu.HypothesisTestCase): +class TestAdadelta(serial.SerializedTestCase): @staticmethod def ref_adadelta(param_in, mom_in, @@ -44,7 +45,7 @@ def ref_adadelta(param_in, return (param_out.astype(np.float32), mom_out.astype(np.float32), mom_delta_out.astype(np.float32)) - @given(inputs=hu.tensors(n=4), + @serial.given(inputs=hu.tensors(n=4), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -136,7 +137,7 @@ def ref_sparse(param, moment, moment_delta, indices, grad, lr, decay, ref_using_fp16 ], ref_sparse) - 
@given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index e4101e92cf01cb..69aead865d1c3f 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -15,11 +15,13 @@ from caffe2.python.operator_test.adagrad_test_helper import ( ref_adagrad, adagrad_sparse_test_helper ) +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestAdagrad(hu.HypothesisTestCase): + +class TestAdagrad(serial.SerializedTestCase): @staticmethod def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): mom_out = mom_in + np.mean(np.square(grad)) @@ -27,7 +29,7 @@ def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): param_out = param_in + grad_adj return (param_out, mom_out) - @given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -112,7 +114,7 @@ def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc): return adagrad_sparse_test_helper(self, inputs, lr, epsilon, None, ref_adagrad, gc, dc) - @given(inputs=hu.tensors(n=2), + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -223,7 +225,7 @@ def ref_row_wise_sparse(param, momentum, indices, grad, lr): [param, momentum, indices, grad, lr], ref_row_wise_sparse) - @given(inputs=hu.tensors(n=1), + @serial.given(inputs=hu.tensors(n=1), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 6e56da29b7f6a9..bcce4efc8ec529 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestAffineChannelOp(hu.HypothesisTestCase): +class TestAffineChannelOp(serial.SerializedTestCase): def affine_channel_nchw_ref(self, X, scale, bias): dims = X.shape N = dims[0] @@ -30,9 +29,10 @@ def affine_channel_nhwc_ref(self, X, scale, bias): Y = X * scale + bias return [Y.reshape(dims)] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - is_learnable=st.booleans(), in_place=st.booleans(), **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), + in_place=st.booleans(), **hu.gcs) def test_affine_channel_2d( self, N, C, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index 
6492189808a72c..9bdea7ecf5cdbf 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -9,11 +9,13 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestArgOps(hu.HypothesisTestCase): - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) +class TestArgOps(serial.SerializedTestCase): + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmax(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) @@ -32,8 +34,9 @@ def argmax_ref(X): self.assertReferenceChecks(gc, op, [X], argmax_ref) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmin(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index f8bc77ba9e5a7a..7252499352ee84 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -7,6 +7,7 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -51,8 +52,8 @@ def _inputs(draw): ) -class TestBatchBoxCox(hu.HypothesisTestCase): - @given( +class TestBatchBoxCox(serial.SerializedTestCase): + @serial.given( inputs=_inputs(), **hu.gcs_cpu_only ) diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index 711240de9cfdfc..301941afb590c3 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -6,13 +6,14 @@ import numpy as np from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -class TestBatchBucketize(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestBatchBucketize(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_batch_bucketize_example(self, gc, dc): op = core.CreateOperator('BatchBucketize', ["FEATURE", "INDICES", "BOUNDARIES", "LENGTHS"], diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index 2db25e73892563..91d49b76ee4119 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestBatchMomentsOp(hu.HypothesisTestCase): +class TestBatchMomentsOp(serial.SerializedTestCase): def batch_moments_nchw_ref(self, X): 
dims = X.shape N = dims[0] @@ -29,9 +28,9 @@ def batch_moments_nhwc_ref(self, X): var = np.mean(np.square(X), axis=0) return [mu, var] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_batch_moments_2d(self, N, C, H, W, order, gc, dc): op = core.CreateOperator( "BatchMoments", diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index a8e04fb2e14750..a47cc44e593c8b 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np - from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu +import numpy as np -class TestBatchSparseToDense(hu.HypothesisTestCase): +class TestBatchSparseToDense(serial.SerializedTestCase): - @given( + @serial.given( batch_size=st.integers(5, 10), dense_last_dim=st.integers(5, 10), default_value=st.floats(min_value=2.0, max_value=3.0), diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index b54a4435513be7..f76891bae4dc89 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -201,8 +203,8 @@ def generate_rois_rotated(roi_counts, im_dims): return rotated_rois -class TestBBoxTransformOp(hu.HypothesisTestCase): - @given( +class TestBBoxTransformOp(serial.SerializedTestCase): + @serial.given( num_rois=st.integers(1, 10), num_classes=st.integers(1, 10), im_dim=st.integers(100, 600), diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 638248d60bafe5..8811f5667503b8 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -2,18 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import assume, given -import hypothesis.strategies as st - from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import assume, given +import hypothesis.strategies as st +import numpy as np -class TestBooleanMaskOp(hu.HypothesisTestCase): +class TestBooleanMaskOp(serial.SerializedTestCase): - @given(x=hu.tensor(min_dim=1, + @serial.given(x=hu.tensor(min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), **hu.gcs) @@ -54,7 +54,7 @@ def _dtype_conversion(x, dtype, gc, dc): x = x.astype(dtype) return x, dc - 
@given(x=hu.tensor(min_dim=2, + @serial.given(x=hu.tensor(min_dim=2, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), dtype=st.sampled_from([np.float32, np.float16]), diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index 86b2fedb49a245..e3bc9f248d3a26 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -3,16 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestUnmaskOp(hu.HypothesisTestCase): - @given(N=st.integers(min_value=2, max_value=20), +class TestUnmaskOp(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=2, max_value=20), dtype=st.sampled_from([ np.bool_, np.int8, diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index 8cd9acbd6a5d3c..52155c0a5d7649 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import unittest +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st - -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core +import unittest +import numpy as np def get_op(input_len, output_len, args): @@ -64,8 +64,8 @@ def gen_multiple_boxes(centers, scores, count, num_classes): return ret_box, ret_scores -class TestBoxWithNMSLimitOp(hu.HypothesisTestCase): - @given(**HU_CONFIG) +class TestBoxWithNMSLimitOp(serial.SerializedTestCase): + @serial.given(**HU_CONFIG) def test_simple(self, gc): in_centers = [(0, 0), (20, 20), (50, 50)] in_scores = [0.9, 0.8, 0.6] diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 79293fc6453232..130364261ea166 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestCeil(hu.HypothesisTestCase): +class TestCeil(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_ceil(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 5c59b8d6f05c1b..e516288b436c38 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import 
caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelBackpropStats(hu.HypothesisTestCase): - @given( +class TestChannelBackpropStats(serial.SerializedTestCase): + @serial.given( size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index e17d7a736063c4..34417fd1847337 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -3,14 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import caffe2.python.hypothesis_test_util as hu from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class ChannelShuffleOpsTest(hu.HypothesisTestCase): +class ChannelShuffleOpsTest(serial.SerializedTestCase): def _channel_shuffle_nchw_ref(self, X, group): dims = X.shape N = dims[0] @@ -31,7 +32,7 @@ def _channel_shuffle_nhwc_ref(self, X, group): Y = np.transpose(X, axes=(0, 1, 3, 2)) return [Y.reshape(dims)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), + @serial.given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), H=st.integers(1, 5), W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_channel_shuffle(self, N, G, K, H, W, order, gc, dc): diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index 2a238e67542f5a..f1daddee7721dd 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelStats(hu.HypothesisTestCase): - @given( +class TestChannelStats(serial.SerializedTestCase): + @serial.given( size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index 38499a69eb1d90..46163d30dedc2f 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -10,10 +10,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestClip(hu.HypothesisTestCase): - @given(X=hu.tensor(), +class TestClip(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), min_=st.floats(min_value=-2, max_value=0), max_=st.floats(min_value=0, max_value=2), inplace=st.booleans(), diff --git 
a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index bea2133d5182ee..042b4ef2a8326c 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -4,15 +4,15 @@ from __future__ import unicode_literals from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestClipTensorByScalingOp(hu.HypothesisTestCase): +class TestClipTensorByScalingOp(serial.SerializedTestCase): - @given(n=st.integers(5, 8), d=st.integers(2, 4), + @serial.given(n=st.integers(5, 8), d=st.integers(2, 4), threshold=st.floats(0.1, 10), additional_threshold=st.floats(0.1, 10), use_additional_threshold=st.booleans(), diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index facb675a4944ec..e37738801745c2 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals import numpy as np -import unittest import os +import unittest from hypothesis import given, settings import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 +from caffe2.python import core, utils +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # # Should match original Detectron code at @@ -129,17 +130,17 @@ def collect_and_distribute_fpn_rpn_ref(*inputs): return outputs -class TestCollectAndDistributeFpnRpnProposals(hu.HypothesisTestCase): +class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) + @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) def test_collect_and_dist( self, proposal_count, diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index af8d5486e6e6ab..3d2c4ae31946d9 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,13 +3,14 @@ from __future__ import division from __future__ import print_function -import 
numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.proto import caffe2_pb2 from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest @st.composite @@ -44,8 +45,8 @@ def _tensor_splits(draw, add_axis=False): ) -class TestConcatSplitOps(hu.HypothesisTestCase): - @given(tensor_splits=_tensor_splits(), +class TestConcatSplitOps(serial.SerializedTestCase): + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs) def test_concat(self, tensor_splits, gc, dc): axis, _, splits = tensor_splits @@ -92,7 +93,7 @@ def test_concat_add_axis(self, tensor_splits, gc, dc): for i in range(len(splits)): self.assertGradientChecks(gc, op, splits, i, [0]) - @given(tensor_splits=_tensor_splits(), + @serial.given(tensor_splits=_tensor_splits(), split_as_arg=st.booleans(), **hu.gcs) def test_split(self, tensor_splits, split_as_arg, gc, dc): @@ -127,7 +128,7 @@ def split_ref(input, split=split_info): self.assertDeviceChecks(dc, op, input_tensors, outputs_with_grad) self.assertGradientChecks(gc, op, input_tensors, 0, outputs_with_grad) - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index b96b530d687751..88d8fd8b7a27a3 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -2,15 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestConditionalOp(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) +class TestConditionalOp(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) def test_conditional(self, rows_num, gc, dc): op = core.CreateOperator( "Conditional", ["condition", "data_t", "data_f"], "output" diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py new file mode 100644 index 00000000000000..54a57dfd51928d --- /dev/null +++ b/caffe2/python/operator_test/conftest.py @@ -0,0 +1,39 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import caffe2.python.serialized_test.serialized_test_util as serial + + +def pytest_addoption(parser): + parser.addoption( + '-G', + '--generate-serialized', + action='store_true', + dest='generate', + help='generate output files (default=false, compares to current files)', + ) + parser.addoption( + '-O', + '--output', + default=serial.DATA_DIR, + dest='output', + help='output directory (default: %(default)s)' + ) + parser.addoption( + '-D', + '--disable-serialized-check', + action='store_true', + dest='disable', + help='disable checking serialized tests' + ) + + +def pytest_configure(config): + generate = config.getoption('generate', default=False) + output = config.getoption('output', default=serial.DATA_DIR) + disable = config.getoption('disable', default=False) + 
serial._output_context.__setattr__('should_generate_output', generate) + serial._output_context.__setattr__('output_dir', output) + serial._output_context.__setattr__('disable_serialized_check', disable) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 8e65a9324535a6..d29d724b89c29d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -13,11 +13,13 @@ from caffe2.python import brew, core, workspace import caffe2.python.hypothesis_test_util as hu from caffe2.python.model_helper import ModelHelper +import caffe2.python.serialized_test.serialized_test_util as serial import caffe2.python._import_c_extension as C import unittest import os + def _cudnn_supports( dilation=False, nhwc=False, @@ -54,7 +56,7 @@ def _cudnn_convolution_algo_count(direction): return st.sampled_from([-1]) -class TestConvolution(hu.HypothesisTestCase): +class TestConvolution(serial.SerializedTestCase): # CUDNN does NOT support different padding values and we skip it @given(op_type=st.sampled_from(["Conv", "Conv2D"]), stride_h=st.integers(1, 3), @@ -636,14 +638,15 @@ def test_use_cudnn_engine_interactions(self): self.assertEqual(model.Proto().op[-1].engine, expected_engine) - @given(op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), - G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), - H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), - order=st.sampled_from(["NCHW", "NHWC"]), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs) + @serial.given( + op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), + G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), + H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), + order=st.sampled_from(["NCHW", "NHWC"]), + force_algo_fwd=_cudnn_convolution_algo_count("fwd"), + force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), + force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), + **hu.gcs) def test_1x1_conv(self, op_type, N, G, DX, DY, H, W, use_bias, order, force_algo_fwd, force_algo_dgrad, force_algo_wgrad, gc, dc): diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index d67df5fd3e1f32..1124df94e67ae7 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -9,10 +9,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestCosineEmbeddingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestCosineEmbeddingCriterion(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 51d2bbc6f484ab..4deef35c5bb506 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -2,10 +2,12 @@ from __future__ import division from __future__ import print_function from __future__ 
import unicode_literals + from caffe2.python import core from collections import defaultdict, Counter from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -15,9 +17,9 @@ DEFAULT_PRUNE_THRESHOLD = 0.001 -class TestCTCBeamSearchDecoderOp(hu.HypothesisTestCase): +class TestCTCBeamSearchDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([1, 2, 4]), max_time=st.sampled_from([1, 8, 64]), alphabet_size=st.sampled_from([1, 2, 32, 128, 512]), diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0e638e8155e9e7..98079d2b026ae0 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -2,18 +2,19 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np - import unittest -class TestCTCGreedyDecoderOp(hu.HypothesisTestCase): +class TestCTCGreedyDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([2, 4, 128, 256]), max_time=st.sampled_from([2, 10, 30, 50]), num_classes=st.sampled_from([2, 10, 26, 40]), diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 0a5f9a38a4a7c6..753b94d20f1f54 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class DistanceTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 3), +class DistanceTest(serial.SerializedTestCase): + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_cosine_similarity(self, n, dim, gc, dc): @@ -32,7 +33,7 @@ def test_cosine_similarity(self, n, dim, gc, dc): self.assertGradientChecks(gc, cos_op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(inputs=hu.tensors(n=2, + @serial.given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32), @@ -57,7 +58,7 @@ def dot_ref(X, Y): # Gradient check wrt Y self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L1_distance(self, n, dim, gc, dc): @@ -88,7 +89,7 @@ def test_L1_distance(self, n, dim, gc, dc): self.assertGradientChecks(gc, op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L2_distance(self, n, dim, gc, dc): diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index 89d63b7e13286b..2bbd9ba4efe114 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py @@ -10,11 +10,12 @@ from caffe2.proto import caffe2_pb2 
from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestDropout(hu.HypothesisTestCase): +class TestDropout(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), in_place=st.booleans(), ratio=st.floats(0, 0.999), engine=st.sampled_from(["", "CUDNN"]), diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index c67a84921a4ad0..8c7df5f33625b2 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestElementwiseLinearOp(hu.HypothesisTestCase): +class TestElementwiseLinearOp(serial.SerializedTestCase): - @given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) + @serial.given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) # @given(n=st.integers(2, 50), d=st.integers(2, 50), **hu.gcs_cpu_only) def test(self, n, d, gc, dc): X = np.random.rand(n, d).astype(np.float32) diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index 7279dd5e1fb99c..8f665a06cd9e9a 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu - import numpy as np import unittest @@ -21,7 +21,7 @@ def rowmux(select_vec, left, right): return mux(select, left, right) -class TestWhere(hu.HypothesisTestCase): +class TestWhere(serial.SerializedTestCase): def test_reference(self): self.assertTrue(( @@ -35,7 +35,7 @@ def test_reference(self): [[3], [4]])[0] ).all()) - @given(N=st.integers(min_value=1, max_value=10), + @serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_where(self, N, gc, dc, engine): @@ -107,9 +107,9 @@ def test_rowwhere_dim2(self, N, gc, dc, engine): self.assertReferenceChecks(gc, op, [C, X, Y], rowmux) -class TestIsMemberOf(hu.HypothesisTestCase): +class TestIsMemberOf(serial.SerializedTestCase): - @given(N=st.integers(min_value=1, max_value=10), + @serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_is_member_of(self, N, gc, dc, engine): diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index e767a0db161a9f..161f5fc0724b14 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -11,10 +11,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # TODO(jiayq): make them hypothesis tests for better coverage. 
-class TestElementwiseBroadcast(hu.HypothesisTestCase): +class TestElementwiseBroadcast(serial.SerializedTestCase): @given(**hu.gcs) def test_broadcast_Add(self, gc, dc): # Set broadcast and no axis, i.e. broadcasting last dimensions. @@ -168,7 +169,7 @@ def test_broadcast_Sub(self, gc, dc): self.assertDeviceChecks(dc, op, [X, Y], [0]) self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - @given(**hu.gcs) + @serial.given(**hu.gcs) def test_broadcast_powt(self, gc, dc): np.random.seed(101) diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index efd056c8f1654d..4b608a4418dddc 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -7,11 +7,12 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestExpandOp(hu.HypothesisTestCase): +class TestExpandOp(serial.SerializedTestCase): def _rand_shape(self, X_shape, max_length): length = np.random.randint(max_length) shape = np.ones(length, dtype=np.int64) @@ -39,7 +40,7 @@ def ref(X, shape): self.assertDeviceChecks(dc, op, [X, shape], [0]) self.assertGradientChecks(gc, op, [X, shape], 0, [0]) - @given(X=hu.tensor(max_dim=5, dtype=np.float32), + @serial.given(X=hu.tensor(max_dim=5, dtype=np.float32), **hu.gcs) def test_expand_rand_shape(self, X, gc, dc): shape = self._rand_shape(X.shape, 5) diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index 1c444da6a8b80e..d10e19e4932ade 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -7,11 +7,12 @@ from caffe2.python import core from hypothesis import assume, given, settings, HealthCheck import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestFcOperator(hu.HypothesisTestCase): +class TestFcOperator(serial.SerializedTestCase): def _run_test(self, n, m, k, transposed, multi_dim, dtype, engine, gc, dc): if dtype == np.float16: # fp16 only supported with CUDA @@ -76,7 +77,7 @@ def fc_tranposed_op(X, W, b): threshold=threshold, stepsize=stepsize) @settings(max_examples=50, suppress_health_check=[HealthCheck.filter_too_much]) - @given(n=st.integers(1, 5), + @serial.given(n=st.integers(1, 5), m=st.integers(0, 5), k=st.integers(1, 5), multi_dim=st.sampled_from([True, False]), diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index df13cba4a3b961..df7cd1dc4e7960 100644 --- a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import hypothesis.strategies as st - -from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 -from hypothesis import given +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given +import hypothesis.strategies as st import numpy as np @@ -19,7 +19,7 @@ def _fill_diagonal(shape, value): return (result,) -class TestFillerOperator(hu.HypothesisTestCase): +class TestFillerOperator(serial.SerializedTestCase): @given(**hu.gcs) def 
test_shape_error(self, gc, dc): @@ -127,7 +127,7 @@ def test_uniform_fill_using_arg(self, gc, dc): self.assertNotEqual(min_data, max_data) - @given( + @serial.given( shape=st.sampled_from( [ [3, 3], @@ -168,9 +168,9 @@ def test_diagonal_fill_op_int(self, gc, dc): # Check against numpy reference self.assertReferenceChecks(gc, op, [shape, value], _fill_diagonal) - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), + @serial.given(lengths=st.lists(st.integers(min_value=0, max_value=10), + min_size=0, + max_size=10), **hu.gcs) def test_lengths_range_fill(self, lengths, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index febf6efeef76ec..153724a5f49d29 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -4,20 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core -import hypothesis.strategies as st -from hypothesis import given - - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st import numpy as np -class TestFindOperator(hu.HypothesisTestCase): +class TestFindOperator(serial.SerializedTestCase): - @given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), - idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), - **hu.gcs) + @serial.given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), + idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), + **hu.gcs) def test_find(self, n, idxsize, gc, dc): maxval = 10 diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 08f079b5f8cfdd..fcd20278c6caba 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -from collections import OrderedDict -import numpy as np - from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from collections import OrderedDict +from hypothesis import given +import numpy as np -class TestFlexibleTopK(hu.HypothesisTestCase): + +class TestFlexibleTopK(serial.SerializedTestCase): def flexible_top_k_ref(self, X, k): X_flat = X.reshape((-1, X.shape[-1])) indices_ref = np.ndarray(shape=sum(k), dtype=np.int32) @@ -38,7 +39,7 @@ def flexible_top_k_ref(self, X, k): return (values_ref, indices_ref) - @given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) + @serial.given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) def test_flexible_top_k(self, X, gc, dc): X = X.astype(dtype=np.float32) k_shape = (int(X.size / X.shape[-1]), ) diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index aac1e81efa6570..4cbd269620673d 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestFloor(hu.HypothesisTestCase): +class TestFloor(serial.SerializedTestCase): - 
@given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_floor(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 2c2bc33910c560..d5ab8e58cec0f2 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -7,12 +7,13 @@ from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import hypothesis.extra.numpy as hnp -class TestGatherOps(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), +class TestGatherOps(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), index_num=st.integers(0, 5000), **hu.gcs) def test_gather_ops(self, rows_num, index_num, gc, dc): @@ -52,8 +53,8 @@ def _inputs(draw): ) -class TestBatchGatherOps(hu.HypothesisTestCase): - @given(inputs=_inputs(), +class TestBatchGatherOps(serial.SerializedTestCase): + @serial.given(inputs=_inputs(), **hu.gcs) def test_batch_gather_ops(self, inputs, gc, dc): data, ind = inputs diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index d653dd3297bcd2..a16b92ba7d6992 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -6,8 +6,8 @@ from caffe2.python import core, workspace from hypothesis import given from hypothesis import strategies as st - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np @@ -121,8 +121,9 @@ def gather_ranges_to_dense_with_key(data, ranges, key, lengths): return outputs -class TestGatherRanges(hu.HypothesisTestCase): - @given(boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) +class TestGatherRanges(serial.SerializedTestCase): + @serial.given( + boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) def test_gather_ranges(self, boarders_and_data, gc, dc): boarders, data = boarders_and_data @@ -142,7 +143,7 @@ def boarders_to_range(boarders): reference=gather_ranges, ) - @given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) def test_gather_ranges_split(self, tensor_splits, gc, dc): data, ranges, lengths, _ = tensor_splits diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index 98ebc9bed4012d..56902bb444efde 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -4,20 +4,21 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestGlu(hu.HypothesisTestCase): +class TestGlu(serial.SerializedTestCase): # Suppress filter_too_much health check. # Reproduce by commenting @settings and uncommenting @seed. 
# @seed(302934307671667531413257853548643485645) @settings(suppress_health_check=[HealthCheck.filter_too_much]) - @given( + @serial.given( X=hu.tensor(), axis=st.integers(min_value=0, max_value=3), **hu.gcs diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index caa9121e924a0c..febf05136e4ded 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -2,16 +2,16 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestGroupNormOp(hu.HypothesisTestCase): +class TestGroupNormOp(serial.SerializedTestCase): def group_norm_nchw_ref(self, X, gamma, beta, group, epsilon): dims = X.shape N = dims[0] @@ -40,10 +40,11 @@ def group_norm_nhwc_ref(self, X, gamma, beta, group, epsilon): Y = gamma * (X - mu) / std + beta return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), - H=st.integers(2, 5), W=st.integers(2, 5), - epsilon=st.floats(min_value=1e-5, max_value=1e-4), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + @serial.given( + N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), + H=st.integers(2, 5), W=st.integers(2, 5), + epsilon=st.floats(min_value=1e-5, max_value=1e-4), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_group_norm_2d( self, N, G, D, H, W, epsilon, order, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 1292d843d0a827..ed8945b7927e90 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -7,6 +7,7 @@ from caffe2.python.model_helper import ModelHelper from caffe2.python.rnn.rnn_cell_test_util import sigmoid, tanh, _prepare_rnn import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from caffe2.proto import caffe2_pb2 from functools import partial @@ -246,11 +247,11 @@ def generate_input_state(n, d): return hidden_t, model.net -class GRUCellTest(hu.HypothesisTestCase): +class GRUCellTest(serial.SerializedTestCase): # Test just for GRUUnitOp @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given( + @serial.given( seed=st.integers(0, 2**32 - 1), input_tensor=gru_unit_op_input(), fwd_only=st.booleans(), diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index dbb308f680021b..252855b39e4f25 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -6,11 +6,12 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestHyperbolicOps(hu.HypothesisTestCase): +class TestHyperbolicOps(serial.SerializedTestCase): def _test_hyperbolic_op(self, op_name, np_ref, X, in_place, engine, gc, dc): op = core.CreateOperator( op_name, @@ -30,15 +31,15 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0]) 
self.assertGradientChecks(gc, op, [X], 0, [0]) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_sinh(self, X, gc, dc): self._test_hyperbolic_op("Sinh", np.sinh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_cosh(self, X, gc, dc): self._test_hyperbolic_op("Cosh", np.cosh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_tanh(self, X, in_place, engine, gc, dc): self._test_hyperbolic_op("Tanh", np.tanh, X, in_place, engine, gc, dc) diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index 1f8b7344b74b9a..6a3678abdbe3fb 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIndexHashOps(hu.HypothesisTestCase): - @given( +class TestIndexHashOps(serial.SerializedTestCase): + @serial.given( indices=st.sampled_from([ np.int32, np.int64 ]).flatmap(lambda dtype: hu.tensor(min_dim=1, max_dim=1, dtype=dtype)), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index a91154a4e45f6d..1d072a230ae3a7 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -8,11 +8,13 @@ from caffe2.python import core, model_helper, brew import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestInstanceNorm(hu.HypothesisTestCase): + +class TestInstanceNorm(serial.SerializedTestCase): def _get_inputs(self, N, C, H, W, order): if order == 'NCHW': @@ -131,7 +133,7 @@ def test_instance_norm_layout(self, gc, dc, N, C, H, W, store_mean, atol=1e-4, rtol=1e-4) - @given(gc=hu.gcs['gc'], + @serial.given(gc=hu.gcs['gc'], dc=hu.gcs['dc'], N=st.integers(2, 10), C=st.integers(3, 10), diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 011720d109d6d2..6f9e5d90572ab1 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -2,15 +2,16 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -from hypothesis import given + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIntegralImageOps(hu.HypothesisTestCase): - @given(batch_size=st.integers(1, 3), +class TestIntegralImageOps(serial.SerializedTestCase): + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), @@ -45,7 +46,7 @@ def integral_image(im): self.assertDeviceChecks(dc, op, [im], [0]) 
self.assertReferenceChecks(gc, op, [im], integral_image) - @given(batch_size=st.integers(1, 3), + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 97a1fbaeec2222..51faa14b9029fe 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -24,8 +26,8 @@ def jsd_grad(go, o, pq_list): return [np.log(p * (1 - m) / (1 - p) / m) / 2. * go, None] -class TestJSDOps(hu.HypothesisTestCase): - @given(n=st.integers(10, 100), **hu.gcs_cpu_only) +class TestJSDOps(serial.SerializedTestCase): + @serial.given(n=st.integers(10, 100), **hu.gcs_cpu_only) def test_bernoulli_jsd(self, n, gc, dc): p = np.random.rand(n).astype(np.float32) q = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index fa0958afd1f99a..59203fd960c88b 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import brew, core +from caffe2.python.model_helper import ModelHelper from hypothesis import given import caffe2.python.hypothesis_test_util as hu -import unittest -import os +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np +import os +import unittest -from caffe2.python.model_helper import ModelHelper -class TestLayerNormOp(hu.HypothesisTestCase): +class TestLayerNormOp(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(X=hu.tensors(n=1), **hu.gcs) + @serial.given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_grad_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 2284fdbba0785c..84e1307568f22d 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given +import hypothesis.strategies as st import numpy as np -class TestLearningRateAdaption(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=2), +class TestLearningRateAdaption(serial.SerializedTestCase): + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr_alpha=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 73710e520f9495..3677239817d7e7 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ 
b/caffe2/python/operator_test/learning_rate_op_test.py @@ -5,6 +5,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st @@ -15,8 +16,8 @@ import numpy as np -class TestLearningRate(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestLearningRate(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_alter_learning_rate_op(self, gc, dc): iter = np.random.randint(low=1, high=1e5, size=1) active_period = int(np.random.randint(low=1, high=1e3, size=1)) diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py index 08ec2638886291..20ac2b25ba1103 100644 --- a/caffe2/python/operator_test/length_split_op_test.py +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -6,11 +6,12 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthSplitOperator(hu.HypothesisTestCase): +class TestLengthSplitOperator(serial.SerializedTestCase): def _length_split_op_ref(self, input_lengths, n_split_array): output = [] @@ -26,7 +27,7 @@ def _length_split_op_ref(self, input_lengths, n_split_array): output.append(val - 1) return [np.array(output).astype(np.int32)] - @given(**hu.gcs_cpu_only) + @serial.given(**hu.gcs_cpu_only) def test_length_split_edge(self, gc, dc): input_lengths = np.array([3, 4, 5]).astype(np.int32) n_split_ = np.array([5]).astype(np.int32) diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index f879b702cd5092..d9cd2b2446045d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsPadOp(hu.HypothesisTestCase): +class TestLengthsPadOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index 42ef87ac364f70..4a9a6b0ff1a9da 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTileOp(hu.HypothesisTestCase): +class TestLengthsTileOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=st.integers(min_value=1, max_value=20).flatmap( lambda size: st.tuples( hu.arrays([size], dtype=np.float32), diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 6ffb5fc4fa843f..8bc27c31144f48 100644 --- a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import 
print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTopKOps(hu.HypothesisTestCase): - @given(N=st.integers(min_value=0, max_value=10), +class TestLengthsTopKOps(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=0, max_value=10), K=st.integers(min_value=1, max_value=10), **hu.gcs_cpu_only) def test_lengths_top_k_op(self, N, K, gc, dc): diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index c690b3aed3891e..b98100168df022 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index 29ff72cd1e72db..49051442350e28 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -8,10 +8,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestLocallyConnectedOp(hu.HypothesisTestCase): - @given(N=st.integers(1, 3), +class TestLocallyConnectedOp(serial.SerializedTestCase): + @serial.given(N=st.integers(1, 3), C=st.integers(1, 3), H=st.integers(1, 5), W=st.integers(1, 5), diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index a6ea88e55737e5..e57bdb7a1d41df 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLossOps(hu.HypothesisTestCase): +class TestLossOps(serial.SerializedTestCase): - @given(n=st.integers(1, 8), **hu.gcs) + @serial.given(n=st.integers(1, 8), **hu.gcs) def test_averaged_loss(self, n, gc, dc): X = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 28f8e0a20f0a90..9e0168eacf9354 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - -class TestMarginRankingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestMarginRankingCriterion(serial.SerializedTestCase): + 
@serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index 4661c7715ec518..0772aee5c9b285 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -7,12 +7,13 @@ from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import unittest -class TestMathOps(hu.HypothesisTestCase): +class TestMathOps(serial.SerializedTestCase): @given(X=hu.tensor(), exponent=st.floats(min_value=2.0, max_value=3.0), @@ -31,7 +32,7 @@ def powf_grad(g_out, outputs, fwd_inputs): output_to_grad="Y", grad_reference=powf_grad), - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), exponent=st.floats(min_value=-3.0, max_value=3.0), **hu.gcs) def test_sign(self, X, exponent, gc, dc): diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index 67fdf2cf5ffe11..1872a129e569c8 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -13,10 +13,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestMatMul(hu.HypothesisTestCase): - @given( +class TestMatMul(serial.SerializedTestCase): + @serial.given( M=st.integers(min_value=1, max_value=10), K=st.integers(min_value=1, max_value=10), N=st.integers(min_value=1, max_value=10), @@ -125,7 +126,7 @@ def matmul_ref(X, Y, axis_a, axis_b, trans_a, trans_b): self.assertGradientChecks(gc, op, [X, Y], 1, [0]) -class TestBatchMatMul(hu.HypothesisTestCase): +class TestBatchMatMul(serial.SerializedTestCase): @settings(max_examples=30) @given( C=st.integers(min_value=0, max_value=3), # number of batch dims @@ -214,7 +215,7 @@ def matmul_ref(X, Y, trans_a, trans_b, dtype): # Check over multiple devices self.assertDeviceChecks(dc, op, [X, Y], [0]) - @given( + @serial.given( C_1=st.integers(min_value=0, max_value=3), # number of batch dims C_2=st.integers(min_value=0, max_value=3), M=st.integers(min_value=1, max_value=10), diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index cbb1adc954784d..77c6b82625b139 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -4,16 +4,17 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestMean(hu.HypothesisTestCase): - @given( +class TestMean(serial.SerializedTestCase): + @serial.given( k=st.integers(1, 5), n=st.integers(1, 10), m=st.integers(1, 10), diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 1b4322e0624f70..9f3302c6e75a3f 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -3,15 +3,13 @@ from __future__ import print_function from 
__future__ import unicode_literals -import numpy as np - -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.extra.numpy as hnp +import hypothesis.strategies as st +import numpy as np @st.composite @@ -53,7 +51,7 @@ def merge_arrays(vs, offs, j): return merged_lengths, merged_values -class TestMergeIdListsOp(hu.HypothesisTestCase): +class TestMergeIdListsOp(serial.SerializedTestCase): def test_merge_id_lists_ref(self): # Verify that the reference implementation is correct! lengths_0 = np.array([3, 0, 4], dtype=np.int32) @@ -69,8 +67,7 @@ def test_merge_id_lists_ref(self): np.testing.assert_array_equal(merged_lengths, expected_lengths) np.testing.assert_array_equal(merged_values, expected_values) - @given(inputs=id_list_batch(), - **hu.gcs_cpu_only) + @serial.given(inputs=id_list_batch(), **hu.gcs_cpu_only) def test_merge_id_lists_op(self, inputs, gc, dc): num_inputs = int(len(inputs) / 2) op = core.CreateOperator( diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index fa456c8382f64b..ae9d9158f5062b 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -7,12 +7,13 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st -import numpy as np import itertools as it +import numpy as np -class TestMomentsOp(hu.HypothesisTestCase): +class TestMomentsOp(serial.SerializedTestCase): def run_moments_test(self, X, axes, keepdims, gc, dc): if axes is None: op = core.CreateOperator( @@ -41,7 +42,7 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertGradientChecks(gc, op, [X], 0, [0, 1]) - @given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), num_axes=st.integers(1, 4), **hu.gcs) def test_moments(self, X, keepdims, num_axes, gc, dc): self.run_moments_test(X, None, keepdims, gc, dc) diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index 7bfceb61121ca2..39e358f30d386e 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis from hypothesis import given @@ -13,8 +14,8 @@ import unittest -class TestMomentumSGD(hu.HypothesisTestCase): - @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) +class TestMomentumSGD(serial.SerializedTestCase): + @serial.given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) def test_momentum_sgd(self, n, nesterov, gc, dc): param = np.random.rand(n).astype(np.float32) grad = np.random.rand(n).astype(np.float32) @@ -69,7 +70,7 @@ def momentum_sgd(grad, param_momentum, lr, param=None): reference=momentum_sgd ) - @given( + @serial.given( inputs=hu.tensors(n=3), momentum=st.floats(min_value=0.1, max_value=0.9), nesterov=st.booleans(), diff --git a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index d37955ac1b4d2e..309236a281a492 100644 --- 
a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -3,18 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np +from caffe2.python import workspace, core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import workspace, core +import numpy as np -class TestNegateGradient(hu.HypothesisTestCase): +class TestNegateGradient(serial.SerializedTestCase): - @given(X=hu.tensor(), - inplace=st.booleans(), - **hu.gcs) + @serial.given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) def test_forward(self, X, inplace, gc, dc): def neg_grad_ref(X): return (X,) diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index c1d02de1d09690..42fde4c9452251 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -11,10 +11,11 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestNumpyTile(hu.HypothesisTestCase): - @given(ndim=st.integers(min_value=1, max_value=4), +class TestNumpyTile(serial.SerializedTestCase): + @serial.given(ndim=st.integers(min_value=1, max_value=4), seed=st.integers(min_value=0, max_value=65536), **hu.gcs_cpu_only) def test_numpy_tile(self, ndim, seed, gc, dc): diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index da1c11fbc2cc2e..19e6ee10e3ddf9 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -7,6 +7,7 @@ from caffe2.proto import caffe2_pb2 from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -25,8 +26,8 @@ def _one_hots(): max_size=sum(x[1])))) -class TestOneHotOps(hu.HypothesisTestCase): - @given( +class TestOneHotOps(serial.SerializedTestCase): + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.int32, elements=st.integers(min_value=0, max_value=10)), @@ -56,7 +57,7 @@ def ref(x, lens, vals): op = core.CreateOperator('BatchOneHot', ["X", "LENS", "VALS"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, vals], ref) - @given( + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.float32, elements=st.integers(min_value=-5, max_value=5)), @@ -108,7 +109,7 @@ def ref(x, lens, boundaries): ["X", "LENS", "BOUNDARIES"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, boundaries], ref) - @given( + @serial.given( hot_indices=hu.tensor( min_dim=1, max_dim=1, dtype=np.int64, elements=st.integers(min_value=0, max_value=42)), @@ -134,7 +135,7 @@ def one_hot_ref(hot_indices, size): one_hot_ref, input_device_options={'size': core.DeviceOption(caffe2_pb2.CPU)}) - @given(hot_indices=_one_hots()) + @serial.given(hot_indices=_one_hots()) def test_segment_one_hot(self, hot_indices): index_size, lengths, indices = hot_indices diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 0cba8053d53e05..eaf0ef58ba30e6 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ 
b/caffe2/python/operator_test/onnx_while_test.py @@ -2,17 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu +from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given -from caffe2.proto import caffe2_pb2 +import hypothesis.strategies as st +import numpy as np +import unittest -class TestONNXWhile(hu.HypothesisTestCase): - @given( +class TestONNXWhile(serial.SerializedTestCase): + @serial.given( condition=st.booleans(), max_trip_count=st.integers(0, 100), save_scopes=st.booleans(), diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 3935ca8c8f17ab..f6674ed625bd45 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given from hypothesis import strategies as st @@ -12,7 +13,7 @@ import time -class TestTensorPackOps(hu.HypothesisTestCase): +class TestTensorPackOps(serial.SerializedTestCase): def pack_segments_ref(self, return_presence_mask=False, max_length=None): def pack_segments_ref(lengths, data, max_length=max_length): @@ -53,7 +54,7 @@ def pack_segments_ref(lengths, data, max_length=max_length): return pack_segments_ref - @given( + @serial.given( num_seq=st.integers(10, 100), cell_size=st.integers(1, 10), **hu.gcs diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index a5a3d6de537d01..6bf2315ca0c52e 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestPackRNNSequenceOperator(hu.HypothesisTestCase): +class TestPackRNNSequenceOperator(serial.SerializedTestCase): - @given(n=st.integers(0, 10), k=st.integers(1, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(1, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_pack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 @@ -47,7 +48,7 @@ def pack_op(values, lengths): # Gradient check self.assertGradientChecks(gc, op, [values, lengths], 0, [0]) - @given(n=st.integers(0, 10), k=st.integers(2, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(2, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_unpack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index ee5e001a91e2ce..43cd10c231887e 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -2,16 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.python 
import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest -class TestPad(hu.HypothesisTestCase): - @given(pad_t=st.integers(-5, 0), +class TestPad(serial.SerializedTestCase): + @serial.given(pad_t=st.integers(-5, 0), pad_l=st.integers(-5, 0), pad_b=st.integers(-5, 0), pad_r=st.integers(-5, 0), diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index f09dcdcde2b6d5..83d67f4beea966 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestPiecewiseLinearTransform(hu.HypothesisTestCase): +class TestPiecewiseLinearTransform(serial.SerializedTestCase): def constrain(self, v, min_val, max_val): def constrain_internal(x): return min(max(x, min_val), max_val) @@ -31,7 +32,7 @@ def transform(self, x, bounds, slopes, intercepts): y = slopes[index] * x_ + intercepts[index] return y - @given(n=st.integers(1, 100), **hu.gcs) + @serial.given(n=st.integers(1, 100), **hu.gcs) def test_multi_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) intercepts = np.random.uniform(-1, 1, (2, n)).astype(np.float32) diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 9cdeafaae50bc9..5882f7aef8346d 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -14,7 +14,7 @@ class TestWeightedSumOp(serial.SerializedTestCase): - @serial.given_and_seeded( + @serial.given( n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), seed=st.integers(min_value=0, max_value=65535), diff --git a/caffe2/python/serialized_test/README.md b/caffe2/python/serialized_test/README.md index 00d104d309f652..2885ed290cab8e 100644 --- a/caffe2/python/serialized_test/README.md +++ b/caffe2/python/serialized_test/README.md @@ -4,9 +4,15 @@ Major functionality lives in `serialized_test_util.py` ## How to use 1. Extend the test case class from `SerializedTestCase` -2. Change the `@given` decorator to `@given_and_seeded`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. -3. Change a call to `unittest.main()` in `__main__` to `testWithArgs`. -4. Run your test `python caffe2/python/operator_test/my_test.py -g` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one folder per test function -5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. 
If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. +2. Change the `@given` decorator to `@serialized_test_util.given`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. +3. [Optional] Add (or change a call of `unittest.main()` to) `testWithArgs` in `__main__`. This allows you to generate outputs using `python caffe2/python/operator_test/my_test.py -G`. +4. Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one npz file per test function. Use `-O` to change the output directory. +5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. + +## Additional Notes +If we'd like to extend the test framework beyond that for operator tests, we can create a new subfolder for them inside `caffe2/python/serialized_test/data`. + +Note that we currently don't support using other hypothesis decorators on top of `serialized_test_util.given`. Hypothesis has some handling to explicitly check that `@given` is at the bottom of the decorator stack. + +If there are multiple calls to assertReferenceChecks in a test function, we'll serialize and write the last one. The actual input checked may then differ if we refactor a test function that calls this multiple times, though the serialized test should still pass, since we then use the serialized input to generate a dynamic output.
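For orientation, the conversion pattern that the operator-test diffs above apply (and that the README steps describe) looks roughly like the sketch below. This is illustrative commentary rather than part of the patch: the operator name ("Negative") and its reference function are made up for the example, and the `serial.testWithArgs()` call simply follows the README's optional step 3; only the `serial.SerializedTestCase` base class, the `@serial.given` decorator, and `assertReferenceChecks` are taken from the diffs themselves.

```python
# Minimal sketch of a converted operator test, assuming the serialized-test
# utilities behave as described in the README above. The "Negative" operator
# and its reference function are illustrative only.
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import hypothesis.strategies as st

from caffe2.python import core
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial


class TestNegativeOp(serial.SerializedTestCase):  # step 1: subclass SerializedTestCase
    # step 2: use serial.given instead of the plain hypothesis @given
    @serial.given(n=st.integers(1, 8), **hu.gcs_cpu_only)
    def test_negative(self, n, gc, dc):
        X = np.random.rand(n).astype(np.float32)
        op = core.CreateOperator("Negative", ["X"], ["Y"])

        def negative_ref(X):
            return (-X,)

        # the (last) assertReferenceChecks call is what gets serialized
        self.assertReferenceChecks(gc, op, [X], negative_ref)


if __name__ == "__main__":
    # step 3 (optional): enables the -G / -O flags for (re)generating outputs
    serial.testWithArgs()
```

Generating the serialized artifacts for such a test would then just be a matter of running it once with the `-G` flag, as described in step 4 above.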
diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip new file mode 100644 index 0000000000000000000000000000000000000000..415a47d71c3166a785e06108d24af457f02a9002 GIT binary patch literal 1192 zcmWIWW@Zs#0D%Jyy1pAV)iPv(Y!GGz;`{==f+Q}+2qDHuDF!8G*PK!<9xm3zoPvx* z(X(gH*jh0Jpy*$te@IoInSlX>If42!^YTke^zsU-V9Gea)@&1EPPhV;c4cH>-~uZv zC@m=ls;iv+M*Nkuz{wK<#{$;Qn-n!CW=Y=C1#yqpuUoue-n96TpeZxN=f_W(#3fYb zeMZxdwM#r*MQjPvPpLMmSyo(Ixu!6zV_WuQ&!sy}UVESXSroGO&ZRk%&Ky~^=!n<4 zAB(O$`LpNIq(zsW2(5E+>(E-aN2DW1ak9B{wnlPLb%5gLHuJ^i$=V@~UY@CrTH%V` z8k6_8nm3y(2QaQ~GcVRi?wvn>{`GnV2H|_(R6XZzY|WAEk>r?sLs2c2!F2gd2h-vP z)7uNO(ipP%QXlSGmlHi{O2nik?C>Ch_}yxwX__n0&mBO_55y^{WtqvT@g;~yXsE!xmEqlY;{a~?3wBgH^~H!B-Rk{Jls0qJZe5D%7JAi46| zGltzJKv56|<*M|e#FTggaL$b6VvCFi@rBqTz%($Mas#ueJ5VSyHLnCcyLvDpT#7x* y1I-76CE(xz7Xav5&~rILivy5}l;>dqirpIsZPS3o1>94}A+ivt<|9y&fdK&AiE7dS literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip new file mode 100644 index 0000000000000000000000000000000000000000..e4584245ab112cc07ded499bf6d2d0d771345f9d GIT binary patch literal 1200 zcmWIWW@Zs#0D%Jyy1uVp{@_;xvO$;?i1Q2d3X-@OBZL?ur5KbreNq#%D}z#VO0{^n zSQB#!G7?44o;hP1fMUiP{X?qy%nS@5%n39jGcUiiL@%$P3Z{$$Y}Ga)=7cLiX;(%D z1}?C&g3^*=pt{Q0Z^U0o3!FR=a4cZ$yh%}WVwU7BT@d$p{kp{q=1q$a37RrPe180t zNnAo@-e)xZSi8j2Rm7Gs{gi66nq|ecm1_#aI<{p`_FTHtCBNu zi;j4$`?2WClRtYNO=;fK} zs1>g0tuc9jt9i4zascD%HuGYQ)Rp~{EDe(s2+!@Km78wuX3$aCjX<$|rglAQEphRYBUI}WJ^Luztr!AOw8U%f z6V+#CU;trGpq9+M{L&J=yn-s2G7hlGH|F?Hy#kbWWn^IB0xK&hEhz@7tDOBt{FSu8 z$rAy`0@lu(6g4MiN#4>0agW!pTfAW2wD^#qDKo_9$4{BWB~<2pM$?bAOFUggYzfm( zsWz)wR$N=TrZB8yTlQqnr8`Ytd!PJS6tefur8$$%99gvJh}XIwi>^HRv**#IMVFok zt#fkg&|0@gq$5Xhvbl4%MsiSffa2yh^Tp=L+98f!o~e#n;fmfGllQlpH=8R5Fs^Ph zFV;xzoj-s6^?C&c;d|dyJ?Cz0&5`VpJ!1on1cN2u5CLa%bS>x^ k8==J*$VAHKurS2#a)h={UD2XY`R1gg0Xlw@E40OUYqG5`Po literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip new file mode 100644 index 0000000000000000000000000000000000000000..07e439a921cfcf7799227f9c8eeb49605f8d3ab5 GIT binary patch literal 1184 zcmWIWW@Zs#0D*%Iy1q*&xgc#Q6n!1xZ|t5kicSQVdG0L8&>Vep(z{jCrC! zN&5g4y&@A!KkG9yFn};8P&zX&zqCXzub>L1j00?iAWv@76`-^$BLf2$SXn`7Nik4e zSM=7Hyua1F z*<3k*adn${u|{(5{Q2{**DEjx-}|QOId@}gj%1G{$K)G|YN-sS%V#>67B`sQUXYc> zkj0mpxMSaqWh$B*RXq6_Q2lPT(KO8!=;sa~<_F@G)UwRv)cBIhf>fLVkT*GVAw~$4 zqlN(3|00ZfY#Iz`HX?^MH_%34lnn3SW@Hj!2Bl~0i47n$g1?p*atj=8<9`z?+o~B*_ef>wt7N6Nm>(GLRgpQh(jl04NH= zpj?$+l$a840M3<>Tx^l?AifY=1egY9QC@Hsbq5M&rskC(vZ@Co!j;&wI?!w|SON|j qaK=a1f}X<>T5Nz!r2GyGOzd7jXqyBqD&QVM4vmFCH7|jZ3=9B2lwn8! 
literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip new file mode 100644 index 0000000000000000000000000000000000000000..2bdb95bdaf798c9f6fa4cb0d2d03cd7c92361ae2 GIT binary patch literal 1167 zcmWIWW@Zs#0D+)JU0>OkO~C~~HVCrP$(~Dhn!NTt`Lifw@109?CY?F5Xwea` zbw3tedGcq^qe+V{JrP>x&X_AVFu*E{Pf`sYM4(Xrmji1HtIqUVVO}?H%S)P;R;9&;| z6swJwPuKGjGcY6=nM9aDnF)K61WJHGLnDaF%!36C zx<2$+gXm&lXlN7#hALVdqH9Kv0EFgtpk}1#2=HcQ14%Li;W{8akqN{@7>B(;0I30i lC15{*iwATq=ouEG0in8L-BQzQgYF9cODgrZ)GL5VLIXkKxuV@e{}1T8);_SAyn z%$)o@QLgLCHd;JftSPC0~9y6 znJ+f4)(&y>@=SHq3Rm>jn7qH$yxCkifN^!3d9g-v@BI1muh%Ou2;cjr>N$5~YmSs; zk0i(98;)*<+tL_Hm(O%4Ep8~iz2H?E!z;ejM24nGDk8!b3)Kn|rawERf0lPr8pG}R zoW-{Me!nl4es(G?PVoAuc=L2=?9F8=C-3fC`fA~CtBJ3!Oj5Ch2P-7JtTvjaxdOwf z1Bm&7I3=|V7^(3kl?ACd11)cI=0c26D@P4AaF~fO>al4spxOB6uJ7rGK$}{CvBV9e zfze256dj{h6cy7diYEB!dR}4%h6E#%2s0?#VNb0<2{34A1W}oJuz*3=haPJXT?`Bj zjRwF_MT0agW!pTfAW2wD^#qDKo_9$4{BWB~<2pM$?bAOFUggYzfm(DK)EER$N=T zrZB8yTlQqnr8`Ytd!PJS6tefur8$$%99gvJh}XIwi>^HRv**#IMVFokt#fkg&|0@g zq$5Xhvbl4%MsiSffa2yh^Tp=g+98f!o~e#n;fmfGllQlpH=8R5Fs^PhFV;xzoj-s6 z^?C&c;d|dyJ?Cz0&5@Gqk>pr>!_m!fTN*>@@|g~$#SNvm7raVic*U2R$j~%NMMT(Q zp;|%0^k;|k&-SXEEsedoOht36iYGrj;2al4spxL-@gQW6ZpiQa3IN%1-z=)zW65Oa23Bk08gab>1 zy(fJEh5{p#2s0>qU{8xc2{34A1W}oJus}iAhaO`PT?`BjjY7Z>MT5Reo8H~nq|ecm1_#aI<{p`_FTHtCBNui;j4$ z`?2WClRtYNOg0 ztuc9jt9i4zascD%HuGYQ8^D-!IP#$OsRgM}` z;BXRQ)ML|NK(le*21(_+K$}v5vBC|cfl)_kw75|#T7qd2EeDncdr$fT3N58dZQHiWYz9n$ZIvp?MlmGg4Fpc(byBB$G1mYo#!(J4C)PTSeupht$1G*OUybI9)($WNEBIRU={Y(r@3|T-4P8N_t004`2 A#{d8T literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip new file mode 100644 index 0000000000000000000000000000000000000000..8ddd81fbf44a9f09e9f420c3aacdc5b616c88735 GIT binary patch literal 1154 zcmWIWW@Zs#0D;IxUEhwK-zG)?*&xgc#Q6n!1xZ}21&Kw8xm+B%`MIfiC8fDs?3sBf znaQceTrBBDi78x6IYmMcH9~MTQVdGG!9ab*sg5a$AXQp?T>GxPFGOZ4&zs$j}Ez%HB?{Y>@`Q2H(-0|OUWSwU$@F;HFQ z>^I`Cqy50%fC$|o*b$diQaug?>Zld2=&dn%f2(=3 zxpDyG>NfLYjpW|>^XFf$S6~pn_f6Gv?#9*}Dajs5j>R_|-3+&-F_bQ!=}=nSPx*5L3wBu=EUlRM=IO3=Il+^r zL`_=44i8F57+GyJO>+f?O$QM3193`f88EiuODYReaRyZ0g z#b*aIGcbTKC(xeEy!_Gnig>Hx*fZRU&3i?u@>y*yJLwZav>H74(GHE%Xo4q#l} zW?rn3+&h2%{Ok1!48r%msd~=c*qS3H*(1rZ_(r2!qYv}pNjwV_Gt~l|Ee^EZSRi$X z!9-=sQky^j_f1j}X|_1HY)e`J!_rps>2KU8KXaN?C#uIk#ZTAs5;Fs8&{%CWO#{VJ z2N3fEaY|}gW^!tLNo7GQ&hW{boVgGqfXYz=2pl{jjCyPu3}`l{#CW#c1lr^Sj0|od z4U92LW222)v5`zlY=9zT@{LBdMqcKFz{rpUMuxG)fwUV7W*uZOQJ#`IdFzXPla?Hr zq+$yU0Y)YfW>EgXo*;n|V9?MAqB8Sf0fMd%J+>ga7#JEFrGcS|7I)~H(E}c#xfiGz zDH;O2S=m66%s{vfNcS^=c(8N>$>BCPsu_cUq96>)dFe%oDe(rNT+S9558^ue2xY!GGz;`{==f+Q}+2qAXQyp+u3)M62?i|=E|$cKOd!jMAppfl&2?Efw3!(gK$sI~WM*D|X^CE5K^06H z2iWEn{9HRP0i|6T85p?0$_h$Lih=4XXTK4DB`t9BM8L6tweu!L&52o(w{$_=_)ZBG={Bdhqe?oY`wK0E0rOO_h~`{BCr{mM3_NI9D70oN`OH_BZ$h(gM~zZH>y7L zXoBcsU}$Lk1!ST{8oFlm5Jzav0mcDR_y>5ivVkO-fp8s=PGa`&}L>}0AWs`k(qhiC}GsNe|PnpCeROWp~(~q@FJY7X>3DZxhHmg}yTwA%OFsx%+_GHhcJ564D zpZr-AviHuVIg`#DS+wYg*Sa5zt~~j(=h37^m!1f%b8_p@TDM1}BS&$vxpTHga!_@E z;^sE<#pcP{A&y?2sg7FViryNN_qUoin=1z}u5L3g)=2K1KY#x9dIbjId*4(&=WcAx zk?fJ=n0!M~EtSD^`Ai4X;s(>(3$oG}viMRH8JZ?7IWkGb79KzlpIdD-O>+hMxC4m! 
zfjA|#3>eb!C6xuKIK7`YIddU~|I1PR5B9wXqaK?E1DcK2;+y`1!rC1e*4#iE7$HMB zup8Af(-^j<9okaVu=Uo0tW<_9-lqu-h`?rK5@7}15C9x!ti#;bXHz_625XLYPV#_Zr zDJU(GVo>6CN-RmvaLTW6&acoC;^NEA%!^OT$xqIXFV3t=m0-}~nig>Hx*fZRU&3i?u@>y*yJLwZav>H74(GHE%Xo4q#l} zW?rn3+&h2%{Ok1!48r%msd~=c*qS3H*(1rZ_=ck!qv_UEhOf(KI(#i|_+f?MF$Y`193`f zS!Qx-d`V?ND$an(o1D22BV@`^Lk1i!B8+-$8VqPQp6%ghx(Kw%85j%PKpGPL6h%WD zwW1-JlxP6O!sHu{YK&jEq%v#;#=_R(hOM_3WTi7?0b`+4)w2rdb4Df+W>C(+o&tdq zV9?MAqB8Sf=@DHYdMrV7F)%bViU30qEzZz2qX#%ba~n`IQt$_Ov$BCCnSpQ}kZxcC w@esyg&)pz3Ag~1N2XJ0T*MgouAsRqh3V}?d+zPRuiGhhB3n+01n4K9I05p7Ii2wiq literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip b/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip new file mode 100644 index 0000000000000000000000000000000000000000..73717a440d9581a254f5b9c6572bde552a149eb2 GIT binary patch literal 1265 zcmWIWW@Zs#0D-(lUEikF*Y#|GY!GGz;`{==f+Q|>H&@4y&>&YXc27SSPiNO)E-okk zP(K&PAWtBh-N)6>J;Wneh|yn)L5a^Pu_QUesWdq|wIs7DH2}qUw(RQ+ubCOZm=kDx zW?p`2iC$ho6-*fi*cDPQI40IJGBEfsF)(m}l@*kh6a&>&&VD2QN?PFLiGX7PYv)ah zniI1mZ|Q=#$LrTEUNCQ3d`Qrg8RGNfr%d7!D)T<0>Brh7o~|Ocgz2Z0n$;{TuB}{C z7}l{Zd$Q-!ohGlnPyQ?l*?Z^GoJnVnELwELYu%4USDyUY^JvndOHYK>Ik|Oct=l8g zk)t@-+&Nn#IjA~7adVsbVsmfp5JxZ1R7b6FMQ@GC`&-SM&6NWfSGSoLYb5v1pFjV4 zy#j;qy>F_Xb2ql;NJ;icaxA{#=(e$H^R`rmughmTd@XMHdV9gEbcR>_zAx>!T>Y0Q z+I`rZSH697dc_;=+}$!TaU3Veu)Wcto z_{%agbHDv<-*+pd`-bI-?Ox1<<^g+mH>dU31=O`WM@7y0JKKS`ea1iMGF4BFyeBL> zC+-w|a>!=}_s)V{6WSR$BpPS1c3kv%#5VKDwOI!*N@_7q*_xh^!SIOLVarSWtBNN-18Ou`Z8S}D1x8N?5c30ZN@`hVa%y}@WkD*= zxXPQHxez0=%26W=991HWdTbgDXf_HQ>dJWvvl1WJ!poB5` zhNIfXD&y^`3|rG@IczO%*m`?GRyso#|JKB$goK2aq!cp_Wo9)M(=!1ZCoMTLNyQcz z8jMUL%%JRwJ);06z@VWKL}li|0ta0mddxv|F)%bVE(eAzT0Ej_Mvnx9W=UXvMT(LD zZ&o&tBr_1M1JdfioDYgom>;kgIUqG4umtP}Z~=s_1-%%7XaH%62QrZg6^Q*z3``7J KK#4*YkU{|Cudg-$ literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip b/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip new file mode 100644 index 0000000000000000000000000000000000000000..bb95ce7149a6962fe7c564be633bf1731a4e6993 GIT binary patch literal 1246 zcmWIWW@Zs#0D+Q5U0>bZn+mjnY!GGz;`{==f+Q}+2qC82QX%HD#3CsMB_5~5lH?5E z{M^*Ml431hF4p{_l++?67C&c?@BkFk=0}~-abad)0AWs`X_WMJR|D=R21DF&*moc%`pm9)Ug69LBp*3O$0H790C-qHnekJqnTykOq6 z_>iC}GsNe|PnpCeROWp~(~q@FJY7X>3DZxhHmg}yTwA%OFsx%+_GHhcJ564DpZr-A zviHuVIg`#DS+wYg*Sa5zt~~j(=h37^m!1f%b8_p@TDM1}BS&$vxpTHga!_@E;^sE< z#pcP{A&y?2sg7FViryNN_qUoin=1z}u5L3g)=2K1KY#x9dIbjId*4(&=WcAxk?fJ= zn0%vAt&x}c;3S>}l9_4&#uf+CZY-E}kikTGit3NYn!6`0IWkGbmH{;YtTvjaxdJ`j z0mS@3oRV6WnVcG5Qdy9SGZgYBXD-ADhH}(k00)8yqaK?E1DcHmqCab%0BuSI201s7 z2F44eVeUq)Fb^gr%%wnqzW9cto8h)JhSKFT9ZHKEN^dWCmB#RjFO~CO#i?~FoJU#w zR&dHVyo}`+ZMd{j)w2p17>rCJ%%EhCJp}?Kz@VWKL}li|!UkO*dc;9=F)%bVG6Dk@ zEgI1^qsIY4b1_geQj7$6v$BCCnSpQ}kZxcC@nA^^l5s|`dqDxT5)77rLkwJO npld-d4G>!VfJ~&a0TwsdeTLAs6IiUk{fQh%3xR6XSwIQ_28(|g literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip b/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip new file mode 100644 index 0000000000000000000000000000000000000000..eaddc47759a9c0fabfa64e0a71d16108d8ee6781 GIT binary patch literal 1501 zcmWIWW@Zs#0D-DTUEd>JEF9KAHVCriC}GsNe|PnpCeROWp~(~q@FJY7X>3DZw0HLF=xTwA%O 
zFsx%+_GHhcJ564DpZr-AviHuVIg`#DS+wYg*Sa5zt~~j(=h37^m!1f%b8_p@TDM1} zBS&$vxpTHga!_@E;^sE<#pd4HA&y?2sg7FViryNN_qUoin=1z}u5L3g)=2K1KY#x9 zdIbjId*4(&=WcAxk&^6@cGErri1N~-rLTzXYYOc zJv`)wbm`WcovU@W-A+A{zu)q6=i9SSYIdu0>%5)jyDLFFE58{G1?r`+;Zxxe1F|MK~{x1THK=3BR4zL&?<{+H1` z?)RV7X5Tb_=iU67|7CmquBc;m?;q!vd7Y{HHeYPlZSL+Y>!TeR$W@jCFHDRr&*- literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip b/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip new file mode 100644 index 0000000000000000000000000000000000000000..f51ee2ee182b35363f73e1677511f2e2b8f03e0d GIT binary patch literal 1254 zcmWIWW@Zs#0D*=^UEkckX=<~9Y!GGz;`{==f+Q}MqWsKaF1D1^oRY+1F80ja_{_Yt zd?EIv{EGPe(h?~KC0-||{ECpG#Ju9P{Gwbf87_(B{Gy`N_+Q zHy3+Reo10UYKjDdmJk5Reo7s)nq|ecm1_#aI<{p`_FTHtCBNui;j4$`?2WClRtYNOg0tuc9jt9i4zascD%HuGYQ#<9&=kTKlpY+>2&Nu9?x^_^?Ho#hhX8?5Igg*cniR z&uXJ-nkz8aI)IoTh*MI_GLuu|ODYReafV^uk)|Tw2>e;F4Sp^IWMkWzvP#(pee1Q^R(9j5?GV@?TgRT!f-XOX_ z`Q9EFu4u7{t{FWF5SljvH6ukxfHx}}NRk-{*8%ChOduY@IPB#NNDT-q0s8@5>Y!^u d&)g6VAT9fVOr)$1v7d>7i6ILpk;Vd22mnJ(q{RRL literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip new file mode 100644 index 0000000000000000000000000000000000000000..668efa6e1643b587b301ee6303bc589b22495296 GIT binary patch literal 1077 zcmWIWW@Zs#0D;ysDdfu0Nc3TL$&B7P`Z_o zfq@IGte~`{7^tpt_8ak6(gG(>1RM)kJ8x3doR}qfOBcjFUcYYff_c;8LxQHv5T74E zWfGTAnfDn@Kh`esbQQ5BOh2XEtY%qpZRMK6u#RonlRcO2G>wYY{^5oB+N0Sy^dLp#W$*n_c-5!yS9L34z&eD6n+I!$IC^=e zI%;Be{3}{Q1}G6&Qr?eN*+EyRkJ#O0q|iWAP0~H^Xgd z45iCwI+PYSl-^$ODvjY4Uut4P0)td@5|jF&78U0eoG~}vu+KhlB4W~%s7XuM8Bhbn zYNKhID=;8BfS4bMQ&P(^lT+hMDhpC^hDzS#%!L@iQjQud;6M>!)MEq2Evk*ew-ldW z2HNBbi~?>T4U8vBqaZLG(P9l!F6QTj6#T&>(%B&FknHZQDvVam> Jfw`H10RV1=Sb+cl literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip new file mode 100644 index 0000000000000000000000000000000000000000..126920673705348236aeaa9196b31413c2171484 GIT binary patch literal 1337 zcmWIWW@Zs#0D-nfUEg=M$NxkE*&xgc#Q6n!1xZ{iDTyVCT)V$9gSz?Z8d_kvymH-z^ZhlIt5=SvqnHC2Z zOJYT4u>@lPirofxn+|v~GcbTKC(!Q9y!_GXO1jdbi`}j zk40CW{Mqwp(xOXGgw{E^b!e^IBhrzhIN97eTO&EBIzVxAoB3k%VC@h`FV9p*t#Cze zjmi64&6~}Y0~lAgnHOs$_s*X`|9ZUwgYdm?s-ANA6#P(O<^ z#GXB3KkuqKwgvwJru}I0Yo4ScVr+3RZA+8SiI$lMT5q$kDQI|l>3UvbW3!q4teB+>6rSY~@hONtII&3X&*m`?GRyso#e`=@x3lm_7 z>p()Bkx7IZlvA)LL!bm0G&F*!%sg0d1bCzBLys$nE(V5%MqyxBqQxA#X7pf3Xl?~+ zMv8<0Z&o&tBr_1M1JbQbARa8$K=S$pzs!yKKv56|<+t>r#FTggP<|Ic%HZ)JVMN~N z1Z8@#7)3cBR0eN`o7n6Pnp2 zJ*7@2WY0Dc?GoN2a`UErL~!_r>v`4f`Zf2f=YOB~arW(M2mco{pY00ki*qTlUF_*1 z@BMtP#lzY6s%O?Qp$3Z8M$nig z>Hx*fZRU&3leI$}y*yJLwZav>H74(GHE%Xo4q#l}W?rn3+&h2%{Ok1!48r%msd~=c z*qS5RBgrxOhNGI{)-(oFbx9}F;)Kg*7_P{iI>NC;$a~TkpuZWJM3_N20DA%iN`OH_ zBZ$h(gC$3Feduun(Z#^f&?pKFL$nw}*Nh(A2+i$4%}9YC;LXYgl4J(LbwIkF3B*Gf uhdmdA)PTSeuphvA8eI!|_Jn8vX^8?dkuod9ekKMchAg1O5nyg+U;qGP^<|L& literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip b/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip new file mode 100644 index 0000000000000000000000000000000000000000..ccdd2257ffc71befc65ec1c36d7e5032b92ef518 GIT binary patch literal 1238 
zcmWIWW@Zs#0D-BEy1pBiFbXCD*&xgc#Q6n!1xZ|-nR)TW$@xX8#atXfW>S7dYOxR- zL_~-c!evj+Nh~f-EtXj&fG{UeIx{c7v_vnjpbDmp1MJl5dq=m4F)}cSurV-j zft3}MmJ|ckRnC4R{z_Wl*lo{glBRQx#Kyh=M`C{{6?GQ&V&s0aP za7Ay8$@^Q)o6VI27+1HM7i%Q<&YwU3dc6XJ@V#%Uo^v<0=158QNOCN`;pld<(s*ks z!`J0A9ljPfe7(KkRXW2f{=XBZO-u9lt@+l&ANq04%t;|fRXsHx9_8fJ@7u=HcWwgn z86JlX4;VzO9X@KzSE*}MW|`mcD9|pmMp&JruMx=7{4;6El}ReL45*=Jwb3-q6&PwA zK+F%sDXC?d$*J)rl?ACdgD`J$=0c1xEJqDPa0rSp>al4spxKyy^+_ooFuo1}F*ndg zV5CwSVaKQyVZ~%bn4()MLn$z_N{bsxfsvKQ@QUxLr?>ZJ`|4xI>#q5DE-2JcD{S)p zl$Lq=!3!SKgoKC!hAvr!9R~!P{2exS3bPz`zSFXV?+BxctkNWi7@_A@asF=PQH!dO5G E0bfI&7ytkO literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip b/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip new file mode 100644 index 0000000000000000000000000000000000000000..928a74f90cec5d452b12b372cc85a909dba4e5b2 GIT binary patch literal 1029 zcmWIWW@Zs#0D+l}y1wSeGin5YY!GGz;`{==f+Q}+2qDHuDF!7L=hVy`D~13RErIi! zo3xo37(kd4s3kKmzqCXzub>L1j00@)3VyDgmw?i)j0_B1U}XiRCB;B>m9yW7zmgU> zc_QFgz}k6}qUOXb$y>T0?(zC{ixH*%*#v7aY{_iE-1<`2rfx1DGorf z&#QdJQcj>nU<|Y`GY@F2US2^JOc@8*S)P9;sm^6$U@&B6VBi8P16o}SR9CrlLbgY6 zp~SKCGpDZ=O-vLF+T?P3isCG`YuolKdz^L-KJ0ig_1!I&yHDTftgk$r7~HwYgJZL$ zj<2`LiY4dn@r2E2Yii4!*(Wt|+N{Wo)Egg}md01x?fGDL{_y+5#hz;RbBcJ5pG){z z(|c`^+Rxt=f0|PEO*npC(z7c>O2jQ_aqJf1DKo`%A4h-jnmp|%>&8O`itlya*B{YM zTgiJ(Cpd4(Esrea&1+_cv`tc(x-*7#WvDc-Sl59$zZlj_*8kr z-y9rsK5ShpViK~>^R(&jQ;{bNS==7@9Q1eiapEJ>OrL364rII(_&DWM-K97B7asrG zuVL04wd5ho?1bPdwng2sjRn&b9$gjaX*_Xvq2u3m*KWsoF8E=4>+?^;mcB~{5tDhp z+rMqmUnnY+emQu`_f_(>-^-jr4)esC?dFuc^I7B6dv;kDlZBS^`5)wcSoGzxe?ZK) z{2ez{o0hOD&TvbaBzDx~$~!xjulnr*=?jFOw(huiAdbCcZao9z#gFU$t`~Ncs(Cu` zVYswb_B^J0XG))3Xuq&oaPk35QC`^+A&=MhIzwj_u8^Fpl;gcm!~OH+*1*n1>o0$w zu6?YL)WS-jt&jiw{;2^H><0J?c$QjUlqHmsYIjf+9J2-VFLF(9Phe54v4nwW-{tq zd{X@RgTxBHJ9n?L&zsZjuzl(QvvjRbOSE71{Y-nH8Y8fMdf2X0Tw;rzJ$ ziCy^L&3Ptv*SagD`*xY_mrXLe6twDuJ!&Gh+Gv{Q3QW8mK+F%sDXC?d$*J)rl?ADw zlsx;5_$z6FlP3a>1+1MnDQZs4lDwr0@+N04T(@|^ylL?vK~rXk&ySxniA$*5`;4X^ zYnOPsir5mSpHj1|xVCbMFzT^sFfgFn$i|#e`xIzXDkB2}H;{&8S#Z9=YFpgn^%$0w zp<3o<1-5Pq!#cKQPxf58)8w`H$)80bd+%JDGwIBcMT?Gjt^2X)%9B5P9!*+w>50%f zC$|o*b$diQaug?>Zld2=&dn%f2(=3xpDyG>NfLY zjpW|>^XFf$S6~pn_f6Gv?#9*}Dajs5j>R_|-3+&-F_bQ!=}=nSPuD*+7!aK)4P_yRv|I2;;Dq&>%G+umtP} jaCwcc1-+1kXaH#`05Xw^T!{Tl3``7JK#2vQ3V;CsurW)y literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip b/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip new file mode 100644 index 0000000000000000000000000000000000000000..1c9f5abc9bd0c552b83a644a28fce27f054dcd1f GIT binary patch literal 1241 zcmWIWW@Zs#0D)zVy1u7giY+k$vO$;?i1Q2d3X-@OBZL?ur5Kd>oHG*h@=|kxGfLCa za#FQ8xmeSS@=FUO7`1r0So4cgQj3&W{G2_)15gYt>3PX&$;`k2!kj=uGxPFGOZ4&z zs$j}Ez;<^Wy)gMEP}+x)fq@IGte~`{7^tpt_8ak6(gG(>1RM)kJ8x3doR}qfOBcjF zUcYYff_c;8LxQHv5T74EWfGTAnfDn@Kh`esbQQ5BOh2XCtY%qpZRMK6u#RonlRcO2 zG>wYY{^5oB+N0Sy^dLp#W$*n_c-5!yS9L34z&eD6n;Be{3}{Q1}G6&Qr?eN*+E zyRkJ#vPY6*@{LBdMqcKFlXw_DCAAfT!;}2<*2~`4g?WKJvI#nG#inFoEvB( zFkVJLm@_hoFoTjj_GAZ?0E31`5S5t+3y=VBRDI~N1<}R8(9p;TOu%Syhprht;1QaO zftrz`A;6oJ4J647gzJEGDHDhXOE-{AJ84}Dvl~zpghBZ$y(lpy-T;(w*&^dX93i#{ zFb&MYVyIcz9VnZbnpc85D|;{^+>X7B0NMx!OTYmJE-}!xpqBs$EularQh5LiV(cD6 VXxj=bO5ol^4x@!YHA*ZXg#e_TfExe+ literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip new 
file mode 100644 index 0000000000000000000000000000000000000000..f0e22405a92f76c30182684b9df4bffee2113f39 GIT binary patch literal 1275 zcmWIWW@Zs#0D-lQy1vDmxU}SfY!GGz;`{==f+Q}+2qEU;(p(`{AW>W>#h}FFoROH9 zmzon?l2}q4fMSCE@25UOEDQ`F%n39hGcUiiL@%$P3Z{$$Y*Fo>pVyZ&GBEHkF)(m} zl@*kh6a&>&&VD2QN?PFLiGX7PYv)ahniI1mZ|Q=#$LrTEUNCQ3d`Qrg8RGNfr%d7! zD)T<0>Brh7o~|Ocgz2YLo7F5UuB}{C7}l{Zd$Q-!ohGlnPyQ?l*?Z^GoJnVnELwEL zYu%4USDyUY^JvndOHYK>Ik|Oct=l8gk)t@-+&Nn#IjA~7adVsbV)JC}5JxZ1R7b6F zMQ@GC`&-SM&6NWfSGSoLYb5v1pFjV4y#j;qy>F_Xb2ql;NcKo_Ouo^m*2w!g?R%R3 zNrqVy)0j+~nRD8BS4r|X7(4aPP*s2bdPl+iYYY6Z=NB zxdL&ypH&}zv1{8UU9A_BwfXC=@8|R8--_SeH!ZtNmVNoH*i$$6=kI#ZTI=N0|5jwx z{hO*jGq$*^mmj+qetYV-RF(Wq!au~*tY5w|-5p=F`rFHyZ|;`o-;CYOdC6*<(zof~ zxjN2li4EFvz)yM6c|C{FlkYBH{r|g*Z|}zFFVkw{J%f@wC&@9OMyJ(A(==CLG5eL!5z|hcm9vHA_ z(TJ`YJq{3>9f8>zDMkXkS=m66%s{vfNc%B^cnIUL7d{|0Ag~1N2XIk@t_8icfoK3} ZaRM@t${vXQObkp6SwM+uVBy5T002})%jp0B literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip b/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip new file mode 100644 index 0000000000000000000000000000000000000000..d19477ff24f9401ae1fd0046e1b4425c286f02c5 GIT binary patch literal 1208 zcmWIWW@Zs#0D(=7y1r4J*Y2tT*&xgc#Q6n!1xZ|t5kicSQVdEg&N-O{T3lSrxrr5` zKv@PbJ2MZ=4nQ#|e66*kE;9oI2y+5eX6EIWmgwabRKb*SfbHXd!`gQRDDBF~z`zAo zR!~|}3{+P+`;GW3X@Qd`0*(c&oi{0JPRx?Lr3>O7uV1%#!MthlAwg4Sh|iCoGKov5 z%=?U{A8VI*x{BBmrk_%6RP$(~Dhn!NTt`Lifw@109?CY?F5Xwea` zbw3tedGcq^qe+V{JrP>xVhUT0(y zVFo2_>`4tM0R|0?ASyEt782ewH}NJCu1)HfEIzl5^y+y3j}m6=$Rd%#S_Rx%KESv!R{Y~wzGE`9)mJWr;aLjFD0dO2W=L znFS$WQK!n_&dkd% zEz!#>sDdfu06QU4eA!EWMh1pXAm#!qD<~}~2CA!^{YLziw7|&|0mlN?&YKi9CuT|B z(gksk*RNZ=VBWO&kf13u#OKFPnZzYj=6y!fkF`rYT}5mO(@!Zkt65fDTe+q%tYcgD zWY48LOSbW3LZDW=3)>MYC%V#=#EpGUFd%>%8hFAPo6Y4GQ>V4>KuGLvT zZSVc1D!1eA36~wZnN{}r^y4?LMJK=IJ}0~3@kh=L+MZ4u+l4dw)7ae8HBP^5FK^WI z)b+f?%zzqfRvS&zT!F#W0mS@3oRV6WnVcG5Qdy9SGwkvvXD-ADymHjQ0|%W5qaK?E z1DcJ@*{9e41=^Gij3{m(4UA1nW2%i>F_lb8Oo1Y5@(o8dNJMP~M%32ghOM_3WTi7? z@n5~M=6g=;8Ejcnt z#TFP!j7%cTpge^=K?5bgprH{&W#+*G4P764j6!rVFf=rp14A4we$h3fM-D>sN}y(> zs0#3AWdlhv1K~O#eVz%#Ll}p>TmY#7fhAx+fJ+HEc KfD&RXAcX)VtC6$- literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip b/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip new file mode 100644 index 0000000000000000000000000000000000000000..a1768f9b2d700f7c9eb916fe9d639f6114ad7cf8 GIT binary patch literal 43168 zcmZU)18go%@HSf8wr$(CZQFKFZTr;jX;1A_^R11ywr#if_x~<(zk4^C&F(YLBr{L4 z*=#baA`cFM0rEeiD}+%|B`RP85)=gFf7ySSvkQ}p862XUi<7aNv#qHFkqrv_^Bs7%&wY0VHF?O?b_i%Qz!~-EC7k74Wur&7&b+VAKb@y+sc341Rq!{tYehhayHMB9^1bbZ3>{VhTk}BK2O@Hd*C&8kud;g21e#?1KOf zTX=FRsaPsPPSnnu&}{RHvtY3BM^f|3+DYJb^K8$~i>F~dVXN2TZ%_TVH-Xp0>9(zl zHh|}|+0mNOaq|h?Y@556r}x&C#nq)<*GsSx^TyR_TbB3sbkNH{Pxp%*pZoF&oj?6g zAHjaF`7;k8pUZ9ZF_HCZ&u?fX-hcd;UY=Wl5}WIuH$>Ld+}^9={+lC)uRkwzuGAYp zUwO10gD_D=AYF@SO9~0Z(6;s z)-!&506%`ZmitcbZc%X!zXkigevIY9we6iL`RyntO?*e7K%j_K7>hOQyl^Sk<%l*@H4&+y$t%=Z7

G+k33l*R=flE?utiC*lj0pvhxHN6Q#k`GY%xY$b)sH5ux_Me4 z+}k+CTOL6X4o+B_bEP|l4zMvA63MSl13SvpluNmS&m`6u%9wh_1seiF<7#^5?1BpS zwdLHC$6-q(zIt8)w%k4U-F-@RUJ;m&m6f;ZkAbd0|yx+XUf|-_P6_M z;`mkN?SNd)r?H{v#v})hXATQ0r8QobTJ&P@m(7L}?z7lEChW*b$vQb^*qJ{H&n;|> zePF{hJwok=r|CWlXigv4rF1D@ZgD}6i?Z?}eBF<0s%ej)?z^zt~MVp_AY=;8uaR!si-=uVcF7{YluO z`g>qD-_obU4DYqTGHoQw=G>p>muz*$Yprc;|K4QTg=g4%dk;nJXxgdEx9Fex6<+n8f3cBeARm%W(L-Ia4_#Wxxp12`Q5O$t4x-jvC z1J!}twqHzVh-wp)l-(QXGl%5i%gg+jK|oSED&1?c0N%40dap%ZoMy51r{T z0e(8QwAu&I1(*v}I^#o-ZegFyh#7UwZFl)-uestW5&c=HYkJ`cZQzz{K7k*nnzkY0 zRWOm>@Lo5bRnN_8;fMAjiDqj0koPQk8MVUZ>d+Sa3#!mc!9YE}Q~_eS{ZaP_AgB_g z7B^?fnf<|n%lTV9XL)6fA>fCkrzbOxW_d`)n@J>6g*+e}S!FvBOJTzFWyR_dUwq@GU$h`P1#uM<4MOw%CTI46KOoR|#8I6(8v zCa+gZmRy*7;g7SgfwY-Wq?nUCr}}{@@00NT^XKFhTXp%?#f+%8COqy%&E92YKF;q! zKoHTPdGi|ix>ZjI8@9orSk_b+U`KdquMYY<4+Vxk$}~}vdC}2SHprL^-enQ@6M8pdt`7im~m$;ihFuj z;zEw4;28T`7^M2wT<3P=qGn-SRQ1NKn<2RtGqPxk2XSq7dnObBirmqG`*PCk^dx~z z1KM#U`7ccu4tt_($LUZ7J=gf6cYon{(vW$0Z zzdb{5^n0IX4t8D0Aztuo{CU_#+lr3&ZcO;#%{`|3I_)c#7%M zSU748N074c)q2CL1-~xfQAElQqJUGA<6u%Yd`Zg{D)j)EEP_}j?7bLb!HZy=6CT<; zYx*%q7y?jQ*rVrDNqkp`-3pl6kD!e0%^qNAfa2fpp1<`c?fTr&s1W)fh=(4!-3=hG zyVkVc69zQx)-dw~4vF5|xt3lX3^w}RLLOQE+g)lIw@GD+SoA^VGH=AVxHu#2%1X@c zs>LX*Yd1fFr#)^h&P|+N46|-z`_xw!6}n~nu6t9e^-H^bc<~dc3y6%#^1<`z13Myu zOzsK{SSd3}qN4k=(YGA{uiiikOukjx}5@O$gjE#OL#6``~u2AZf42VY?((d$Wg zs9`NnDv~z{9X$jdL;#^dn?T_ZtFz2S_wtGdq9eKNQU1Jc(* zq{aDr$}hG#^hwK_=j4su-{?iL>`bLyp@>H#M5R;c$B0TSfI+P#5P?@ zRse5h>zPt))Hg0H{N_Ot^0_#Fy-wgwK*j@tU0Hg&JL(ouTpYUazD1y*z+~23{~&K! z=MBd<(e%*oC4*smjiJU4$T;$FV6|b%)L93)K>I3n&EoH@xhL1bQlGji;*lpEyG6hLUfGP(}U-DsK)`ehsg)YIm^?A=yxRaWN7gM zb0h$C-=VAr63%zgM1u?atQ~nzAp$p)DbX5le2%cWIU6-h zhJNidBX}9#R54GYy=0LWx=_STfS|edF-#NSNf)krVm9-!K~<(RJG46EufrgOs=jG{ zrg(v!w(MEi=-~~zQkxzADrM@rJk#5J zW95pEFQ>U3fin z(RX)1_~B;1K_R}6<8dtQ0rq@zRWpvFP*k*W+S)7>`sbHXV%~s6-OrM=j$l$m!kq&_ z3tahbn)~Tl2N%v%3&8&c?Q$|^xIGBdMn_O|dst4OClJ;5TJ@FygT*$Nw}28s+oaim zr*5Q>3W)yc|E&_^jO0LAV>-52E?mJpcE9O7B}XHw>j$2f$h#1eoQv_V1srUewZpRq zL+zt<++&a`+E_PXII2?z*Z{)rgKIf7#uL4TW<`|W9k3gWH5qA~c2O1xb8|VU3eWm{ z-G(D~JX-2$gmFzo8}&DnO+Q!+UkMfDM~e`_duL>Z+mNT-gSVs&TTJX5M~>~A8aa4khP` z(&R)p^rhp$S$bLmH$dYIX2}+tl+#hs*VpYenDMjQVDK*7oTKD$d9*^Q zWoo`o7L5^589KqP(ttU->_H1P1nQ7$Bq2Jj=<_g`R1z6{;8Rbt?m28qGqvZn!zZvo zI!w0gJH`1YyzLL+0Otnep~DwUHR(fx;R$kiG}>_>f7~7$Jpqo2RaBppnBECTjJKxw zqXJB-5o%3OhK#E*_g(iH;#Y4PVq%8KToV|mxw`9v9t2ZIbko1XoXclv_0u+t;uk5b z67heHR5Cf8Mtvq~!uiBVYuYg6IaRFo;VC&}F@eTm@{-F{G)a+uuO3__eju3M0gC&K z@r`Xq2y)vwLoc4JTyb?_OK9p&D7F;+~Sycei(hQ#!jzCur{T z*QZI4y1orBL5W!Af>_7ig$&zR*?;dT(+&J&sGiJ$fmqlpWliHvJqE6bS z0=QQwN@M+(7=iJ(U6wX+zlYyBDh3=$h<=lH*_LEd!EXm+vZ+PQH@s7L`_ItEF&NxTy_mymfLK z`f^opb0lJr@Fv-&L;fR;Z{phl3X5OSsOfs;L%bYoiNzGX91t7Mc~kthBV`_QHh$Pw zMTFamk9y9P5R)!YYL41E$0!_EcJR|6qERp+J6$S%gXwxG4GVc86uxH~)UmB4w)0Mn z;o(M!L`gVuqh5IHe55hMv4S5EMJL11f{@GUBx-6Wn27sDN?oIHdEmOhkP~ysI=e$g zmZ(yZ4+Uq>N4nUG+Pq+t8(^>xLoKMt(}Qh|4lSYc9f^!#&Ivh1JjUicN(O3c6~u$U zn6IA9;F9F`0^fL(v6#%m1r}#XcNf5eVowaq{&fyU@Wzgc{8!oh8DE&@{-rN;TrVIeuS(rgNf2MmKtb?lABO9vBEKQqK$T!g%J}5`@?+FAfkhmy)7*AlC zKkT*xYwTa^u7?c1$MG|nVxo?+7kePfV$^rz!|4_(45Phh8wdaIZK-LhkApbbpJq@A zAvDu=Td{r>(oV^aj;z6Lrp;u@D3>TSJA+kJR+B(YM`M~>)Na0j30UHj^ zAxVo>Ym`Dv)|UF6my#eg(;RAbqhVty8v@e$;N>60qYCjX`ch8&jU$|#Fml-Jdv*=& z@M~g{C6VAXPfIU|>Dj^EV-HmvjH5KDvzHS$(9JshAf>Vi((;eGj(h+@(?psHCWjc~ z%p$M})abdHCA8)iR-uDpLh-g2r1nEtwqP}^_hE&?k|O%`QRluDY54mrfX7rbE{m)a z1sBr-JqUY$bdaG#88-YylUr>%e`IvrfLa|Omd@W@3dzptdSTA3B=Jm2-Gs+@U66dM zS~jEJ#4yx`!nm>r^PWC?=0`tpAr3oU5ilA~TZxjP6RJQ2uWyh)B4?qH^N=f^o#Xdw zHMf7q+qd*O(TQ{i?GCW-axp2%2R)*ndf@l 
zHD;w2RK1>n;-6_<3ein=Q?z6+FBZ0l1e4`Ks?i=>5LI&{ng3JgjcmtPt@-w-^SVJy z58RB%s-~sTC5#_Q3UCe9V*CAja?X08pYU{+>LWFw%!Y$-N4ZjbgDGb6ohvL9j$3{C zdCeu89a+ts|MWE7-6qCaIvt<*!xZG1K^&Hy$u2pr4^FQ_Q998bci{>g<+%_PL84|J zn{mo+LQ`j29cp9&vCLf_7V&+JWt}jzXsfz~`JL@~ODN&!;z)PEL6?eMF!#hbJSvjX z=UYE?ZwaIcVyl705xnWdQ8eQ~;y6DwtVkl410 zTJ3Ft3!<+B5wnq>iqtxwxRj6B+X7R^#F!xzKHdkf1**idONQJ6qn;zTVY55cT`fTV zqkEGl!CjbzAqYnT}a3c6V!J(S4>TKLqkg~5h>L?+;2ZF?{1QjUL8 zuwAyK%eg}Q&Xq0G$rqY*NcWYUir^xc;@;<*DyVna3TY@vWsWvvt}Z`%1mT%fJ-K%F zA3`3lOx&t&>9jv_r1eCDeYKqjXAxwq+X#(r5ONKhM2ksfcKLn&UW6El~15l z`I8%uqi}>>A>k577l!02aw(Tr_AnUS++H7XR=n_D!H>_%H#ubO+g0{eN*wVdfbFeb zv7R#}niC?Ax^sz#K^(Hz)EWi&$oGqBr0_s#e_ zKEchF1<5&d{2!Xl&Y$5u%{PKSj{`zN$wG%C>Qa$BL{xtz5{%d}?0Sn0PSF_n^amuN z_6y^L@KZC)PzR02CChP8k;QqG>Tz$-Z z?GsIN#Ttyk{u&j7e$-KylAS|i_e$p&t~Djut=o-##PK7-QOw&lqr3_+4F75({l@r*IoRWr_3#70YPhCkS;-gm%p_h#$V>>H5h?Urw56z~0iJq=$Loo4zj z?AVpjPh|}H%L7s3HG1gL8&3%^ks7zC<}o!99+C)R&($3-cN5mZogMLP=&6y3-r;r_p)di*Q780X zNvMr|4%v*HR6~s&`$X$OG+lTYbTV072gvUFU$&?kv7MDVjQA zBe9dr>sQ~pdzi`kI@%6qDEe=+ul8lnwFH4)_^zpC1r=wu9ssbFyG)q_TOqrF>ZC;& zt@EX_YbO%_25PoYmYdJXU=U4iR@uJdA~DcNMg@{b_F?X^Zp7?MPOIGfOL%7%rhU4w z^U)0~OGq?haNeV5x!B<-ox~MIKQRj^WYDMb!O0pNEMlI%$Qq!<;}M8nPZY!0>H8%K zRe|Gk4)rpn9xcwzwsJNX(H!98+FqFIc~JCqfu0I6ld9LjwiGX5;w_%tx&-KFIEPb< zd$YB{jB0Pe9Y+_td(7eqeeiHK?nar6q*&93cCDb%2-i~Q4lkEAc~8M;=^PgGe)ogH z4|g}sM0&Ix!>-|;6uEQZdfylosk7qxsuV=`Yx8EQolIWX@=i0B0adU}PoATnZPTwX zpH|nteE9>|hX->gKLoeAX^6CY$KCc;b4dgEKBpb(&c-WrZ=!}RlGmW#1wwppT$i4G z(VmWI^nWKpbCh|m-G6{7^xU#~Zww+luh);sQKL2j0vaFJnJ3)Wpu1vGHXo-VWJU2R zY*td z$hABzL~hi2WIwS5yZ!M%l4y@<`~zVCg{vkpbbyZ3Kb6SX*9@a0n4BzM_bK{K10bn-xE^CJtZZLiB0e!hN0)K^ z(S~?NCs}3#D|kHTQvCs|EaP$U*3Loq0WHQD@`#pug{(vYOpZmXeYXc5G`P{&=C74p zt3iHj`i~;`vgr`=r7+R*F40fumiFGQML~*quXzcqS+!B?k zMW^*kZ%b^1SF~-5wk|*w5q3oG1xqRJziHm^JQox6zz?(YA|fm~#UwPco4%Z!o?>9m zcc?>`mL5<}B2_8KV`STqPeUc}SCbxce$^m(O$I0S9_UB~8l{4uN^G zwrp=G*iHDpt_9*r0~F0)qLc?BD!Iu=#VL9mxi%U}P!2fLU7lnhEr;UEah6n8- zJ6_Xc_dV3V=WnNr>he>Y+(`Jn6YMY43&7VU!k`RP6^$T8Vfy8@^;B&hmM=K8;{Z|OK(?ydLabSI-Xs4AR7VH&Zqcp)~O5KE2adC~gCPSvt zc3r-$rEMkC_F*G6*#bmp=2F}S5oS%Cch*rf$6<34@B+uFOP5(qE=WymTx*OqB1~Z zE>X4z@BCuJ%69*UD0NT{oToWFdj87^d;CVJ6M zv$@a{+C55AhTMDn>K96S!WbfT1{=NR2Rh=FfD&{M1M9iBQ{f+=3 zTraI^%DcUzQ2e3oCK4r|@_a#V@W5^M(ozf4N0@k}d?R`qoY+n0=0Ho$L`8<`4 z#!nP;EJOkN>pQ%=!V)lNJ@?^;n}i5nYhLV+e?Z=1zeYh(vaVnboA#Z^o}S+cSRG5B z%DwE4qY+PK0FehgExQp;z%O|JU8-JCy;>P_-uW{@5%=0aR9cv3+Q4VZRt+)wBgFgn z9xUaODex17*3(hc-+<wlRD`xakjOZ z%YMP~UNQca-HUr&p}jb4WSWPs2H3(9ZR4H6qC-uYo6C{gsDB~}Fv6Fjiksuk6|r04 ziA`IJ(P=g((`W0%>wI`tNt<=#+w6r#gz`RJ>#f}Ue#<)B6Gb|MS2S5?8XRPLD7B~p zQ5gp+9}`)iTKR zG&X8UPQ=xumK%2@CC>D-f4nLi;WR=I@us+gP|DJH-*OG^X25JXiZ(G=fF(U&E|Z zDed)09tGlG`RpjY&Iy()=Gl(~!8mrr!_DybDzFR^pDHv)H(6%R>Kw^qnBtbBh!?6~<-`=9c2r14H-n*vGLL~Up*3QpwlzvtY$wpJEJI*l z%3pKsQwV4$)hxl*WTEy!3uPrbd`{|6eb-8-&e)cfP*nRf`F3MS1{AnlF*zRsq&@YG zErJ7(-1*?e13yGr$BI_^aqB9}c<}sGu>|Xzf@V zCq5z_B{<}e^sIk+5sk$lb2eieQIs48(vHzK#}}*zLw9`jJcv>&YJnbR;Cd5O4S)O@ z_ZWiU4w#OHZuRMF=?RBZ(s2~VaCc`2aFjin?B<%&hwH&}RP2``knVh*T@f8T3PR~u z%yqE>9z8DUmgu2X%ydybwwQ*_EBLlAWk(W^*~ec0j9US|hm*i*X;e=Db<6v&x)>ob zxijQ8dkob|+|rxykbaM1skj2S75CPBz;&Q<$SShBzAiRV5>_Y}q0JtK^X7x{U^;{> z)_W~P-k>ajjt^9r5~NPx3A_LoT$5_C6oF1!yepuV)MJ;I=`zD&#QkZmSuPSQU>OqM zssFDBQRSgAYf6%56tZ(+zCT}LL|y(2mLPM~bOy&^7c<~iRTk^hA481Zt++~cBq4E> z-1bb4rvP{MaT~C248TgK@Lz1wZ*}3SFNW-)p4ZTiQcfLr0ZuHYGMhOhEL;D6Eu<`v zOj3DR48|dEf=_bdDhgC4ahiN~b2SyPS~(R1_M)9U>yz**>rUyY&ufGy4<%I`pxpAT67Ed#a7<;#CC5_D>&+(4$ ztQL{Nkg>%-Q%KTSSO#YR4-K*!5MI7cEZv(?b$8UJ?-OwxkYNoCG;p!J!t<3JSx-iW zNRzM|GN9CZ1}G6Fj6b^#miQBVHG3(onU~0hfRHlm#~Wq#j2Zr4wL4p^qO9nmSUuUU 
zN#fa!OrcDbCLS$dVe7FD%~~jg|Exe0x^PcFUGUuM-*hCa@&ECajyd5o%foD!hFx$1 zA28+nZ>SKKv*N~kn2@&dnRaIdc=0fBC?g`)T%PK}w&wX8s_x5Mv@NR?10?qeljdAw zl6)OMq*z%e&$R!5a+T|fmrLh4)~+i^^HC>j#35uT+C?3}I1BX>mqn05C)0I8QX8br zaeq<99!fNxsS(2dsgFFwftUG^-hT&}IdJocy$BSlI;BwtWnPM$t`X4b5g`8~0hv5U z^ZBF;ztm*MW496z^GY|_JZ^qHp#Zr3TQzCQz`W`>fbNUr5^zh3G@s_(#6*s`tq{|B zq#_-0N68Q@E)tckyH%pN{-Pf(CBHFM7euae416>t?~`52m8AJv1{)WT$nI%~6I$Wk zb#3Hc$@w15KDa*YDMkqOs(p2HPX{!eI}CD|o_-f7VwYF$C-{P(O|9%Zc<}`V8$@dK z*|DJq#bZ)Cl}M?9oM`6{F6X%K2J2-&atHQT%(=p3Z#a`3VuJ`3g0Tezff~1In$8w_ zG5CHY_s-%Dbqmc7%}E3@@&>p{otIV%qTTDm2FwPN zUH(J`F*Em*-Y{yjdNrXv zU?YSr1u)`FDvM1sv3lhL7y)MByz|o&JS?QJu08oW1GbHr;^!Qz-2`^U3pm(JGhPHr zQnQgG9wQA2EBDu)PNU>adTQ~EnTws}(hsDZmmw5Mxygzs*6UgcJl}0_k;j-o&oU_f z(eXw?d*_Abf~}6-*Q|TDS(6xJhJt9up}yH#l}Vlf$%(t>5y7lo7PMcTx$Jj#1|$fn z8QI>WAVSDxX-LL+L^L;|J0gq$nX!!>^iOjF3`h?fVH|wkNOhyj2Ln%j20ZNY0R!K&W5<=aT0>8ug4f z?hZ0Q?bwv@c|;WLBtvbNlm``z*H9-|fjEo2K=syFyyK(dKtdWasMnw$p^5R1$sR9@ zM;>#FaSw*@SzOr?XFsV<#C}A<$!Sp7gDV4RP%P1wH9cE+BInFTG{&v+7 zpTgj6{#G#@@yw%$)(Q)Xm{7bw171ZV)l^()5%{((0!KeJ{1I+9^Y@d=ZAzSF+qlrl z!E1o&qMicZ6&_wOQH1{!SNq>6LJb{W(eX~yOGe+FhpV8GVP0r^uPMyTO+WITF2ji- zuYeQp0<MCqcPL#%QV(=45`*YU4sjP5?Mc*{ELl9mC^CV&4)-!p&PZ3HP)3r+g4PXr4wCve* z$;b`f=m)Jbc+mLev})4z1nc&!vtY{vhg-fz!{d)F%g3&BLg5{Mq0nEMoO8R7J!KW7 z3{VSkZ3&3M#7W{lWUAOTjAU8NTIt+&V8T9cL>xtN)rW&?-ClG(B(P+BF5KHS6 z0;8ZlhzY@`Vn!(X7w1Ov5HGrrt{jcV40zi*X>hb+!KyL{-LuBTVsrxsWBV+7qQ!0G z46<0=h!jM=`Sl~k3AORPfOA5?An=LQo1k`?+>A)%+!PjlD(%P37T=<*Pby5Am(FQz z>hWxABhVF5uhy&!loQ6G117&98?znRo{wS6PR)Bn+A48{Z!nRP+~C_KP_q=3T>hx<`Q?5aiHewIM*kN)5XIXRtM?Y zZ`#ysF)`USk~I%WQ(W1G!N0K$%Q|3Y&*+W=y{;W{-RitsU>4XkzF@=0DbrXId}?nG z&F8REa_0rghRr|tq)k4oSJ8wuu$rt=6E&j3@e*_Z#$;H5m)zKx)pdiRRQsgi7mT#UDmz$DMZ45f zk1a`w;rTEF^Ecg3c)cv0yLAwkF~K?(L^u6x%*bx*L(ihXR5RE20sfx#i4?Kf^jS8c(E6_IWF zXh{`Hdl$Rx(O^y7H@6lK-loKbVKYxwcs>r9_0sthyv69V^kO8KW1QNL690XC5VgHA zlW*&OGBWdO}1N<#4%1NR)j{IbM#MZ32fh$bHm1ihg>TaoNVw_VqX9-0g7LQ8}&#a zzcuG={HkJ-K$~5a6o~!2$&zUdK~5Gh9E{P!;PKf*7JOi4zeGc|6Xsse*p!2P<1@ZM z(FGhDJi9*C)fP&eWC5KmW}%eGbjWTNHc6gg-} z<;1wum%zE$q|G^j^^|;%9J$ni?p+p6YR=mT@bLjg5lDJSm;jbar?$!$ePOt3%Ur>-2FqrmHd1+y`8V;k;vZ~oHGCyGY@GvJaE$g7+ z{m6TYH}6-A?letGOHq`SZ^xD)($g$FPOz%LrZVjQPK?~U_XkJ%8%LwhEdt)EMTtoS z@kD4{X)7;8)A@7N?*T)u7~T0BzyAI-bd-U`kV2=Rf1o^uu1>mF1KEY~JJH?uZskG@9~we#$kEk! zf}v_?(UohORTjJ9QBfjAtAvy=@}p8WoKA&rOE#ECWIZ_&@w#dnsjHLG*iQD-wBUXd zq@PJHD^9^oOEmTovw3=p2W-hUIb)dWG6|0txxjngQV%st;BhMn{C7(cTYc{V2*g!k znBl{AvHX{uu75?}L1tHGn(xGm15*OQbBz&p3Fx8LX~k44bZ z+*I=`@XdcL-FUGxav_P0YQMEF6dzle)Q+OmPu=pvOL79HIC+w0jF&>vwMnbJ^M>Y* zY2%&q|LYG@rZPf(sKl+v7Bp4%a7Z8Rl@KxW6|;GQ|y~56nfM&pEvv9vBd^~ zZ&cBzyx}5f3?oYnwLA5K29GW2V*YjGz6Cbqp<|R(&2`(H8z+{sWtm7auTgbPEavLU z{>qHKNKND{*i9GTObW)pcE)f`G`oL{R50j^D$kHhGZXLb+r2FS4k>qc_~F5ohjn5$ zr<f-dh>5~zUKqaOrD&$TT=JV{xeV8tTX5t8*ZbVv0AH41;?cK)%=%uA&i z%2ok))dqQ~JIChJH8pZoP0Cl*0!24k2kZqPcJ4Lu*qX+WLHbtsQ6ezb5HyfgL2S;D zy|c6-A5Y?&UMEKeop4nvqYkgtk0kHKNmzcO62u!sDr?>~=#foro@-`Hh4Q0GBJY_d zAz8(I?5n#Y9D8l;dQ{qxK4>z%Lpkpy*9)Mx3QK>Y=6e8V3-7mI_Sfs)=7_f;oAtZ? 
zc)zgy-XBTUD~%gd0fo>b9$voaqrJ_gNgK}%i3$6p@kN11>ts5uR}3F*tuuI_a0$bO2WzMM3kBbWtl1mT*F}+EnM8HS=9!nu@gav`7f04 z#Cpl=P4%&Y0Pwg3>SkhN3*YUEp)dZjkm4kGS$c9Pak5&NrzwC(g5LSm7u?k@iA?K= zn4&Mcx7-B__Uko?<~6LC#tFk`&+j-_s;`z-^=QWI#YmqVM;ZRts8Fho#SZf~7aKq6 zww~qF*CxodA=4JW31^zt=ad3TfoH^XQ+9^`8hzEWPlco>?udS0YeC20v2#Ewo3Nfk z7qK-XRm`VUJ#$5=+Yx;hMwPsb8~hDqj8L>l*bCFMY4jXD9atV~T|MZPT5@$3R)~7` zNX}~m`@F=n?fT|rJM{G{p3wdcBnSO7`@oXR41dSiEp+lC9&}FPYt%4svVApd?FP9X z)QFK!2EBnGIPrHAQgiv?b)mlh5(7QDo;R(4xF%wG1{$YH!@_*p?Y5jWH*cJ%H_7x-WxiFU}VUi5b0=Vj$=5XdC*_ivsH}!DvFk zkX~Y4m2Zl%JR#h@>~DmQJlZhDaTqxJC*3khE(IqD*BID3nwzA-?4w{+jQMGEe?*5~ zukfyIXQW;w>wpitqzYr&N6$_qtrMuyk3^vx;k8uB5!A~Vo1v1FJ|3@PWKVjw-4q^d z>}^=pp#}Z1^^dU4NzIh3=d5({z?3@*D08n-6Mq}fk?&y3JYNp&yVaWnTzNJsO9ecN zGK8JxmzPU*8ga66wlv>@SQIh3KXVePROzW0Xb}q}(uZHvs_ns|CeQ`H!JDu-Los#d%c z!B@$O_~#e0BJ~~|^rtS>|{=X^GWXuwD9Y#*X5UN}3ZhO>KgT#UpP0DZk?KBDr)hg+o{mo{<;TaTJ$N(XvN=LV6aJ2W&Ty(MgO@t| zpo|)IW~3fZ5o*#$ndRDSu5?TIxI}*v#RF8Ru8$meSegn+YC<&=lFVrGW-ON1w0Ud< z5&idlGJ2HzZ%J=ksfPs5uSj9oR2K`uuHbcXwh7jfW7iRi<)9oQ>gi(o#rTfqq>@u{ z1yn{wJBlHGFOG{_YrM_Qn7@SfW0%DwXFE+r{2$MA1JQhJgk4EgF~&vz++~;SMqNfw zrER_Hr~~sma`b+z)H{!Tl&*IL-4KNp-#n2n)1#EvoY@D*Z5L?S*!(9mZ+N8D@s1ZN z_wgw>KA1z)&;!>fWXBEbI8NSt-S}xn1HVwl@gC&IcTAxb_cW8=&_S8V=c0)->_x+} zhHR)`La8w5-L<558nunH)E*~0)5}YTBVvClwA;N^?uN!#e$a^4erfcM$4ABky-ZPffm`q2)d_c4*|wlTku ze0^e5>KWvbCYxvv#VIako~h zU2Q^4lf&MSW<>8}&l`M|rK`1Lp)H#>9PR${J1=)itME$O|YV!r744Vhwl zh^i)A?IwMeoQIV~U@~Rrmrt=rti^$0J5~E05T)gPP5!WJwf^Qoy%wypH#&7u;&`9) z35P29{vw^bv>c>rV95jNI6=<-gk(-diL^6NF05(ABKGTsKc*B2CqB$24&zkk`PUNjftaVltkHUi zT54g|5HVBoq`{MD{|5k9K&Zdc8)r;oL@n8{;3$flo=d)Kg(+&IHs8iSiQvY0{(9O) z@nT8rAy~tXvENIdJQ~<`U35ufJ%yEOa?&58__~rOXhb=dU8Z2j zJ^6q%+unvejwtbm9_*x7-y2xxTy1W{>NGOyf59r$-DS%U8`I+Kha~EwPiwP#SeG3a zX=TSF+WdMV=bm_whK%^^jnE8IsWza4BVt_VvPKG+u%FCKy{K!qGG}=0Hnn`|W&Ooc z`EB8wC@(LE4P5e)+E=C%OJ}I#ViYys>LJ;RPSP3W1~GO&YZ#V8G7m3N)rX&Su=xbl zwpOtY$}8yQwO*?A_)a74#dNH^lC4)=LpIT?=;iWnWHvfW#~H?xzwBDFFc9Y&O2%>v zg|71Z%Nyuxt|oO?_R^_GBAiRx{?VwXOE3~fflE@Tub`D!4|VFjy_>>L&m!$UMb66Q z3&o)dC5)Lv4vXyRSBL?{PuotxA!Rgm&R)uW@r@t7ONvW8pvp#{a-wkk6t?;Faq^`C zI+jpFM*L*T3O_?F4LS5G{svh*m*T!!MpM|91hN$`8trv~7OM}?r>J7K{pmdV_Fjv+ zQ=KU+F_l)lK(_3bE9u|e#U2+uO?o}o$)&-EmW^_@M{hli^S(p_ZSy$P_M_FBnshxx zl4N^@xM;4LKDymt_up7T^7pQ>W5xDT;SMD_)V-O)jQ7*i2tf*#9VXG`y|nDqUF_2k zB`3LWY&ly&T1g+sz9fa;zpa$|9=nl=lni&%Ux~ZSw`B|Ld)PwBTDDnv7k&IuOuKS+ zQtkONeo52}ijKWZc^4~bQO02!o?*n^zBtI9jw>Tc<7#Rr1MbE>L^B6+F5jLR-a4Nxot~U0EeXe(FJV`>6)C#?}x| zBZI2LezFtWYl(e-iA?vkk!k95E^6CGGW1TM&4v#A@X>iI-C96J4io8XSKAVxSB55*HCwq zBhk}IT%vX|eK6~0t0t}>w`r&8wnrO@td!#-YbyCx!;O^neF3Teenz`3k5RdUEyWhO zvJ00MvR%*blhC7f`gn?CTMT>I4)Y|^OjE6L6JB(ruSMJE-Ik%uF#dS1Xfabq~kNfOky z!;!8GY~pK_%h9r@+4OvxHC2CJN*bG2(66WG=#SD3%G%OGLmL9vV+q}CWNQ%xE?P!~ z9nzfT{hxf#nqt z(=%`jcS@8gqKc7tJVo%YX;~C@@_lG?( zZxeOMO(B8)XJl>hlVIZ`*4I6jqNkP7E4y*rq=8Plm~fcCEVhRYQrpHR7Jp`)?f9hV zXirY#Qz=eYi4MFuPG`d#NH=abZTho?_OCfX>6b52_dhZ2uRBLFhXkm}YBL>ioJWSb z3S5^o{EDeXY{I0o{DhKn*8l4V)}f_{ei)qP zzkPLuysu@Gyp}1~r? 
[GIT binary patch data omitted: the remainder of the preceding binary blob, followed by the binary contents of the new serialized test archives added in this span of the patch, each created as a new file (mode 100644) under caffe2/python/serialized_test/data/operator_test/:]

concat_split_op_test.test_concat.zip
concat_split_op_test.test_split.zip
conditional_test.test_conditional.zip
conv_test.test_1x1_conv.zip
cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip
ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip
ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip
distance_op_test.test_dot_product.zip
dropout_op_test.test_dropout_is_test.zip
elementwise_linear_op_test.test.zip
elementwise_logical_ops_test.test_is_member_of.zip
elementwise_op_broadcast_test.test_broadcast_powt.zip
expand_op_test.test_expand_rand_shape.zip
fc_operator_test.test_fc.zip
flexible_top_k_test.test_flexible_top_k.zip
gather_ops_test.test_batch_gather_ops.zip
gather_ops_test.test_gather_ops.zip
gather_ranges_op_test.test_gather_ranges.zip
gather_ranges_op_test.test_gather_ranges_split.zip
glu_op_test.test_glu_old.zip
hyperbolic_ops_test.test_sinh.zip
hyperbolic_ops_test.test_tanh.zip
index_hash_ops_test.test_index_hash_ops.zip
instance_norm_test.test_instance_norm_reference_check.zip
integral_image_ops_test.test_integral_image_gradient_ops.zip
integral_image_ops_test.test_integral_image_ops.zip
jsd_ops_test.test_bernoulli_jsd.zip
learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip
learning_rate_op_test.test_alter_learning_rate_op.zip
length_split_op_test.test_length_split_edge.zip
lengths_pad_op_test.test_lengths_pad.zip
lengths_top_k_ops_test.test_lengths_top_k_op.zip
matmul_op_test.test_batch_matmul.zip
matmul_op_test.test_numpy_batch_matmul.zip
mean_op_test.test_mean.zip
merge_id_lists_op_test.test_merge_id_lists_op.zip
moments_op_test.test_moments.zip
momentum_sgd_test.test_momentum_sgd.zip
momentum_sgd_test.test_sparse_momentum_sgd.zip
negate_gradient_op_test.test_forward.zip
one_hot_ops_test.test_batch_bucketized_one_hot.zip
one_hot_ops_test.test_batch_one_hot.zip
one_hot_ops_test.test_one_hot.zip
one_hot_ops_test.test_segment_one_hot.zip

diff --git
a/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip b/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip new file mode 100644 index 0000000000000000000000000000000000000000..5b29d053454923273b4282559736cd4e16810e79 GIT binary patch literal 1472 zcmb_cZAep57(RFN&h5-hZYw_`&|0O7%!rr_A~$QGE4zqLh_tTTT|0HVTf29b_M@p7 z67{1gk!HE36%j3^4~x&gB!`NJc&pLJtZ^ zvJ;6&hI)x76Uu!9K6kS%PBE(> zt$lj9x%Tkco04oE)91G z1^H+4wF8r_cH{w{J!78&q0cWP2_7VtygLwT3mXkv$~liQ-}ZcZ%(Hzm%Pv2kDjY4l z-*)$UPsP2D)3Qphu591ds~cgL9iNWhSX5w?DQ93W(z$6QRP-_VmIL*#C=+bnzLxfH`W>v`6I3Qi`0ExwYQ1L{9rP+{Qq-)?(;{VW*yMI^OEs= z%|iL?Uo{J6&$8=k70EN{aXqm5xpDcV!kY6=<_{~(zn!%W-hKUFamK~`8=Y$bB1f3Z*3o#`g3UqyPGe&w$+(2#v(8@F`TL5|?lPaPD}IhSW14rFuZ zkvQ16>|jXyncUlU{|{{bv0>hqd3l$n2~PS@9lATp^|Zj{$IYp`HhaupvyAa_pCf~V zqgK`ewXB6^uNLSn@~G0*y}Rn^go*8|(!X4FzW!|wqlj(H<@I0dHR>vV9)2zLUi!ex zwB6Acwkg?eJG?Z#mI*Z^tu~sbxdKD51Bm&7I3=|#GdVTBq_Q9t6r8i)h`*8+IC&!A zSistOlcMItEXiBCAa8Q!!gY%m%$pV;5;SFo`26@OlemP+z0YX+v37~4tB5UO`YAQb zifb#E2%{dG1_J}Cjn5V`zTpAJ9Ro0mxq%cgf?<(|)wa0D>oF`VL$$2U3T)jJhIMSq zp6t1Fr^##YlRt|>_TITPXVRG?ixwU6TK8kol_!7pJesuV(i5R|PHr7q>-LCr$$)ln;4(OYBk{#Ns5bL9ZW)otd*8p*x$=g+@hufQOD z@0+UU+>Na{l0A|flW#bx6;>8Am!>lCdKfoK9bhy&z;=~^Z2_aAwS;8Df`juKD;_W$ zJ)%+Skl?i7AfxDGW^Ne+L%o6njMm54lQ=Mu!HtQjuf>o@!$PQCvf< zfq}-zB*F~JW!RHEPy!4Z8bMTM9xM#e^`S>FL>B`?L!%ck640U>T{C)IAvEs+YDS8= z0B=?{kR&q@t^?8^ff*AK@sM(2$M?c$bD$^)gYtHIQDREG0ko`OD~Sj3h1gQSG_cg* z3(YIQQ*L-LA{>jo90FPa21~#p1TK}(wV)R_2rWK9CQ{)8iwo>NL1@!pLGl-JpezKc KX#+|!FaQ7_@~)`> literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip new file mode 100644 index 0000000000000000000000000000000000000000..8438af3a9f7867460b8edd67ea6a8850a0f31cec GIT binary patch literal 1218 zcmWIWW@Zs#00F&8y1uQ;rtQ)MvO$;?i1Q2d3X-_k$`W%*Q;WISb5irtOEQXunDa|Z zq!^U=0}_+7gZ%u0QwvK|^O92oP|T`bxAc$|GXnz%a{|rE%meDu%PXjYDdPZJX_PxX z<{?nJiIIVU3#_c5w4@lQu5$Jp@mJCUCr<<%3s^gEQq-K7C3#C1#64cWZt;S7)8a#d zrpyqZA3tRhmr$Aa8BIUdF7b2~u_a7DrQED$S#fRUn!>P-ZP}AOm+mxq?S1lRQOMpq zm*z}5b7aw?BVOx%EV}aK&z?t<7F~KGw9d(`Lu=h0k&Yb2$>z@48p%P`0g9X3%om#n zYlk>`d8RsQg)4e%Oy1vW-fXTMz__~2yjUZ-cmDkO*XtD+gztS*^_;u0HAhOaN0MXl z4M#Ud)2*otUzg8x_*&fX_4a~S=?t&uGH;@L#52az>My;?C z$q)28Ba;X-D5+ylcR&d+XlMjcnR&2~K-Y&JO%PoS3=NH(z+gm+G<41AA&$^o4b+Sj z{sG>sY#>QyAY2Ee%a}ksSb~9M)F1uUBS@20@Tdw!H94y_QC*Y1sE&=hY+}EK-Yqv=Mh?bfJ~$k02UY6eS*-o7Fc+| P{e>JT3xR65SwIQ_`k8oP literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip new file mode 100644 index 0000000000000000000000000000000000000000..953e275453a5a90638984ed0f5e756fa265bae79 GIT binary patch literal 1218 zcmWIWW@Zs#00F&8y1u%fE??IMvO$;?i1Q2d3X-_k$`W%*Q;WISb5irtOEQXunDa|Z zq!^S0L-Ps}le2^T{DM;pOH=cbQv*;;>oj{h(~6ma0fae$re)><_37mmRKb*SfGxed zNA2SypmYl(0|OUWSwU$@F;HFQ>^I`Cqy~9nnRMpJqD4o%*8Ny?<;kBtk0vd;^h9W#lUs+@x;-KtIf|3bowGHPgQ^1*H@BHD zHV@VgarE*`b<_%1^wyZXztz0iTseSob(?vyMsn}``SY*WD=-M(`=;tScVlaglw^-2 z$Ko4~ZiWq8QyF|cj2mYiU^F|(c9n%~0p~&orlTx=8#vc0+?{D~H|9sl@@XnB85vMR z!fK;wnkz5_I)IoTh*MI_GLuu|ODYReaRx`;XWBj@$m0|1hnGRcv8@AqFkd@Am#m_3# zsp?q;^f@Dw2s0?LV^4QL2{34A1W}oJuz*0F<-mGjONoF8i2c*lGKs;D_fn?WQhvW&SKv56|Ww7+3#FTggP?qHYniLP>A~G#E zBo~84fti;-05$7+Fe03ay)Xb;00v9I;R7xj(6ylFc!U-|AQLJ7!(sxvKM>m11B(v0 
PuaJXeAy5rB3rHaVcaL?K literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip b/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip new file mode 100644 index 0000000000000000000000000000000000000000..945f89561e9d77d86e89e00b1ae849f27e97ef0a GIT binary patch literal 3245 zcmWIWW@Zs#00Fm2y1piLkM7z5*&xgc#Q6n!1xZ|t5kicSQVdEQ0f{M|xryniTAW<0 z1&Jx~NfHb&W)Xs!gJ70OFa)4jpr7VG`zsd%0|;{hEy&EvFD=o_E2x4g;{dzksc7N< z?JNuo-atnHRRWb2l$I0&)m1J%lNoS@Rf6rq-(ro;-O7?I6>GZNxOX)*F}?bCzdfL_ zbh7q|=H&E7(SXQrQ@r^!!;MlWD4&rDG%<3`d9op)?a~#FzyOh67FO2}4p)Ml1aDoL zx$OL2`Fk_JSI++_%GR=$C+cv4S>dJDja<1sZaX3r?#O6}aJIajWMb7Dqv?I=h}OT( z+NaW6xsuoQPG2o@>~`k96Ok$>1pdr_m^S-}#@EhOA=785^d3?T+#fhy;d;iCl`%8j z1Sc=K^2+3wglzcPJ(-15H|Fol64p%5mJr^n>yrCQC3VS}Yuj5RVpn+`44vINlcmbj zGU!-@LHVRvZHBX%nV7t+twX^^zg3Hm(t}2zrN6YwEC{#3==iIJ#)NrGA-xku3cL2UjGK4 zDZBj-f91XNnca*|9laXJwk;{$y!Q7;IjQ=y@=X0V4e~3r;=)b6*4sE;*jX!PYbC*y zogIEBd0D`+A1uDT50&Sax85sVw_H|SCSh7e>(TEu5mWcFvQ>$MT#z^6iA44NJrCUf8#5z8LGd z#`lvs18VnA;9Gmw|MRq{hgpYZx4lf|Hoc>>gp++)z(r?L+A*>Q4T&8yzu z#X=V^zvy1Kw|C346Rx+)7R^)YBf<}&m*4q6(rSEt?YT;vgWbbi7h(H$9qFfe(K0S z*V6Jl=IQp&#{P;8k9({kzwSfD9Z#>G+_m#w;@R!rFD<(ttH2T0GO1hchpP4T-0E+N z^DL%pIv4%vM1dCbW!1APTT%kT0(?GK`v?8q!1B1<@{oBnU)ok<(J%9t9sC&|>=v$l zw&dUBw%}!twyovZ(~|P!$;<^NN7<&Uxioj4ZMbkwn$?o?k@Z$55ue}PBC=XZ)66f2 ztyS-ezN46UxghUo?5i(5t!5iLvo_DWeD~ATify+=+Cp^AO5_({{3by%^rv@#YVncP+YBF7&rLcVUN~{~>$cPPlpgK>(KgkLMR}P6 zqJG~E-0|6ZS6pu6f`txe-b$-3Rb)DM{&t3t{>&NA{;K>>;j4?vXn&BySh0M)zG3g_ zg|jV{B^B54XLR7#CM&UP)?Ea%w?IvF(%y^=3|wGk1*Ij$Ky{U~--y4G7C3n#;8?)gd6S~% z#4O2Mx*+cH`gMyJ%$pV;5;SFo`26@OlemP+yw7O*v37~4tB5UO`YF|BHOq=?E7uf; zb!^L??74KO$!qVEKZ`>4-nleq(wQTR79H_g_hZqOCx7-lnzZQB6QOlZZXH_d_K0-k zC{8wa&eli{st!=x+-APmJXt%$(aST{Q7c^0TVwM6R`X_aGON@vL8e|n?rS^wSnu0PFG zJv&uBs~Av&z-psunk&%j9YD+v#3`v|naQc~C6xuKIKv@ta^^yefG9@|2yifnFzT^s zFreAUtHs^~3v_Ovjlh^0hM^vC>g)ab^3}ipOcY2m_Zo^d(s0+fI&ke zh|0`^1qr%7^!S44Vqj=!WCkW;wAe$}j2`p|&80xiND&d>&B_LnWCp@@K)R9%#6uW| tJ=25KfWQ*4AHYQbx)$_|3DE%3(hp=JWmAa#Obkp6SwM-;z*lo{glBRQx#Kyh=M`C{{6?GQ&V&s0aPa7Ay8 z$@^Q)o6VI27+1HM7i%Q<&YwU3dc6XJ@V#%Uo^v<0=158QNOCN`;poPg7|j+?KZ`TO zo;_ke@2Wbs1^)uX-c`vhaMVaEXgZs5_5p*JzOXU7*Yas9FB##11_=$Tjiza?z)dN0`ht{cRZdCE) z2l}6pNrV}c&atOJpad8+G=iwiJXmm`>qCz#h%N?(hDJVMSfa%ox@PoXM`&&UYDS8L z0B=?{kR&q@t^?8yOduXC4?%LO-J&bSCO}aT24%JMqQsPV15j>di;M?x;JKDP!T>A) z%)7!!dDk7NCNnj!#EQX#5#dto#Q@NJFjxW(9&n+6t_3~oBed89nMefzEF!Rb1EFmd Tund8F3OPg;0@bjxfD{4%0N!pk literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb deleted file mode 100644 index ba59745bd14a7ba83499096655b8daf060377400..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 67 zcmd;Li;Pb%N=)HmiZI|}DmM^fk1zlWNHHi0ho@$yXOyI-1efNz1Jz`v=9Osia&dyx Q$LHt8mrF1hF?cWn0Bc?nyZ`_I diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz deleted file mode 100644 index 3f35572017ab82066d16477130bd4512b1aaca86..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 390 zcmWIWW@Zs#U|`^2(3&CU%Og9rTY!;)p&N*~fTEds1*Ij$dU*wvv)_onk`_35BH&oS z+If?r=EN+?Te=|b@%nX(7tEU$9}+ZWhWPyWDU-N_%Dm5L`muJ2r>lr9VfrcMW;M%- zYb)0jhIMSqp6t1Fr^##YlRt|>_TITPXVRG?ixwU6TK8kol_!7pJesuV(i5R|PHr7q z>-LCr$$)ln;4(OYBk{#Ns5bL9ZW)otd*8p*x$ 
z=g+@hufQOD@0+UU+>Na{Qj$HA9E)!_x^1j7-kQqrb@@z(uf+{tZ!dV2&hU!=>Wv+r zT4Oh!bFsemHS^oj2JLz1-yC4a`W(w zSqCplo?2vYDf(jCl}ReL3<2JZOd`y1RM)k zJ8x3doR}qfOBdu#PFu8Y@q&5N;zNR_%n+X+KV=e^P`URRO+VHy@pKijB}_l1+N`*i zS#i0|vf|px^@MTVzop`Dba-dI;d%G_W_}>g=IPreY Date: Tue, 18 Sep 2018 10:47:21 -0700 Subject: [PATCH 225/237] improve aten/convolution error message (#11768) Summary: fixes https://github.com/pytorch/pytorch/issues/11762 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11768 Differential Revision: D9884185 Pulled By: soumith fbshipit-source-id: 2a0c3e1f5a4fb4833ae6e9fc791abcf45f7fbea2 --- aten/src/ATen/native/Convolution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index b3fcff1b2ccd29..96ddb5ae3928b1 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -157,7 +157,7 @@ static void check_input_shape_forward(const at::Tensor& input, AT_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, - "-dimensional weight ", weight.sizes(), ", but got input of size ", + "-dimensional weight ", weight.sizes(), ", but got ", k, "-dimensional input of size ", input.sizes(), " instead"); AT_CHECK(weight.size(0) >= groups, "Given groups=", groups, ", expected weight to be at least ", groups, From 540ef9b1fc5506369a48491af8a285a686689b36 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Tue, 18 Sep 2018 10:47:27 -0700 Subject: [PATCH 226/237] Add distributed get_backend (#11715) Summary: I have no idea how to run distributed tests locally so I'll let CI do this. Hopefully everything still works with `IntEnum`. cc mcarilli Pull Request resolved: https://github.com/pytorch/pytorch/pull/11715 Reviewed By: pietern Differential Revision: D9889646 Pulled By: SsnL fbshipit-source-id: 1e2a487cb6fe0bd4cc67501c9d72a295c35693e2 --- test/test_distributed.py | 30 +++++++++ torch/distributed/__init__.py | 4 ++ torch/distributed/distributed_c10d.py | 88 ++++++++++++++++++++++----- 3 files changed, 107 insertions(+), 15 deletions(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index d311f74e3fe0f1..cc135c2ebec7a3 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -234,6 +234,36 @@ def test_get_default_group(self): default_grp = dist.get_default_group() self.assertNotEqual(default_grp, None) + def test_get_backend(self): + if dist.get_world_size() > 2: + group = [1, 2] + else: + group = [0, 1] + group_id = dist.new_group(group) + backend_str = BACKEND.lower() + self.assertEqual(dist.get_backend(), backend_str) + if dist.get_rank() in group: + self.assertEqual(dist.get_backend(group_id), backend_str) + else: + with self.assertRaisesRegex(RuntimeError, "Invalid process group specified"): + dist.get_backend(group_id) + + def test_DistBackend(self): + # test parsing + backend = BACKEND.lower() + self.assertEqual(dist.DistBackend(BACKEND.upper()), backend) + self.assertEqual(dist.DistBackend(BACKEND), backend) + with self.assertRaisesRegex(ValueError, "Invalid backend: 'undefined'"): + dist.DistBackend("undefined") + with self.assertRaisesRegex(ValueError, "Invalid backend: 'xYz'"): + dist.DistBackend("xYz") + with self.assertRaises(ValueError): + dist.DistBackend(None) + with self.assertRaises(ValueError): + dist.DistBackend(3) + with self.assertRaises(ValueError): + dist.DistBackend(["gloo"]) + # Test destroy def test_destroy_group(self): if dist.get_world_size() > 2: diff --git 
a/torch/distributed/__init__.py b/torch/distributed/__init__.py index 5356097743aa3c..2a7b004d527093 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -11,3 +11,7 @@ def is_available(): if is_available(): from .distributed_c10d import * + # Variables prefixed with underscore are not auto imported + # See the comment in `distributed_c10d.py` above `_backend` on why we expose + # this. + from .distributed_c10d import _backend diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 651f2ea6f2f870..0568e4261f4480 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,4 +1,5 @@ import torch +from torch._six import string_classes from .rendezvous import rendezvous, register_rendezvous_handler from . import BroadcastOptions, AllreduceOptions, ReduceOptions, \ @@ -23,11 +24,40 @@ _NCCL_AVAILBLE = False -class DistBackend: - UNDEFINED = -1 - GLOO = 0 - NCCL = 2 - MPI = 3 +class DistBackend(object): + """ + An enum-like class of available backends: GLOO, NCCL, and MPI. + + The values of this class are lowercase strings, e.g., ``"gloo"``. They can + be accessed as attributes, e.g., ``DistBackend.NCCL``. + + This class can be directly called to parse the string, e.g., + ``DistBackend(backend_str)`` will check if ``backend_str`` is valid, and + return the parsed lowercase string if so. It also accepts uppercase strings, + e.g., ``DistBackend("GLOO")`` returns ``"gloo"``. + + .. note:: The entry ``DistBackend.UNDEFINED`` is present but only used as + initial value of some fields. Users should neither use it directly + nor assume its existence. + """ + UNDEFINED = "undefined" + GLOO = "gloo" + NCCL = "nccl" + MPI = "mpi" + + def __new__(cls, name): + if not isinstance(name, string_classes): + raise ValueError("Backend name must be a string, but got: {}".format(name)) + value = getattr(DistBackend, name.upper(), DistBackend.UNDEFINED) + if value == DistBackend.UNDEFINED: + raise ValueError("Invalid backend: '{}'".format(name)) + return value + +# The following two values are here to maintain backward compatibility with +# pre-c10d distributed package. +# TODO: remove them when users are ready to take a hard dependency on PyTorch 1. +_backend = DistBackend.UNDEFINED +dist_backend = DistBackend class group(object): @@ -166,6 +196,30 @@ def get_default_group(): return _default_pg +def get_backend(group=group.WORLD): + """ + Returns the backend of the given process group. + + Arguments: + group (ProcessGroup, optional): The process group to work on. The + default is the general main process group. If another specific group + is specified, the calling process must be part of :attr:`group`. + + Returns: + The backend of the given process group as a lower case string. + + """ + _check_default_pg() + + if group == GroupMember.WORLD: + pg = _default_pg + else: + pg = group + if _rank_not_in_group(pg): + raise RuntimeError("Invalid process group specified") + return _pg_map.get(pg, None)[0] + + def init_process_group(backend, init_method="env://", **kwargs): @@ -174,9 +228,11 @@ def init_process_group(backend, initialize the distributed package Arguments: - backend (str): Name of the backend to use. Depending on build-time - configuration valid values include: - ``mpi`` and ``gloo``. + backend (str or DistBackend): The backend to use. Depending on + build-time configurations, valid values include ``mpi``, ``gloo``, + and ``nccl``. 
This field should be given as a lowercase string + (e.g., ``"gloo"``), which can also be accessed via + :class:`DistBackend` attributes (e.g., ``DistBackend.GLOO``). init_method (str, optional): URL specifying how to initialize the process group. world_size (int, optional): Number of processes participating in @@ -184,12 +240,13 @@ def init_process_group(backend, rank (int, optional): Rank of the current process. group_name (str, optional, deprecated): Group name. - To enable ``backend == mpi``, PyTorch needs to built from source on - a system that supports MPI. The same applies to NCCL as well. + To enable ``backend == DistBackend.MPI``, PyTorch needs to built from source + on a system that supports MPI. The same applies to NCCL as well. """ global _pg_map global _pg_names + global _backend global _default_pg global _default_pg_init_method @@ -203,7 +260,9 @@ def init_process_group(backend, assert len(kwargs) == 0, \ "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) - if backend == "mpi": + backend = DistBackend(backend) + + if backend == DistBackend.MPI: if not is_mpi_available(): raise RuntimeError("Distributed package doesn't have MPI built in") @@ -220,20 +279,19 @@ def init_process_group(backend, else: store, rank, world_size = next(rendezvous(init_method)) - if backend == "gloo": + if backend == DistBackend.GLOO: _default_pg = ProcessGroupGloo(store, rank, world_size) _pg_map[_default_pg] = (DistBackend.GLOO, store) _pg_names[_default_pg] = group_name - elif backend == "nccl": + elif backend == DistBackend.NCCL: if not is_nccl_available(): raise RuntimeError("Distributed package doesn't have NCCL " "built in") _default_pg = ProcessGroupNCCL(store, rank, world_size) _pg_map[_default_pg] = (DistBackend.NCCL, store) _pg_names[_default_pg] = group_name - else: - raise RuntimeError("Invalid distributed backend name: " + backend) + _backend = _pg_map[_default_pg][0] _default_pg_init_method = init_method From 47956ddf7e9632bcdb47df2e12b88a933454534d Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 18 Sep 2018 11:15:29 -0700 Subject: [PATCH 227/237] Revert D9755189: [pytorch][PR] [API CHANGE] Add empty tensor tests to test_sparse Differential Revision: D9755189 Original commit changeset: e9d36f437db1 fbshipit-source-id: 8b99edf626418a953a8bd786847a6e0174a3a14d --- aten/src/ATen/native/sparse/SparseTensor.cpp | 22 +- .../ATen/native/sparse/SparseTensorMath.cpp | 18 +- .../native/sparse/cuda/SparseCUDATensor.cpp | 3 - .../native/sparse/cuda/SparseCUDATensor.cu | 10 +- .../sparse/cuda/SparseCUDATensorMath.cu | 66 +- ...arse.test_add_dense_sparse_mismatch.expect | 1 + .../TestCudaSparse.test_log1p-backward.expect | 1 + ...stCudaSparse.test_log1p-uncoalesced.expect | 1 + ...arse.test_add_dense_sparse_mismatch.expect | 1 + ...coalescedSparse.test_log1p-backward.expect | 1 + ...lescedSparse.test_log1p-uncoalesced.expect | 1 + ...arse.test_add_dense_sparse_mismatch.expect | 1 + .../TestSparse.test_log1p-backward.expect | 1 + .../TestSparse.test_log1p-uncoalesced.expect | 1 + ...TestSparseOneOff.test_cuda_from_cpu.expect | 1 + ...eOff.test_cuda_sparse_cpu_dense_add.expect | 1 + ...arse.test_add_dense_sparse_mismatch.expect | 1 + ...coalescedSparse.test_log1p-backward.expect | 1 + ...lescedSparse.test_log1p-uncoalesced.expect | 1 + test/test_sparse.py | 1125 ++++++----------- 20 files changed, 478 insertions(+), 780 deletions(-) create mode 100644 test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestCudaSparse.test_log1p-backward.expect 
create mode 100644 test/expect/TestCudaSparse.test_log1p-uncoalesced.expect create mode 100644 test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect create mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect create mode 100644 test/expect/TestSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestSparse.test_log1p-backward.expect create mode 100644 test/expect/TestSparse.test_log1p-uncoalesced.expect create mode 100644 test/expect/TestSparseOneOff.test_cuda_from_cpu.expect create mode 100644 test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect create mode 100644 test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestUncoalescedSparse.test_log1p-backward.expect create mode 100644 test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 49efed2a1e066b..25fd4fc5df4326 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -257,16 +257,12 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { AT_ASSERT(!self.is_variable()); AT_ASSERT(self.is_sparse()); + if (self._nnz() < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } if (self.is_coalesced()) { return self; } - // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, - // we should keep the original tensor intact and do coalesce on a copy of the tensor - if (self._nnz() < 2) { - SparseTensor dst = self.clone(); - _get_sparse_impl(dst)->set_coalesced(true); - return dst; - } LongTensor indices = self._indices(); Tensor values = self._values().contiguous(); @@ -310,17 +306,13 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { int64_t pos = indicesPermutationAccessor[j]; int64_t curr = indicesBufferAccessor[j]; if (curr == prev) { - if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy - THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); - } + THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); } else { ++i; for (int64_t d = 0; d < sparseDims; d++) { newIndicesAccessor[d][i] = indicesAccessor[d][pos]; } - if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy - THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); - } + THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); } prev = curr; } @@ -353,10 +345,6 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); _get_sparse_impl(r)->set_nnz_and_narrow(r_nnz); - if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements - return r; - } - // NB: Relies on mask._nnz() == 0 test above auto mask_indices_accessor = mask_indices.accessor(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index ec074b5a6c8a88..afd8001734a9a8 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -250,22 +250,18 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const 
SparseTensor& t, const S for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; } - if (t_values.numel() > 0) { // We add all elements from t_values to r_values only if t_values is not an empty tensor - THBlas_axpy(blockSize, 1, - t_values_ptr + t_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); - } + THBlas_axpy(blockSize, 1, + t_values_ptr + t_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); t_i++; } if (cmp <= 0) { for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; } - if (s_values.numel() > 0) { // We add all elements from s_values to r_values only if s_values is not an empty tensor - THBlas_axpy(blockSize, cast_value, - s_values_ptr + s_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); - } + THBlas_axpy(blockSize, cast_value, + s_values_ptr + s_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); s_i++; } r_i++; @@ -372,7 +368,6 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); if (src_._nnz() == 0 || t_._nnz() == 0) { - r.resize_as_(src_); return r.zero_(); } @@ -524,6 +519,7 @@ Tensor& s_addmm_out_sparse_dense_cpu( AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.numel() != 0, "addmm: matrices expected, got empty tensor"); AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index 107a30f51c2a97..fc53c41b56f727 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -25,9 +25,6 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); - if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements - return r; - } LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index ef2da61460eb99..ff4b0e0c57736c 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -27,16 +27,12 @@ namespace at { namespace native { SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { #ifndef __HIP_PLATFORM_HCC__ int64_t nnz = self._nnz(); + if (nnz < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } if (self.is_coalesced()) { return self; } - // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, - // we should keep the original tensor intact and do coalesce on a copy of the tensor - if (nnz < 2) { - SparseTensor dst = self.clone(); - _get_sparse_impl(dst)->set_coalesced(true); - return dst; - } cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu 
b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 712c28817bf6b1..d7701479e9b243 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -106,38 +106,37 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT r__.transpose_(0, 1); } - if (nnz > 0) { - /* dense */ - Tensor dense_; - char transpose_dense; - if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { - transpose_dense = 'n'; - dense_ = dense; - } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { - transpose_dense = 't'; - dense_ = dense; - } else { - transpose_dense = 't'; - dense_ = dense.contiguous(); - } - - sparse::cuda::csrmm2( - 'n', - transpose_dense, - m, - n, - k, - nnz, - cast_alpha, - values.data(), - csr.data(), - colIndicesInt.data(), - dense_.data(), - (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), - cast_beta, - r__.data(), - r__.stride(1)); + /* dense */ + Tensor dense_; + char transpose_dense; + if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); } + + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data(), + csr.data(), + colIndicesInt.data(), + dense_.data(), + (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data(), + r__.stride(1)); + }); r_.copy_(r__); @@ -282,10 +281,6 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR int64_t nDim = dense.dim(); int64_t nDimI = sparse._sparseDims(); - if (sparse._values().numel() == 0) { - return r_; - } - if (sparse.is_coalesced()) { // TODO benchmark to decide whether to remove this special case const dim3 block = cuda::getApplyBlock(); @@ -428,7 +423,6 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons SparseTensor src = src_.coalesce(); if (src_._nnz() == 0 || t_._nnz() == 0) { - r_.resize_as_(src_); return r_.zero_(); } diff --git a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-backward.expect b/test/expect/TestCudaSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestCudaSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. 
\ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-backward.expect b/test/expect/TestSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-uncoalesced.expect b/test/expect/TestSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect b/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect new file mode 100644 index 00000000000000..fab1614da93d4a --- /dev/null +++ b/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect @@ -0,0 +1 @@ +backend of indices (CUDA) must match backend of values (CPU) \ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect b/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect new file mode 100644 index 00000000000000..77b0b500f3b692 --- /dev/null +++ b/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect @@ -0,0 +1 @@ +add: expected 'other' to be a CPU tensor, but got a CUDA tensor \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestUncoalescedSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestUncoalescedSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/test_sparse.py b/test/test_sparse.py index 760879485e2b6d..82de45d6d17384 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -44,7 +44,7 @@ def setUp(self): self.SparseTensor = torch.sparse.DoubleTensor super(TestSparse, self).setUp() - def _gen_sparse(self, sparse_dims, nnz, with_size): + def _gen_sparse(self, d, nnz, with_size): # TODO: Consider implementing this in the CUDA case by directly # performing the operations on the GPU. 
You won't be able to # use torch.rand/torch.randn in this case because they are @@ -54,30 +54,28 @@ def _gen_sparse(self, sparse_dims, nnz, with_size): # If you do this, be sure to update assert_uncoalesced too if isinstance(with_size, Number): - with_size = [with_size] * sparse_dims + with_size = [with_size] * d if self.is_uncoalesced: # We want to generate a tensor with a lot of uncoalesced # entries to stress test whether or not we handle this # (subtle) case correctly - v_size = [nnz * 2] + list(with_size[sparse_dims:]) + v_size = [nnz * 2] + list(with_size[d:]) v = torch.randn(*v_size) - r = torch.rand(sparse_dims, nnz) + r = torch.rand(d, nnz) # Repeat the indexes, so every position shows up twice - i = torch.cat([r, r], dim=1) - if nnz > 0: - i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz * 2, 1).transpose(0, 1) + i = torch.cat([r, r], dim=1) * \ + torch.Tensor(with_size[:d]).repeat(nnz * 2, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) self.assert_uncoalesced(x) else: - # Generate a sparse tensor with sparse_dims sparse dimensions; the - # rest the dimensions with_size[sparse_dims:] are dense. - v_size = [nnz] + list(with_size[sparse_dims:]) + # Generate a sparse tensor with d sparse dimensions; the + # rest the dimensions with_size[d:] are dense. + v_size = [nnz] + list(with_size[d:]) v = torch.randn(*v_size) - i = torch.rand(sparse_dims, nnz) - if nnz > 0: - i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz, 1).transpose(0, 1) + i = torch.rand(d, nnz) * \ + torch.Tensor(with_size[:d]).repeat(nnz, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) @@ -92,13 +90,15 @@ def assert_uncoalesced(self, x): correctness of the uncoalesced tensor generation algorithm. """ assert not x.is_coalesced() - existing_indices = set() - for i in range(x._nnz()): - index = str(x._indices()[:, i]) - if index in existing_indices: - return True - else: - existing_indices.add(index) + # Strategy: construct a new sparse tensor with the raw value + # field overwritten to a tensor of ones, coalesce it, and then + # check if any value entries are > 1 (which indicates that the + # original was uncoalesced.) 
+ i = x._indices().clone() + v = x._values().clone().fill_(1) + y = torch.sparse.DoubleTensor(i, v, x.size()) + z = self.safeCoalesce(y) + assert (z._values() > 1).sum() > 0 def randn(self, *args, **kwargs): """ @@ -164,20 +164,18 @@ def test_print(self): @skipIfRocm def test_basic(self): - def test_shape(sparse_dims, nnz, with_size): - if isinstance(with_size, Number): - with_size = [with_size] * sparse_dims - x, i, v = self._gen_sparse(sparse_dims, nnz, with_size) - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - self.assertEqual(x.ndimension(), len(with_size)) - self.assertEqual(self.safeCoalesce(x)._nnz(), nnz) - self.assertEqual(list(x.size()), with_size) - - test_shape(3, 10, 100) - test_shape(3, 10, [100, 100, 100]) - test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + x, i, v = self._gen_sparse(3, 10, 100) + + self.assertEqual(i, x._indices()) + self.assertEqual(v, x._values()) + + x, i, v = self._gen_sparse(3, 10, [100, 100, 100]) + self.assertEqual(i, x._indices()) + self.assertEqual(v, x._values()) + self.assertEqual(x.ndimension(), 3) + self.assertEqual(self.safeCoalesce(x)._nnz(), 10) + for i in range(3): + self.assertEqual(x.size(i), 100) # Make sure that coalesce handles duplicate indices correctly i = self.IndexTensor([[9, 0, 0, 0, 8, 1, 1, 1, 2, 7, 2, 2, 3, 4, 6, 9]]) @@ -215,13 +213,6 @@ def test_ctor_size_checks(self): @skipIfRocm def test_to_dense(self): - def test_tensor(x, res): - x.to_dense() # Tests triple to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) - i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -243,17 +234,12 @@ def test_tensor(x, res): [0, 0, 0, 0, 0], [0, 0, 0, 0, 4]], ]) - test_tensor(x, res) - i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) - res = self.ValueTensor(3, 4, 5, 0) - test_tensor(x, res) + x.to_dense() # Tests double to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) @skipIfRocm def test_shared(self): @@ -265,21 +251,8 @@ def test_shared(self): i[0][0] = 0 self.assertEqual(self.ValueTensor([6, 0, 0]), self.safeToDense(x)) - i = self.IndexTensor([[2]]) - v = self.ValueTensor(1, 0) - x = self.SparseTensor(i, v, torch.Size([3, 0])) - i[0][0] = 0 - self.assertEqual(self.ValueTensor(3, 0), self.safeToDense(x)) - @skipIfRocm def test_to_dense_hybrid(self): - def test_tensor(x, res): - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) - i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -300,24 +273,15 @@ def test_tensor(x, res): [0, 0], [4, 5]], ]) - test_tensor(x, res) - i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - ]) - v = self.ValueTensor(4, 2, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 2, 0])) - res = self.ValueTensor(3, 4, 2, 0) - test_tensor(x, res) + x.to_dense() # Tests double to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) @skipIfRocm def test_contig(self): - def test_tensor(x, exp_i, exp_v): - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) - i = self.IndexTensor([ [1, 0, 35, 14, 
39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -329,7 +293,9 @@ def test_tensor(x, exp_i, exp_v): [31, 92, 65, 50, 34, 62, 22, 56, 74, 89], ]) exp_v = self.ValueTensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7]) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) i = self.IndexTensor([ [2, 0, 2, 1], @@ -344,22 +310,10 @@ def test_tensor(x, exp_i, exp_v): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([2, 1, 3, 4]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [2, 0, 2, 1], - [0, 0, 3, 0], - [1, 0, 4, 0], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) - exp_i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - exp_v = self.ValueTensor(4, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) # Duplicate indices i = self.IndexTensor([ @@ -375,30 +329,13 @@ def test_tensor(x, exp_i, exp_v): [0, 4], ]) exp_v = self.ValueTensor([6, 4]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [0, 0, 2, 0], - [0, 0, 3, 0], - [0, 0, 4, 0], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) - exp_i = self.IndexTensor([ - [0, 2], - [0, 3], - [0, 4], - ]) - exp_v = self.ValueTensor(2, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) @skipIfRocm def test_contig_hybrid(self): - def test_tensor(x, exp_i, exp_v): - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) - i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -416,7 +353,9 @@ def test_tensor(x, exp_i, exp_v): [2, 3], [1, 2], [6, 7], [4, 5], [10, 11], [3, 4], [5, 6], [9, 10], [8, 9], [7, 8], ]) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) i = self.IndexTensor([ [2, 0, 2, 1], @@ -431,22 +370,10 @@ def test_tensor(x, exp_i, exp_v): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [2, 0, 2, 1], - [0, 0, 3, 0], - [1, 0, 4, 0], - ]) - v = self.ValueTensor(4, 3, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) - exp_i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - exp_v = self.ValueTensor(4, 3, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) # Duplicate indices i = self.IndexTensor([ @@ -462,79 +389,51 @@ def test_tensor(x, exp_i, exp_v): [0, 4], ]) exp_v = self.ValueTensor([[6, 4, 5], [4, 3, 4]]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [0, 0, 2, 0], - [0, 0, 3, 0], - [0, 0, 4, 0], - ]) - v = self.ValueTensor(4, 3, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) - exp_i = self.IndexTensor([ - [0, 2], - [0, 3], - [0, 4], - ]) - exp_v = self.ValueTensor(2, 3, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) @skipIfRocm def test_clone(self): - def test_shape(sparse_dims, nnz, with_size): - x = self._gen_sparse(sparse_dims, nnz, with_size)[0] - if self.is_uncoalesced: - self.assertFalse(x.is_coalesced()) - y = x.clone() - 
self.assertFalse(y.is_coalesced()) - x = x.coalesce() - self.assertTrue(x.is_coalesced()) + x, _, _ = self._gen_sparse(4, 20, 5) + if self.is_uncoalesced: + self.assertFalse(x.is_coalesced()) y = x.clone() - self.assertTrue(y.is_coalesced()) - - test_shape(4, 20, 5) - test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + self.assertFalse(y.is_coalesced()) + x = x.coalesce() + self.assertTrue(x.is_coalesced()) + y = x.clone() + self.assertTrue(y.is_coalesced()) @cuda_only def test_cuda_empty(self): - def test_tensor(x): - y = x.cuda(0) - self.assertEqual(x._sparseDims(), y._sparseDims()) - self.assertEqual(x._denseDims(), y._denseDims()) - x = y.cpu() - self.assertEqual(y._sparseDims(), x._sparseDims()) - self.assertEqual(y._denseDims(), x._denseDims()) - x = torch.sparse.FloatTensor(2, 3, 4) - test_tensor(x) - - x = torch.sparse.FloatTensor(2, 3, 4, 0) - test_tensor(x) + y = x.cuda(0) + self.assertEqual(x._sparseDims(), y._sparseDims()) + self.assertEqual(x._denseDims(), y._denseDims()) + x = y.cpu() + self.assertEqual(y._sparseDims(), x._sparseDims()) + self.assertEqual(y._denseDims(), x._denseDims()) @skipIfRocm def test_transpose(self): - def test_shape(sparse_dims, nnz, with_size): - x = self._gen_sparse(sparse_dims, nnz, with_size)[0] - y = self.safeToDense(x) - - for i, j in itertools.combinations(range(4), 2): - x = x.transpose_(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + x = self._gen_sparse(4, 20, 5)[0] + y = self.safeToDense(x) - x = x.transpose(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + for i, j in itertools.combinations(range(4), 2): + x = x.transpose_(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) - test_shape(4, 20, 5) - test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) + x = x.transpose(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) @cpu_only def test_coalesce_transpose_mm(self): - def test_shape(di, dj, dk, nnz): - x, _, _ = self._gen_sparse(2, nnz, [dj, di]) + def test_shape(di, dj, dk): + x, _, _ = self._gen_sparse(2, 20, [dj, di]) y = torch.randn(dj, dk) x_coalesced = x.coalesce() @@ -547,58 +446,43 @@ def test_shape(di, dj, dk, nnz): expected = torch.mm(self.safeToDense(x_coalesced_t), y) self.assertEqual(res, expected) - test_shape(10, 20, 30, 20) - test_shape(0, 20, 30, 0) - test_shape(10, 0, 30, 0) - test_shape(10, 20, 0, 0) - test_shape(10, 20, 0, 20) + test_shape(10, 20, 30) def test_t_empty(self): - def test_in_place(x): - shape_original = x.shape - x.t_() - self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), x.size()) - self.assertEqual(0, x._indices().numel()) - self.assertEqual(0, x._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) - - def test_not_in_place(x): - shape_original = x.shape - y = x.t() - self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), y.size()) - self.assertEqual(0, y._indices().numel()) - self.assertEqual(0, y._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) - x = self.SparseTensor(2, 3) - test_in_place(x) - test_not_in_place(x) + x.t_() + self.assertEqual(torch.Size([3, 2]), x.size()) + self.assertEqual(0, x._indices().numel()) + self.assertEqual(0, x._values().numel()) + self.assertEqual(x._sparseDims(), 2) + self.assertEqual(x._denseDims(), 0) - x = self.SparseTensor(2, 0) - test_in_place(x) - test_not_in_place(x) + x = 
self.SparseTensor(2, 3) + y = x.t() + self.assertEqual(torch.Size([3, 2]), y.size()) + self.assertEqual(0, y._indices().numel()) + self.assertEqual(0, y._values().numel()) + self.assertEqual(x._sparseDims(), 2) + self.assertEqual(x._denseDims(), 0) @skipIfRocm def test_add_zeros(self): - def test_shape(sparse_dims, nnz, sizes): - x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + def test_shape(sparse_dims, sizes): + x, _, _ = self._gen_sparse(sparse_dims, 20, sizes) zeros = torch.zeros(sizes, layout=torch.sparse_coo).to(x.device) r1 = zeros + x r2 = x + zeros self.assertEqual(r1, x) self.assertEqual(r2, x) - test_shape(1, 20, [1]) - test_shape(4, 20, [3, 17, 19, 5]) - test_shape(2, 20, [3, 17, 19, 5]) - test_shape(2, 20, [3, 17, 19, 0]) + test_shape(1, [1]) + test_shape(4, [3, 17, 19, 5]) + test_shape(2, [3, 17, 19, 5]) @cpu_only def test_mm(self): - def test_shape(di, dj, dk, nnz): - x, _, _ = self._gen_sparse(2, nnz, [di, dj]) + def test_shape(di, dj, dk): + x, _, _ = self._gen_sparse(2, 20, [di, dj]) t = torch.randn(di, dk) y = torch.randn(dj, dk) alpha = random.random() @@ -616,19 +500,15 @@ def test_shape(di, dj, dk, nnz): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(10, 100, 100, 20) - test_shape(100, 1000, 200, 20) - test_shape(64, 10000, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(10, 0, 100, 0) - test_shape(10, 100, 0, 0) - test_shape(10, 100, 0, 20) + test_shape(10, 100, 100) + test_shape(100, 1000, 200) + test_shape(64, 10000, 300) @cpu_only def test_saddmm(self): - def test_shape(di, dj, dk, nnz): - x = self._gen_sparse(2, nnz, [di, dj])[0] - t = self._gen_sparse(2, nnz, [di, dk])[0] + def test_shape(di, dj, dk): + x = self._gen_sparse(2, 20, [di, dj])[0] + t = self._gen_sparse(2, 20, [di, dk])[0] y = torch.randn(dj, dk) alpha = random.random() beta = random.random() @@ -645,52 +525,43 @@ def test_shape(di, dj, dk, nnz): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(self.safeToDense(res), expected) - test_shape(7, 5, 3, 20) - test_shape(1000, 100, 100, 20) - test_shape(3000, 64, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(1000, 0, 100, 0) - test_shape(1000, 100, 0, 0) + test_shape(7, 5, 3) + test_shape(1000, 100, 100) + test_shape(3000, 64, 300) @skipIfRocm def test_dsmm(self): - def test_shape(di, dj, dk, nnz): - x = self._gen_sparse(2, nnz, [di, dj])[0] + def test_shape(di, dj, dk): + x = self._gen_sparse(2, 20, [di, dj])[0] y = self.randn(dj, dk) res = torch.dsmm(x, y) expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(7, 5, 3, 20) - test_shape(1000, 100, 100, 20) - test_shape(3000, 64, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(1000, 0, 100, 0) - test_shape(1000, 100, 0, 0) - test_shape(1000, 100, 0, 20) + test_shape(7, 5, 3) + test_shape(1000, 100, 100) + test_shape(3000, 64, 300) @skipIfRocm def test_hsmm(self): - def test_shape(di, dj, dk, nnz): - x = self._gen_sparse(2, nnz, [di, dj])[0] + def test_shape(di, dj, dk): + x = self._gen_sparse(2, 20, [di, dj])[0] y = self.randn(dj, dk) res = torch.hsmm(x, y) - expected = torch.mm(self.safeToDense(x), y) + # TODO: use self.safeToDense(), but this triggers + # https://github.com/pytorch/pytorch/issues/3170 + expected = torch.mm(x.to_dense(), y) self.assertEqual(res.to_dense(), expected) - test_shape(7, 5, 3, 20) - test_shape(1000, 100, 100, 20) - test_shape(3000, 64, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(1000, 0, 100, 0) - test_shape(1000, 100, 0, 0) - test_shape(1000, 100, 0, 20) + 
test_shape(7, 5, 3) + test_shape(1000, 100, 100) + test_shape(3000, 64, 300) - def _test_spadd_shape(self, nnz, shape_i, shape_v=None): + def _test_spadd_shape(self, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x, _, _ = self._gen_sparse(len(shape_i), nnz, shape) + x, _, _ = self._gen_sparse(len(shape_i), 10, shape) y = self.randn(*shape) r = random.random() @@ -712,7 +583,7 @@ def _test_spadd_shape(self, nnz, shape_i, shape_v=None): self.assertEqual(res, expected) - x, i, v = self._gen_sparse(len(shape_i), nnz, shape) + x, i, v = self._gen_sparse(len(shape_i), 10, shape) nnz = i.size(1) # Non contiguous sparse indices tensor @@ -735,40 +606,28 @@ def _test_spadd_shape(self, nnz, shape_i, shape_v=None): @skipIfRocm def test_spadd(self): - self._test_spadd_shape(10, [5, 6]) - self._test_spadd_shape(10, [10, 10, 10]) - self._test_spadd_shape(10, [50, 30, 20]) - self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5]) - self._test_spadd_shape(0, [0, 30, 20]) - self._test_spadd_shape(0, [50, 0, 20]) - self._test_spadd_shape(0, [50, 30, 0]) + self._test_spadd_shape([5, 6]) + self._test_spadd_shape([10, 10, 10]) + self._test_spadd_shape([50, 30, 20]) + self._test_spadd_shape([5, 5, 5, 5, 5, 5]) @skipIfRocm def test_spadd_hybrid(self): - self._test_spadd_shape(10, [5, 6], [2, 3]) - self._test_spadd_shape(10, [10, 10, 10], [3]) - self._test_spadd_shape(10, [50, 30, 20], [2]) - self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5], [2]) - self._test_spadd_shape(0, [0, 30, 20], [2, 0]) - self._test_spadd_shape(0, [50, 0, 20], [2, 0]) - self._test_spadd_shape(0, [50, 30, 0], [2, 0]) - self._test_spadd_shape(10, [50, 30, 20], [2, 0]) + self._test_spadd_shape([5, 6], [2, 3]) + self._test_spadd_shape([10, 10, 10], [3]) + self._test_spadd_shape([50, 30, 20], [2]) + self._test_spadd_shape([5, 5, 5, 5, 5, 5], [2]) @skipIfRocm def test_norm(self): - def test_shape(sparse_dims, nnz, with_size): - x, _, _ = self._gen_sparse(sparse_dims, nnz, with_size) - y = x.coalesce() - self.assertEqual(x.norm(), y._values().norm()) - - test_shape(3, 10, 100) - test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) + x, _, _ = self._gen_sparse(3, 10, 100) + y = x.coalesce() + self.assertEqual(x.norm(), y._values().norm()) - def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): + def _test_basic_ops_shape(self, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) - x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) + x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) + x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) y1 = x1 + x2 y2 = x1.clone() @@ -830,49 +689,30 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): @skipIfRocm def test_basic_ops(self): - self._test_basic_ops_shape(9, 12, [5, 6]) - self._test_basic_ops_shape(9, 12, [10, 10, 10]) - self._test_basic_ops_shape(9, 12, [50, 30, 20]) - self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5]) - self._test_basic_ops_shape(0, 12, [10, 10, 10]) - self._test_basic_ops_shape(9, 0, [10, 10, 10]) - self._test_basic_ops_shape(0, 0, [10, 10, 10]) - self._test_basic_ops_shape(0, 0, [10, 10, 0]) + self._test_basic_ops_shape([5, 6]) + self._test_basic_ops_shape([10, 10, 10]) + self._test_basic_ops_shape([50, 30, 20]) + self._test_basic_ops_shape([5, 5, 5, 5, 5, 5]) @skipIfRocm def test_basic_ops_hybrid(self): - self._test_basic_ops_shape(9, 12, [5, 6], [2, 3]) - self._test_basic_ops_shape(9, 12, [10, 10, 10], [3]) - 
self._test_basic_ops_shape(9, 12, [50, 30, 20], [2]) - self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) - self._test_basic_ops_shape(0, 12, [10, 10, 10], [2]) - self._test_basic_ops_shape(9, 0, [10, 10, 10], [2]) - self._test_basic_ops_shape(0, 0, [10, 10, 10], [2]) - self._test_basic_ops_shape(9, 12, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(0, 12, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(9, 0, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(0, 0, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(0, 0, [10, 10, 0], [2, 0]) + self._test_basic_ops_shape([5, 6], [2, 3]) + self._test_basic_ops_shape([10, 10, 10], [3]) + self._test_basic_ops_shape([50, 30, 20], [2]) + self._test_basic_ops_shape([5, 5, 5, 5, 5, 5], [2]) @skipIfRocm def test_add_dense_sparse_mismatch(self): - def test_shape(dense_size, sparse_dims_shape, dense_dims_shape, sparse_size): - x = torch.zeros(dense_size, dtype=self.value_dtype, device=self.device) - sparse_y = self.SparseTensor(torch.zeros(sparse_dims_shape, dtype=torch.int64, device=self.device), - torch.randn(dense_dims_shape, dtype=self.value_dtype, device=self.device), - torch.Size(sparse_size)) - with self.assertRaisesRegex( - RuntimeError, - "add: expected 'self' and 'other' to have same size"): - x + sparse_y - - test_shape([3, 4], [1, 4], [4, 4, 4], [3, 4, 4]) - test_shape([3, 4, 0], [1, 4], [4, 4, 4, 0], [3, 4, 4, 0]) - - def _test_sparse_mask_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): + x = torch.zeros([3, 4], dtype=self.value_dtype, device=self.device) + sparse_y = self.SparseTensor(torch.zeros(1, 4, dtype=torch.int64, device=self.device), + torch.randn(4, 4, 4, dtype=self.value_dtype, device=self.device), + torch.Size([3, 4, 4])) + self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) + + def _test_sparse_mask_shape(self, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) - x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) + x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) + x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) y1 = x1 + x2 y2 = x1.clone() @@ -900,85 +740,18 @@ def _test_sparse_mask_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4])) self.assertEqual(res, expected) - i = self.IndexTensor([ - [1, 3, 0, 4], - [2, 1, 2, 3], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([5, 4, 0])).coalesce() - dense = self.ValueTensor(5, 4, 0) - exp_v = self.ValueTensor(4, 0) - res = dense.sparse_mask(x) - expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 0])) - self.assertEqual(res, expected) - @skipIfRocm def test_sparse_mask(self): self._test_sparse_mask_fixed() - self._test_sparse_mask_shape(9, 12, [5, 6]) - self._test_sparse_mask_shape(9, 12, [10, 10, 10]) - self._test_sparse_mask_shape(9, 12, [50, 30, 20]) - self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5]) - self._test_sparse_mask_shape(0, 12, [10, 10, 10]) - self._test_sparse_mask_shape(9, 0, [10, 10, 10]) - self._test_sparse_mask_shape(0, 0, [10, 10, 10]) - self._test_sparse_mask_shape(0, 0, [10, 10, 0]) - - def _test_sparse_mask_hybrid_fixed(self): - i = self.IndexTensor([ - [1, 3, 0, 4], - [2, 1, 2, 3], - ]) - v = self.ValueTensor([[1, 2], [2, 3], [3, 4], [4, 5]]) - # TODO: This is also testing that, if coalesce is a no-op, - # the indices don't get permuted. I don't know if we actually - # want to give this invariant. 
- x = self.SparseTensor(i, v, torch.Size([5, 4, 2])).coalesce() - dense = self.ValueTensor([ - [[1, 3], [2, 2], [3, 3], [4, 2]], - [[5, 7], [6, 7], [7, 9], [8, 9]], - [[9, 2], [10, 4], [11, 1], [12, 3]], - [[13, 5], [14, 1], [15, 1], [16, 6]], - [[17, 7], [18, 2], [19, 7], [20, 1]], - ]) - res = dense.sparse_mask(x) - exp_v = self.ValueTensor([[7, 9], [14, 1], [3, 3], [20, 1]]) - expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) - self.assertEqual(res, expected) - - i = self.IndexTensor([ - [1, 3, 0, 4], - [2, 1, 2, 3], - ]) - v = self.ValueTensor(4, 2, 0) - x = self.SparseTensor(i, v, torch.Size([5, 4, 2, 0])).coalesce() - dense = self.ValueTensor(5, 4, 2, 0) - res = dense.sparse_mask(x) - exp_v = self.ValueTensor(4, 2, 0) - expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2, 0])) - self.assertEqual(res, expected) - - @skipIfRocm - def test_sparse_mask_hybrid(self): - self._test_sparse_mask_hybrid_fixed() + self._test_sparse_mask_shape([5, 6]) + self._test_sparse_mask_shape([10, 10, 10]) + self._test_sparse_mask_shape([50, 30, 20]) + self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5]) - self._test_sparse_mask_shape(9, 12, [5, 6], [2, 3]) - self._test_sparse_mask_shape(9, 12, [10, 10, 10], [3]) - self._test_sparse_mask_shape(9, 12, [50, 30, 20], [2]) - self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) - self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2]) - self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2]) - self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2]) - self._test_sparse_mask_shape(9, 12, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(0, 0, [10, 10, 0], [2, 0]) - - def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): + def _test_zeros(self, shape, out_shape_i, out_shape_v=None): out_shape = out_shape_i + (out_shape_v or []) - for nnz in nnzs: + for nnz in [9, 12]: out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) torch.zeros(*shape, out=out) self.assertEqual(tuple(out.size()), tuple(shape)) @@ -987,22 +760,53 @@ def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): self.assertEqual(out._sparseDims(), len(shape)) self.assertEqual(out._denseDims(), 0) + @skipIfRocm + def test_log1p(self): + if self.is_cuda: + input = torch.cuda.sparse.DoubleTensor( + torch.LongTensor([[0], [1], [2]]).transpose(1, 0).cuda(), + torch.FloatTensor([3, 4, 5]).cuda(), + torch.Size([3])) + else: + input = torch.sparse.DoubleTensor( + torch.LongTensor([[0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([3, 4, 5]), + torch.Size([3])) + + expected_output = torch.tensor([3., 4., 5.]).log1p_() + self.assertEqual(expected_output, input.log1p().to_dense()) + self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) + + # test in-place op on uncoalesced input + self.assertExpectedRaises(RuntimeError, lambda: input.log1p_(), subname="uncoalesced") + + input.requires_grad_() + self.assertTrue(input.requires_grad) + + # test autograd + x = input.clone() + y = input.log1p() + self.assertExpectedRaises(RuntimeError, lambda: y.backward(x), subname="backward") + + # test uncoalesced input + input_uncoalesced = torch.sparse.DoubleTensor( + torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([2, 3, 4, 1, 1, 1]), + torch.Size([3])) + self.assertEqual(expected_output, input_uncoalesced.log1p().to_dense()) + 
self.assertEqual(expected_output, input_uncoalesced.coalesce().log1p_().to_dense()) + def test_zeros(self): - def test_shape(i_shapes, v_shapes, shape, nnzs): - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros(nnzs, shape, i_shapes[:i_dim], v_shapes[:v_dim]) - test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 4], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 4], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 4], [9, 12]) - test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 0], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 0], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 0], [9, 12]) - - def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): + i_shapes = [2, 3, 4] + v_shapes = [3, 4, 5, 6] + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros([2, 3, 4], i_shapes[:i_dim], v_shapes[:v_dim]) + + def _test_zeros_like(self, template_shape_i, template_shape_v=None): template_shape_v = template_shape_v or [] template_shape = template_shape_i + template_shape_v - for nnz in nnzs: + for nnz in [9, 12]: t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) res = torch.zeros_like(t) self.assertEqual(tuple(res.size()), tuple(template_shape)) @@ -1012,65 +816,138 @@ def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): self.assertEqual(res._denseDims(), len(template_shape_v)) def test_zeros_like(self): - def test_shape(i_shapes, v_shapes, nnzs): - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros_like(nnzs, i_shapes[:i_dim], v_shapes[:v_dim]) - test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) - test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) - - def _test_log1p_tensor(self, input, dense_tensor): - expected_output = torch.tensor(dense_tensor).log1p_() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) + i_shapes = [2, 3, 4] + v_shapes = [3, 4, 5, 6] + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros_like(i_shapes[:i_dim], v_shapes[:v_dim]) - # test in-place op on uncoalesced input - with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): - input.log1p_() + def _test_sparse_mask_hybrid_fixed(self): + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor([[1, 2], [2, 3], [3, 4], [4, 5]]) + # TODO: This is also testing that, if coalesce is a no-op, + # the indices don't get permuted. I don't know if we actually + # want to give this invariant. 
+ x = self.SparseTensor(i, v, torch.Size([5, 4, 2])).coalesce() + dense = self.ValueTensor([ + [[1, 3], [2, 2], [3, 3], [4, 2]], + [[5, 7], [6, 7], [7, 9], [8, 9]], + [[9, 2], [10, 4], [11, 1], [12, 3]], + [[13, 5], [14, 1], [15, 1], [16, 6]], + [[17, 7], [18, 2], [19, 7], [20, 1]], + ]) + res = dense.sparse_mask(x) + exp_v = self.ValueTensor([[7, 9], [14, 1], [3, 3], [20, 1]]) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) + self.assertEqual(res, expected) - input.requires_grad_() - self.assertTrue(input.requires_grad) + @skipIfRocm + def test_sparse_variable_methods(self): + # TODO: delete when tensor/variable are merged + from torch.autograd import Variable + i = self.IndexTensor([[0, 1, 1], [2, 0, 2]]) + v = self.ValueTensor([3, 4, 5]) + sparse_mat = self.SparseTensor(i, v, torch.Size([2, 3])) + sparse_var = Variable(sparse_mat) + + to_test_one_arg = { + 'zeros_like': lambda x: torch.zeros_like(x), + 'transpose': lambda x: x.transpose(0, 1), + 'transpose_': lambda x: x.transpose_(0, 1), + 't': lambda x: x.t(), + 't_': lambda x: x.t_(), + 'div': lambda x: x.div(2), + 'div_': lambda x: x.div_(2), + 'pow': lambda x: x.pow(2), + '_nnz': lambda x: x._nnz(), + 'is_coalesced': lambda x: x.is_coalesced(), + 'coalesce': lambda x: x.coalesce(), + 'to_dense': lambda x: x.to_dense(), + '_sparseDims': lambda x: x._sparseDims(), + '_denseDims': lambda x: x._denseDims(), + 'norm': lambda x: x.norm(), + 'log1p': lambda x: x.log1p(), + } + + for test_name, test_fn in to_test_one_arg.items(): + var1 = sparse_var.clone() + tensor1 = sparse_mat.clone() + + out_var = test_fn(var1) + out_tensor = test_fn(tensor1) + + if isinstance(out_tensor, int) or isinstance(out_tensor, bool): + if not isinstance(out_var, int) and not isinstance(out_var, bool): + check_var = out_var.data[0] + else: + check_var = out_var + self.assertEqual(out_var, out_tensor) + continue + + # Assume output is variable / tensor + self.assertEqual(test_fn(var1).data, test_fn(tensor1), + test_name) + + i = self.IndexTensor([[0, 0, 1], [1, 2, 1]]) + v = self.ValueTensor([3, 3, 4]) + sparse_mat2 = self.SparseTensor(i, v, torch.Size([2, 3])) + sparse_var2 = Variable(sparse_mat2) + + to_test_two_arg = { + 'sub': lambda x, y: x.sub(y), + 'sub_': lambda x, y: x.sub_(y), + 'mul': lambda x, y: x.mul(y), + 'mul_': lambda x, y: x.mul_(y), + } + + for test_name, test_fn in to_test_two_arg.items(): + var1 = sparse_var.clone() + var2 = sparse_var2.clone() + tensor1 = sparse_mat.clone() + tensor2 = sparse_mat2.clone() + self.assertEqual(test_fn(var1, var2).data, + test_fn(tensor1, tensor2), test_name) + + to_test_mixed = [ + # test name, lambda expression, should_run_when_cuda + ('sspaddmm', lambda sp, de: sp.sspaddmm(sp, de), False), + ('sspaddmm_b', lambda sp, de: sp.sspaddmm(2, sp, de), False), + ('sspaddmm_b_a', lambda sp, de: sp.sspaddmm(3, 2, sp, de), False), + ('addmm', lambda sp, de: de.addmm(sp, de), True), + # TODO: This looks like a typo + ('addmm_', lambda sp, de: de.addmm(sp, de), True), + ('mm', lambda sp, de: torch.mm(sp, de), True), + ('mm_out', lambda sp, de: torch.mm(sp, de, out=de), True), + ] - # test autograd - x = input.clone() - y = input.log1p() - with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): - y.backward(x) + i = self.IndexTensor([[0, 0, 1, 2, 2], [1, 2, 1, 0, 1]]) + v = self.ValueTensor([3, 3, 4, 1, 2]) + sparse_mat = self.SparseTensor(i, v, torch.Size([3, 3])) + sparse_var = Variable(sparse_mat) + dense_mat = sparse_mat.to_dense().random_(0, 5) + 
dense_var = Variable(dense_mat) + + for test_name, test_fn, test_cuda in to_test_mixed: + if sparse_var.is_cuda and not test_cuda: + continue + sp_var = sparse_var.clone() + de_var = dense_var.clone() + sp_mat = sparse_mat.clone() + de_mat = dense_mat.clone() + self.assertEqual(test_fn(sp_var, de_var).data, + test_fn(sp_mat, de_mat), test_name) @skipIfRocm - def test_log1p(self): - input = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([3, 4, 5]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input, [3., 4., 5.]) + def test_sparse_mask_hybrid(self): + self._test_sparse_mask_hybrid_fixed() - # test uncoalesced input - input_uncoalesced = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input_uncoalesced, [3., 4., 5.]) - - input = torch.sparse_coo_tensor( - torch.zeros([2, 0]), - torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), - torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) - - input = torch.sparse_coo_tensor( - torch.zeros([1, 5]), - torch.zeros([5, 6, 0]), - torch.Size([5, 6, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) + self._test_sparse_mask_shape([5, 6], [2, 3]) + self._test_sparse_mask_shape([10, 10, 10], [3]) + self._test_sparse_mask_shape([50, 30, 20], [2]) + self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5], [2]) @skipIfRocm def test_sparse_add_coalesce(self): @@ -1082,55 +959,30 @@ def test_sparse_add_coalesce(self): self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) - i = self.IndexTensor([[1, 2, 1]]) - v = self.ValueTensor(3, 0) - x = self.SparseTensor(i, v, torch.Size([3, 0])) - y = self.SparseTensor(i, v, torch.Size([3, 0])) - z = x + y - - self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) - @cuda_only def test_storage_not_null(self): x = torch.cuda.sparse.FloatTensor(2) self.assertNotEqual(x.get_device(), -1) - x = torch.cuda.sparse.FloatTensor(2, 0) - self.assertNotEqual(x.get_device(), -1) - @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @skipIfRocm def test_same_gpu(self): - def check_device(x, device_id): - self.assertEqual(x.get_device(), device_id) - self.assertEqual(x._values().get_device(), device_id) - self.assertEqual(x._indices().get_device(), device_id) - i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(1) x = self.SparseTensor(i, v, torch.Size([3]), device=1) - check_device(x, 1) - - i = self.IndexTensor([[2]]).cuda(1) - v = self.ValueTensor(1, 0).cuda(1) - x = self.SparseTensor(i, v, torch.Size([3, 0]), device=1) - check_device(x, 1) + self.assertEqual(x.get_device(), 1) + self.assertEqual(x._values().get_device(), 1) + self.assertEqual(x._indices().get_device(), 1) x = self.SparseTensor(3, device=1) - check_device(x, 1) - - x = self.SparseTensor(3, 0, device=1) - check_device(x, 1) + self.assertEqual(x.get_device(), 1) + self.assertEqual(x._values().get_device(), 1) + self.assertEqual(x._indices().get_device(), 1) - i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(0) self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3]))) - i = self.IndexTensor([[2]]).cuda(1) - v = self.ValueTensor(1, 0).cuda(0) - self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3, 0]))) - def 
_test_new_device(self, size, device): with torch.cuda.device(device): x = torch.cuda.sparse.DoubleTensor(*size) @@ -1145,7 +997,6 @@ def test_new_device_single_gpu(self): self._test_new_device((), 0) self._test_new_device((30, 20), 0) self._test_new_device((30, 20, 10), 0) - self._test_new_device((30, 20, 10, 0), 0) @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @@ -1153,67 +1004,49 @@ def test_new_device_multi_gpu(self): self._test_new_device((), 1) self._test_new_device((30, 20), 1) self._test_new_device((30, 20, 10), 1) - self._test_new_device((30, 20, 10, 0), 1) @skipIfRocm def test_new(self): - def test_shape(sparse_dims, nnz, with_size): - x, indices, values = self._gen_sparse(sparse_dims, nnz, with_size) - if not x.is_cuda: - # CUDA sparse tensors currently requires the size to be - # specified if nDimV > 0 - self.assertEqual(x.new(indices, values), x) - self.assertEqual(x.new(indices, values, x.size()), x) - - test_shape(3, 10, 100) - test_shape(3, 0, [100, 100, 0]) + x, indices, values = self._gen_sparse(3, 10, 100) + if not x.is_cuda: + # CUDA sparse tensors currently requires the size to be + # specified if nDimV > 0 + self.assertEqual(x.new(indices, values), x) + self.assertEqual(x.new(indices, values, x.size()), x) @cpu_only # not really, but we only really want to run this once @skipIfRocm def test_factory(self): - for test_empty_tensor in [True, False]: - if test_empty_tensor: - default_size = torch.Size([1, 3, 0]) - size = torch.Size([3, 3, 0]) - else: - default_size = torch.Size([1, 3]) - size = torch.Size([3, 3]) - for include_size in [True, False]: - for use_tensor_idx in [True, False]: - for use_tensor_val in [True, False]: - for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): - # have to include size with cuda sparse tensors - include_size = include_size or use_cuda - dtype = torch.float64 - long_dtype = torch.int64 - device = torch.device('cpu') if not use_cuda else \ - torch.device(torch.cuda.device_count() - 1) - indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) - if test_empty_tensor: - values = self.ValueTensor(1, 0) - else: - if use_tensor_val: - values = torch.tensor([1.], dtype=dtype) - else: - values = 1. - if include_size: - sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, - device=device, requires_grad=True) - else: - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, - device=device, requires_grad=True) - self.assertEqual(indices, sparse_tensor._indices()) - self.assertEqual(values, sparse_tensor._values()) - self.assertEqual(size if include_size else default_size, sparse_tensor.size()) - self.assertEqual(dtype, sparse_tensor.dtype) - if use_cuda: - self.assertEqual(device, sparse_tensor._values().device) - self.assertEqual(True, sparse_tensor.requires_grad) + default_size = torch.Size([1, 3]) + size = torch.Size([3, 3]) + for include_size in [True, False]: + for use_tensor_idx in [True, False]: + for use_tensor_val in [True, False]: + for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): + # have to include size with cuda sparse tensors + include_size = include_size or use_cuda + dtype = torch.float64 + long_dtype = torch.int64 + device = torch.device('cpu') if not use_cuda else torch.device(torch.cuda.device_count() - 1) + indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) + values = torch.tensor([1.], dtype=dtype) if use_tensor_val else 1. 
+ if include_size: + sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, + device=device, requires_grad=True) + else: + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, + device=device, requires_grad=True) + self.assertEqual(indices, sparse_tensor._indices()) + self.assertEqual(values, sparse_tensor._values()) + self.assertEqual(size if include_size else default_size, sparse_tensor.size()) + self.assertEqual(dtype, sparse_tensor.dtype) + if use_cuda: + self.assertEqual(device, sparse_tensor._values().device) + self.assertEqual(True, sparse_tensor.requires_grad) @skipIfRocm def test_factory_size_check(self): - indices = self.IndexTensor([[1, 2], - [0, 2]]) + indices = self.IndexTensor([[1, 2], [0, 2]]) values = self.ValueTensor([.5, .5]) sizes = torch.Size([2, 3]) with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): @@ -1223,34 +1056,12 @@ def test_factory_size_check(self): with self.assertRaisesRegex(RuntimeError, "found negative index"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[1, 2], - [0, 2]]) - values = self.ValueTensor(2, 1, 0) - sizes = torch.Size([2, 3, 1, 0]) - with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): - torch.sparse_coo_tensor(indices, values, sizes) - - indices = self.IndexTensor([[1, 2], - [0, 2]]) - values = self.ValueTensor(2, 2, 2) - sizes = torch.Size([0, 0, 2, 2]) - with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): - torch.sparse_coo_tensor(indices, values, sizes) - - indices = self.IndexTensor([[1, 2], - [0, 2]]) + indices = self.IndexTensor([[1, 2], [0, 2]]) values = self.ValueTensor([[1, 1, 1], [1, 1, 1]]) sizes = torch.Size([3, 3, 2]) with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[1, 2], - [0, 2]]) - values = self.ValueTensor(2, 1, 0) - sizes = torch.Size([3, 3, 2, 0]) - with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): - torch.sparse_coo_tensor(indices, values, sizes) - def test_factory_default(self): tensor = self.SparseTensor() expected_indices = self.IndexTensor(1, 0) @@ -1284,31 +1095,25 @@ def test_factory_nnz(self): with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[0]]) # (sparseDims, nnz): (1, 1) - values = self.ValueTensor(2, 0) # (nnz, ...): (2, 0) - sizes = torch.Size([2, 0]) - with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): - torch.sparse_coo_tensor(indices, values, sizes) + def _test_factory_tensor_shape(self, i_shape, v_shape, size, expected_size): + device = 'cuda' if self.is_cuda else 'cpu' + if size: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) + else: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) + expected_indices = torch.empty(i_shape, device=device) + expected_values = torch.empty(v_shape, device=device) + expected_size = torch.Size(expected_size) + self.assertEqual(t._indices(), expected_indices) + self.assertEqual(t._values(), expected_values) + self.assertEqual(t.size(), expected_size) def test_factory_nnz_zero(self): - def test_shape(i_shape, v_shape, size, expected_size): - device = 'cuda' if self.is_cuda else 'cpu' - if size: - t = torch.sparse_coo_tensor(torch.empty(i_shape), 
torch.empty(v_shape), torch.Size(size), device=device) - else: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) - expected_indices = torch.empty(i_shape, device=device) - expected_values = torch.empty(v_shape, device=device) - expected_size = torch.Size(expected_size) - self.assertEqual(t._indices(), expected_indices) - self.assertEqual(t._values(), expected_values) - self.assertEqual(t.size(), expected_size) - - test_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) - test_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) - test_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) - test_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) - test_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) + self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) + self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) + self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) + self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) + self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) @skipIfRocm def test_factory_dense_dims(self): @@ -1318,12 +1123,6 @@ def test_factory_dense_dims(self): with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[0]]) - values = self.ValueTensor(1, 2, 3, 0) - sizes = torch.Size([1, 3, 4, 0]) - with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): - torch.sparse_coo_tensor(indices, values, sizes) - @cpu_only def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) @@ -1333,13 +1132,6 @@ def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1])) self.assertEqual(torch.int64, t.dtype) - t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.FloatTensor(1, 0)) - self.assertEqual(torch.float32, t.dtype) - t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.DoubleTensor(1, 0)) - self.assertEqual(torch.float64, t.dtype) - t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.LongTensor(1, 0)) - self.assertEqual(torch.int64, t.dtype) - @cuda_only @skipIfRocm def test_factory_device_type_inference(self): @@ -1348,66 +1140,41 @@ def test_factory_device_type_inference(self): for indices_device in ['cuda', 'cpu']: for values_device in ['cuda', 'cpu']: for sparse_device in ['cuda', 'cpu', None]: - for test_empty_tensor in [True, False]: - if test_empty_tensor: - t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), - self.ValueTensor(1, 0).to(values_device), - (1, 3, 0), device=sparse_device) - else: - t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), - torch.tensor([1.], device=values_device), - (1, 3), device=sparse_device) - should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') - self.assertEqual(should_be_cuda, t.is_cuda) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + torch.tensor([1.], device=values_device), + (1, 3), device=sparse_device) + should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') + self.assertEqual(should_be_cuda, t.is_cuda) @cpu_only def test_factory_copy(self): - def test_tensor(indices, values, indices_equal, 
values_equal): - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - if indices_equal: - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - else: - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - if values_equal: - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) - else: - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) - # both correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float64) - test_tensor(indices, values, True, True) - - indices = torch.tensor(([0], [2]), dtype=torch.int64) - values = torch.DoubleTensor(1, 0) - test_tensor(indices, values, True, True) + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) # only indices correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float32) - test_tensor(indices, values, True, False) - - indices = torch.tensor(([0], [2]), dtype=torch.int64) - values = torch.FloatTensor(1, 0) - test_tensor(indices, values, True, True) # An empty tensor's data_ptr is always equal to 0 + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) # only values correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float64) - test_tensor(indices, values, False, True) - - indices = torch.tensor(([0], [2]), dtype=torch.int32) - values = torch.DoubleTensor(1, 0) - test_tensor(indices, values, False, True) + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) # neither correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float32) - test_tensor(indices, values, False, False) - - indices = torch.tensor(([0], [2]), dtype=torch.int32) - values = torch.FloatTensor(1, 0) - test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) @cpu_only # just run once, we test both cpu and cuda def test_constructor_device_legacy(self): @@ -1458,15 +1225,9 @@ def test_is_sparse(self): x = torch.randn(3, 3) self.assertFalse(x.is_sparse) - x = torch.randn(3, 3, 0) - self.assertFalse(x.is_sparse) - x = self.SparseTensor() self.assertTrue(x.is_sparse) - x = self.SparseTensor(1, 0) - self.assertTrue(x.is_sparse) - @skipIfRocm def test_resize_as(self): def do_test(t): @@ -1477,8 +1238,6 @@ def do_test(t): self.assertEqual(t, t + y) do_test(self.SparseTensor()) - do_test(self.SparseTensor(3, 0)) - do_test(self.SparseTensor(3, 3)) @skipIfRocm def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @@ -1505,13 +1264,10 @@ def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @skipIfRocm def test_resize(self): - # 1. 
Expand the size of some dense dimensions [Supported] + # 1. Increase the size of some dense dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 4], [2, 2, 4]) - self._test_resize_shape([1, 1], [1, 2, 0], [2, 2, 0], - [1, 1], [1, 2, 4], [2, 2, 4]) - # 2. Expand the size of some sparse dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3], [4, 2, 3]) @@ -1520,18 +1276,11 @@ def test_resize(self): self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], [2, 0], [0, 2, 4, 5], [1, 1, 2, 4, 5]) - self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], - [2, 0], [0, 2, 4, 0], [1, 1, 2, 4, 0]) - # 4. Add dims to dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3, 4], [2, 2, 3, 4]) - with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): - self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], - [1, 1], [1, 2, 3, 0], [2, 2, 3, 0]) - # 5. Remove dims from dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], @@ -1552,10 +1301,6 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 2], [2, 2, 2]) - with self.assertRaisesRegex(RuntimeError, "shrinking the size of dense dimensions"): - self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], - [1, 1], [1, 2, 0], [2, 2, 0]) - def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) @@ -1563,8 +1308,6 @@ def test_is_nonzero(self): self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (0., 0.), (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (-1., 1.), (1,)).is_nonzero()) self.assertTrue(torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, []).is_nonzero()) # scalar sparse tensor - with self.assertRaisesRegex(RuntimeError, "bool value of Tensor with no values is ambiguous"): - torch.sparse_coo_tensor(([0, 1],), self.ValueTensor(2, 0), (4, 0)).is_nonzero() class TestUncoalescedSparse(TestSparse): @@ -1595,26 +1338,11 @@ class TestSparseOneOff(TestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm def test_cuda_from_cpu(self): - with self.assertRaisesRegex( - RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): - torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4), - [3, 4, 4]) - - with self.assertRaisesRegex( - RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): - torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4, 0), - [3, 4, 4, 0]) - - with self.assertRaisesRegex( - RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): - torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), - torch.randn(0, 4, 4, 0), - [0, 4, 4, 0]) + self.assertExpectedRaises( + RuntimeError, + lambda: torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4), + [3, 4, 4])) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm @@ -1623,22 +1351,7 @@ def test_cuda_sparse_cpu_dense_add(self): sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4).cuda(), [3, 4, 4]) - with 
self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): - x + sparse_y - - x = torch.zeros(3, 4, 4, 0) - sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4, 0).cuda(), - [3, 4, 4, 0]) - with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): - x + sparse_y - - x = torch.zeros(0, 4, 4, 0) - sparse_y = torch.cuda.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), - torch.randn(0, 4, 4, 0).cuda(), - [0, 4, 4, 0]) - with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): - x + sparse_y + self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) if __name__ == '__main__': From 9eb72889b49a2fbeeb93313d38c1f7ece7eed014 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Tue, 18 Sep 2018 12:14:38 -0700 Subject: [PATCH 228/237] Add successor/predecessor functions Summary: More functionality to prep nomnigraph for scheduler implementations Reviewed By: duc0 Differential Revision: D9794686 fbshipit-source-id: b460859d8ff965d0049b2a696bd8d2f5c97f3f86 --- caffe2/python/nomnigraph.py | 2 ++ caffe2/python/nomnigraph_test.py | 33 ++++++++++++++++++++++++ caffe2/python/pybind_state_nomni.cc | 39 ++++++++++++++++++++++++++--- 3 files changed, 70 insertions(+), 4 deletions(-) diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index 417ad64ccc2bb2..abe1971680a7e9 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -68,7 +68,9 @@ def render(s): NeuralNetOperator = C.NeuralNetOperator +Operator = C.NeuralNetOperator NeuralNetData = C.NeuralNetData +Data = C.NeuralNetData NNSubgraph = C.NNSubgraph NNMatchGraph = C.NNMatchGraph Graph = C.Graph diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 1956d2e9f01b68..9288364bbcb944 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -175,6 +175,39 @@ def test_convertToProto(self): for a, b in zip(new_netdef.external_output, net.Proto().external_output): assert a == b + def test_node_interactions(self): + nn = ng.NNModule() + dfg = nn.dataFlow + test1 = dfg.createNode(ng.Operator("test1")) + test2 = dfg.createNode(ng.Operator("test2")) + x = dfg.createNode(ng.Data("x")) + dfg.createEdge(test1, x) + dfg.createEdge(x, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 1 + assert p[0] == test1 + + # Add another node + test3 = dfg.createNode(ng.Operator("test3")) + y = dfg.createNode(ng.Data("y")) + dfg.createEdge(test3, y) + dfg.createEdge(y, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 2 + assert test1 in p + assert test3 in p + + # Successors + assert len(test2.getOperatorSuccessors()) == 0 + assert len(test1.getOperatorSuccessors()) == 1 + assert test1.getOperatorSuccessors()[0] == test2 + + # Check all the nodes are valid (pybind ownership test) + for node in [test1, test2, test3]: + assert node.isOperator() + for node in [x, y]: + assert node.isTensor() + def test_annotation_basic(self): annot = ng.Annotation() annot.setDevice("woot") diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index fa7baf60c92541..1f05d3bd1beeb4 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -202,10 +202,41 @@ void addNomnigraphMethods(pybind11::module& m) { .def( "getAnnotation", [](NNGraph::NodeRef n) { return getOrAddCaffe2Annotation(n); }) - 
.def("setAnnotation", [](NNGraph::NodeRef n, Caffe2Annotation annot) { - auto* nnOp = nn::get(n); - nnOp->setAnnotation(nom::util::make_unique(annot)); - }); + .def( + "setAnnotation", + [](NNGraph::NodeRef n, Caffe2Annotation annot) { + auto* nnOp = nn::get(n); + nnOp->setAnnotation( + nom::util::make_unique(annot)); + }) + .def( + "getOperatorPredecessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector pred; + for (const auto& inEdge : n->getInEdges()) { + auto data = inEdge->tail(); + if (nn::hasProducer(data)) { + pred.emplace_back(nn::getProducer(data)); + } + } + return pred; + }, + py::return_value_policy::reference) + .def( + "getOperatorSuccessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector succ; + for (const auto& outEdge : n->getOutEdges()) { + auto data = outEdge->head(); + for (const auto& consumer : nn::getConsumers(data)) { + succ.emplace_back(consumer); + } + } + return succ; + }, + py::return_value_policy::reference); py::class_ nnop(m, "NeuralNetOperator"); py::class_ nndata(m, "NeuralNetData"); From 1d399a80a03bb2b2f2e6db514c48c6cfd08363a0 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 18 Sep 2018 13:10:08 -0700 Subject: [PATCH 229/237] Handle pollution of MAX, MIN and CHECK macros. (#11805) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11805 Some of our headers in Caffe2 pollute the macro namespace with things like MAX, MIN, CHECK, so I renamed these in places where this is a problem. This patch courtesy of gchanan, extracted out of #11721 Reviewed By: Yangqing Differential Revision: D9917757 fbshipit-source-id: 17fc692ca04b208dcb8ae00731ed60e393284f7c --- aten/src/TH/THTensorCopy.cpp | 2 ++ aten/src/TH/generic/THTensorCopy.cpp | 18 +++++++----------- torch/csrc/cuda/nccl.cpp | 4 ++-- torch/csrc/cuda/nccl.h | 6 +++--- torch/csrc/cuda/python_nccl.cpp | 14 +++++++------- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/aten/src/TH/THTensorCopy.cpp b/aten/src/TH/THTensorCopy.cpp index d8df519e26bdbc..482a7b986f5302 100644 --- a/aten/src/TH/THTensorCopy.cpp +++ b/aten/src/TH/THTensorCopy.cpp @@ -1,6 +1,8 @@ #include "THTensor.hpp" #include "THVector.h" +#include + #include "generic/THTensorCopy.cpp" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp index f04c75cd3f9105..91a60275a6c03e 100644 --- a/aten/src/TH/generic/THTensorCopy.cpp +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -26,13 +26,11 @@ int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { // special case copy where tensor is contiguous and src is a transposed matrix // This can be generalized to most copies, but it's tricker void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { - #define MIN(x, y) (((x) < (y)) ? (x) : (y)) - #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #ifdef TH_REAL_IS_BYTE - const int BLOCK_SZ = 120; + const int64_t BLOCK_SZ = 120; #else - const int BLOCK_SZ = 60; + const int64_t BLOCK_SZ = 60; #endif THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); @@ -48,8 +46,8 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { scalar_t *spo = sp + R + C * NR; scalar_t *rpo = rp + C + R * NC; - int nr = MIN(NR - R, BLOCK_SZ); - int nc = MIN(NC - C, BLOCK_SZ); + int nr = std::min(NR - R, BLOCK_SZ); + int nc = std::min(NC - C, BLOCK_SZ); // 1. 
copy columns from src to buf for (int c = 0; c < nc; c++) { @@ -57,10 +55,10 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } // 2. transpose buf in place - int rc_max = MAX(nr, nc); - int rc_min = MIN(nr, nc); + int rc_max = std::max(nr, nc); + int rc_min = std::min(nr, nc); for (int r = 0; r < rc_max; r++) { - int end = MIN(r, rc_min); + int end = std::min(r, rc_min); for (int c = 0; c < end; c++) { scalar_t tmp = bp[r + BLOCK_SZ * c]; bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; @@ -75,8 +73,6 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } } c10::raw::intrusive_ptr::decref(buf); - #undef MIN - #undef MAX } void THTensor_(copy)(THTensor *tensor, THTensor *src) diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index ab0fecfd761c24..e769b85f8e2d90 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -28,7 +28,7 @@ struct NcclCommList { int ndevices; NcclCommList(const std::vector& devices) : comms(new ncclComm_t[devices.size()]), ndevices(devices.size()) { - CHECK(ncclCommInitAll(comms.get(), devices.size(), devices.data())); + NCCL_CHECK(ncclCommInitAll(comms.get(), devices.size(), devices.data())); } NcclCommList(NcclCommList&& foo) = default; ~NcclCommList() { @@ -219,7 +219,7 @@ void broadcast(TensorList tensors, const stream_list& streams, const comm_list& AT_CHECK(static_cast(numel) <= static_cast(count_max), "Broadcast tensor has ", numel, " elements, which exceeds the " "maximum NCCL supports (", count_max, ")"); - CHECK(ncclBcast(tensors[i].data_ptr(), numel, data_type, 0, comms[i], stream)); + NCCL_CHECK(ncclBcast(tensors[i].data_ptr(), numel, data_type, 0, comms[i], stream)); } #else throw std::runtime_error("PyTorch built without NCCL support"); diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index 919363474b9f9f..349d8bcfdf507f 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -12,7 +12,7 @@ namespace detail { void throw_nccl_error(ncclResult_t status); -static inline void CHECK(ncclResult_t status) { +static inline void NCCL_CHECK(ncclResult_t status) { if (status != ncclSuccess) { throw_nccl_error(status); } @@ -21,12 +21,12 @@ static inline void CHECK(ncclResult_t status) { struct AutoNcclGroup { AutoNcclGroup() { #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - CHECK(ncclGroupStart()); + NCCL_CHECK(ncclGroupStart()); #endif } ~AutoNcclGroup() { #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - CHECK(ncclGroupEnd()); + NCCL_CHECK(ncclGroupEnd()); #endif } }; diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp index 49d36e51649a4a..79a859f20d1075 100644 --- a/torch/csrc/cuda/python_nccl.cpp +++ b/torch/csrc/cuda/python_nccl.cpp @@ -26,7 +26,7 @@ PyObject * THCPModule_nccl_version(PyObject *self, PyObject *args) { PyObject * THCPModule_nccl_unique_id(PyObject *self, PyObject *args) { HANDLE_TH_ERRORS ncclUniqueId id; - CHECK(ncclGetUniqueId(&id)); + NCCL_CHECK(ncclGetUniqueId(&id)); return PyBytes_FromStringAndSize((char*)&id, NCCL_UNIQUE_ID_BYTES); END_HANDLE_TH_ERRORS } @@ -109,7 +109,7 @@ PyObject * THCPModule_nccl_init_rank(PyObject *self, PyObject *args) { memcpy(&commId, id, NCCL_UNIQUE_ID_BYTES); ncclComm_t comm; with_no_gil([&]{ - CHECK(ncclCommInitRank(&comm, nranks, commId, rank)); + NCCL_CHECK(ncclCommInitRank(&comm, nranks, commId, rank)); }); return PyCapsule_New(comm, COMM_CAPSULE_NAME, &destroy_nccl_comm); END_HANDLE_TH_ERRORS @@ -149,7 +149,7 @@ PyObject * THCPModule_nccl_reduce(PyObject *self, PyObject *args) { int device = 
inputs[i].get_device(); device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); - CHECK(ncclReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, (ncclRedOp_t) op, root, comms[i], stream)); } }); @@ -191,7 +191,7 @@ PyObject * THCPModule_nccl_all_reduce(PyObject *self, PyObject *args) { int device = inputs[i].get_device(); device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); - CHECK(ncclAllReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclAllReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, (ncclRedOp_t) op, comms[i], stream)); } }); @@ -255,10 +255,10 @@ PyObject * THCPModule_nccl_all_gather(PyObject *self, PyObject *args) { device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - CHECK(ncclAllGather(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclAllGather(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, comms[i], stream)); #else - CHECK(ncclAllGather(inputs[i].data_ptr(), count, data_type, + NCCL_CHECK(ncclAllGather(inputs[i].data_ptr(), count, data_type, outputs[i].data_ptr(), comms[i], stream)); #endif } @@ -299,7 +299,7 @@ PyObject * THCPModule_nccl_reduce_scatter(PyObject *self, PyObject *args) { int device = inputs[i].get_device(); device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); - CHECK(ncclReduceScatter(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclReduceScatter(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, (ncclRedOp_t) op, comms[i], stream)); } }); From 7d25fa3c721f6b515af5bbfc704e96a978bef3c9 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 18 Sep 2018 13:39:42 -0700 Subject: [PATCH 230/237] Emit Undefined type for value when it is Dynamic type (#11810) Summary: For example, outputs of control blocks often have Dynamic type, and when we try to export them to ONNX we get an invalid proto, since `elem_type` is not populated on the TypeInfoProto. 
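For illustration only (not part of this patch), a minimal repro sketch in the spirit of the script-module export tests in test/test_jit.py; the exact scripting idioms (the bool() cast, the module shape) are assumptions, but torch.onnx.export_to_pretty_string with example_outputs is the same entry point those tests use:

    # Hypothetical sketch: export a script module whose if-block output is
    # typed Dynamic. Before this change the emitted TypeInfoProto carried no
    # elem_type, so the ONNX checker rejected the proto.
    import torch

    class IfModule(torch.jit.ScriptModule):
        @torch.jit.script_method
        def forward(self, x):
            if bool(x.sum() > 0):
                y = torch.neg(x)
            else:
                y = x
            return y

    m = IfModule()
    example = torch.zeros(1, 2, 3)
    # example_outputs is needed when exporting script modules
    torch.onnx.export_to_pretty_string(
        m, (example,), None, verbose=False, example_outputs=m(example))
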
This makes it so at least we can get past the checker, since having a dynamic typed output from a control block should still be semantically valid Pull Request resolved: https://github.com/pytorch/pytorch/pull/11810 Differential Revision: D9922754 Pulled By: jamesr66a fbshipit-source-id: 5c66113cc302a9d9b8b9f5a8605473d3c6ad5af1 --- torch/csrc/jit/export.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 3a0500b765dead..1984f35fcc8974 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -179,6 +179,8 @@ void EncoderBase::EncodeValueInfo( shape->mutable_dim(i)->set_dim_value(sizes[i]); } tensor_type->set_elem_type(ATenTypeToOnnxType(node_type->scalarType())); + } else { + tensor_type->set_elem_type(onnx::TensorProto_DataType_UNDEFINED); } } @@ -500,6 +502,7 @@ void ModuleEncoder::EncodeTypeInfo( auto kind = type->kind(); if (kind == TypeKind::DynamicType) { type_proto->set_denotation("DynamicType"); + tensortype_proto->set_elem_type(onnx::TensorProto_DataType_UNDEFINED); } else if (kind == TypeKind::TensorType) { type_proto->set_denotation("TensorType"); // encode the number of dimensions by pushing that number of ones into the shape proto From d4e1fa45d055a1b00e8b7cfefa5c9f5db9ae6160 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Tue, 18 Sep 2018 13:41:11 -0700 Subject: [PATCH 231/237] allow no-alpha add/sub in onnx symbolic (#10972) Summary: The PR fixes #10873 The context is aten::add and aten::sub ST overloads don't have alpha, so onnx symbolic does not match. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10972 Reviewed By: jamesr66a Differential Revision: D9724224 Pulled By: wanchaol fbshipit-source-id: eb5d1b09fa8f1604b288f4a62b8d1f0bc66611af --- ...nnx_export_script_non_alpha_add_sub.expect | 22 +++++++++++++++++++ .../expect/TestOperators.test_rsub.expect | 4 ++-- test/test_jit.py | 16 ++++++++++++++ torch/onnx/symbolic.py | 17 ++++++-------- 4 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect diff --git a/test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect b/test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect new file mode 100644 index 00000000000000..1c2b3c655d3324 --- /dev/null +++ b/test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect @@ -0,0 +1,22 @@ +ModelProto { + producer_name: "pytorch" + domain: "" + doc_string: "" + graph: + GraphProto { + name: "torch-jit-export" + inputs: [{name: "x", type:Tensor dims: 3 4}] + outputs: [{name: "7", type:Tensor dims: 1}] + initializers: [] + nodes: [ + Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Shape", inputs: [x], outputs: [2], attributes: []}, + Node {type: "Gather", inputs: [2,1], outputs: [3], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Add", inputs: [3,4], outputs: [5], attributes: []}, + Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Sub", inputs: [5,6], outputs: [7], attributes: []} + ] + } + opset_import: [OperatorSetIdProto { domain: }], +} diff --git a/test/onnx/expect/TestOperators.test_rsub.expect 
b/test/onnx/expect/TestOperators.test_rsub.expect index 49fa976a5e1bd6..7f2e5284e0cef8 100644 --- a/test/onnx/expect/TestOperators.test_rsub.expect +++ b/test/onnx/expect/TestOperators.test_rsub.expect @@ -8,8 +8,8 @@ graph { attribute { name: "value" t { - data_type: DOUBLE - raw_data: "\000\000\000\000\000\000\360?" + data_type: INT64 + raw_data: "\001\000\000\000\000\000\000\000" } type: TENSOR } diff --git a/test/test_jit.py b/test/test_jit.py index d91b5b4653d57c..b77846908f72ab 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -5331,6 +5331,22 @@ def forward(self, x): mte, (torch.zeros(1, 2, 3),), None, verbose=False, example_outputs=outputs, export_raw_ir=True)) + def test_onnx_export_script_non_alpha_add_sub(self): + class ModuleToExport(torch.jit.ScriptModule): + def __init__(self): + super(ModuleToExport, self).__init__() + + @torch.jit.script_method + def forward(self, x): + bs = x.size(0) + 1 + return bs - 1 + + mte = ModuleToExport() + outputs = torch.LongTensor([mte(torch.rand(3, 4))]) + self.assertExpected(torch.onnx.export_to_pretty_string( + mte, (torch.rand(3, 4),), None, verbose=False, + example_outputs=outputs)) + def test_onnx_export_script_module_if(self): class ModuleToExport(torch.jit.ScriptModule): def __init__(self): diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index d5b586c3846d6f..3f334304705ded 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -192,25 +192,22 @@ def unused(g): return g.op("prim::Undefined") -@parse_args('v', 'v', 't') -def add(g, self, other, alpha): - if _scalar(alpha) != 1: +def add(g, self, other, alpha=None): + # default alpha arg is to allow no-alpha add (aten add st overload no alpha) + if alpha and _scalar(_maybe_get_scalar(alpha)) != 1: return _unimplemented("add", "alpha != 1") # See Note [Pointwise by scalar] other = _maybe_get_scalar(other) return g.op("Add", self, _if_scalar_type_as(g, other, self)) -@parse_args('v', 'v', 't') -def sub(g, self, other, alpha): - if _scalar(alpha) != 1: +def sub(g, self, other, alpha=None): + # default alpha arg is to allow no-alpha sub (aten sub st overload no alpha) + if alpha and _scalar(_maybe_get_scalar(alpha)) != 1: return _unimplemented("sub", "alpha != 1") # See Note [Pointwise by scalar]. Note that self or other may be scalars. 
other = _maybe_get_scalar(other) - self = _maybe_get_scalar(self) - self = _if_scalar_type_as(g, self, other) - other = _if_scalar_type_as(g, other, self) - return g.op("Sub", self, other) + return g.op("Sub", self, _if_scalar_type_as(g, other, self)) def mul(g, self, other): From 8ad846fda55f8cd04e832440c794ace04daf40fa Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Tue, 18 Sep 2018 14:00:52 -0700 Subject: [PATCH 232/237] Don't build Detectron ops with NO_CAFFE2_OPS=1 (#11799) Summary: cc apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11799 Differential Revision: D9922745 Pulled By: orionr fbshipit-source-id: b88724b7c2919aabc00d98658e8e563233e01c85 --- modules/detectron/CMakeLists.txt | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt index 1791ca27a98590..971baa41adebbe 100644 --- a/modules/detectron/CMakeLists.txt +++ b/modules/detectron/CMakeLists.txt @@ -1,18 +1,20 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu) -# Note(ilijar): Since Detectron ops currently have no -# CPU implementation, we only build GPU ops for now. -if (USE_CUDA) - CUDA_ADD_LIBRARY( - caffe2_detectron_ops_gpu SHARED - ${Detectron_CPU_SRCS} - ${Detectron_GPU_SRCS}) +if (BUILD_CAFFE2_OPS) + # Note(ilijar): Since Detectron ops currently have no + # CPU implementation, we only build GPU ops for now. + if (USE_CUDA) + CUDA_ADD_LIBRARY( + caffe2_detectron_ops_gpu SHARED + ${Detectron_CPU_SRCS} + ${Detectron_GPU_SRCS}) - target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) - install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) -elseif(NOT IOS_PLATFORM) - add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) - target_link_libraries(caffe2_detectron_ops caffe2) - install(TARGETS caffe2_detectron_ops DESTINATION lib) + target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) + install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) + elseif(NOT IOS_PLATFORM) + add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) + target_link_libraries(caffe2_detectron_ops caffe2) + install(TARGETS caffe2_detectron_ops DESTINATION lib) + endif() endif() From e585f2fb480f1bdf2a570a0f049ab3e4f45ca3a1 Mon Sep 17 00:00:00 2001 From: sven Date: Tue, 18 Sep 2018 14:47:54 -0700 Subject: [PATCH 233/237] Polish CPP docs, Minor Python Docs Fixes (#11722) Differential Revision: D9919120 Pulled By: goldsborough fbshipit-source-id: bf14cbe4ab79524495957cb749828046af864aab --- .gitignore | 5 +- .travis.yml | 2 +- docs/Makefile | 2 +- docs/cpp/Doxyfile | 2032 ----------------- docs/cpp/Makefile | 14 +- docs/cpp/source/Doxyfile | 132 ++ docs/cpp/{ => source}/building.rst | 0 docs/cpp/{ => source}/check-doxygen.sh | 8 +- docs/cpp/{ => source}/conf.py | 56 +- docs/cpp/{ => source}/contributing.rst | 0 docs/cpp/{ => source}/examples.rst | 0 docs/cpp/{ => source}/index.rst | 0 docs/source/conf.py | 26 +- torch/csrc/api/include/torch/nn/module.h | 3 + .../api/include/torch/nn/modules/functional.h | 18 +- 15 files changed, 223 insertions(+), 2075 deletions(-) delete mode 100644 docs/cpp/Doxyfile create mode 100644 docs/cpp/source/Doxyfile rename docs/cpp/{ => source}/building.rst (100%) rename docs/cpp/{ => source}/check-doxygen.sh (60%) rename docs/cpp/{ => source}/conf.py (81%) rename docs/cpp/{ => source}/contributing.rst (100%) rename docs/cpp/{ => source}/examples.rst (100%) rename docs/cpp/{ 
=> source}/index.rst (100%) diff --git a/.gitignore b/.gitignore index b600e7515b9134..a56ae2ab81df84 100644 --- a/.gitignore +++ b/.gitignore @@ -25,9 +25,8 @@ aten/src/ATen/cuda/CUDAConfig.h build/ dist/ docs/src/**/* -docs/cpp/xml/ -docs/cpp/html/ -docs/cpp/api/ +docs/cpp/build +docs/cpp/source/api test/.coverage test/cpp/api/mnist test/custom_operator/model.pt diff --git a/.travis.yml b/.travis.yml index be45e69f67cb2f..77d430ee8917a4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,4 +28,4 @@ matrix: script: mypy @mypy-files.txt - env: CPP_DOC_CHECK install: sudo apt-get install -y doxygen - script: cd docs/cpp && ./check-doxygen.sh + script: cd docs/cpp/source && ./check-doxygen.sh diff --git a/docs/Makefile b/docs/Makefile index 4a56c12ca22d89..59c2397bb023e1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -39,4 +39,4 @@ html-stable: clean: @echo "Removing everything under 'build'.." - @rm -r $(BUILDDIR)/html/ $(BUILDDIR)/doctrees + @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile deleted file mode 100644 index 37abadc83ff7ca..00000000000000 --- a/docs/cpp/Doxyfile +++ /dev/null @@ -1,2032 +0,0 @@ -# Doxyfile 1.8.14 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See -# https://www.gnu.org/software/libiconv/ for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "PyTorch" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. 
- -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = build - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. 
Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = YES - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. - -ALIASES = "rst=\verbatim embed:rst:leading-asterisk" -ALIASES += "endrst=\endverbatim" - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. 
-# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. 
file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = NO - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. 
- -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = NO - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. 
- -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. 
Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -GENERATE_LATEX = NO - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = YES - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. 
-# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = NO - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = ../../torch/csrc/api/include \ - ../../torch/csrc/api/src \ - ../../torch/csrc/jit/custom_operator.h \ - ../../torch/csrc/jit/import.h \ - ../../torch/csrc/jit/ivalue.h \ - ../../torch/csrc/jit/script/module.h \ - ../../aten/src/ATen/ATen.h \ - ../../aten/src/ATen/Backend.h \ - ../../aten/src/ATen/Device.h \ - ../../aten/src/ATen/DeviceGuard.h \ - ../../aten/src/ATen/Layout.h \ - ../../aten/src/ATen/OptionsGuard.h \ - ../../aten/src/ATen/Scalar.h \ - ../../aten/src/ATen/TensorOptions.h \ - ../../aten/src/ATen/core/ArrayRef.h \ - ../../aten/src/ATen/core/DeviceType.h \ - ../../aten/src/ATen/core/Error.h \ - ../../aten/src/ATen/core/Half.h \ - ../../aten/src/ATen/core/ScalarType.h \ - ../../aten/src/ATen/cuda/CUDAGuard.h \ - ../../aten/src/ATen/cuda/CUDAStream.h \ - ../../aten/src/ATen/cuda/CUDAContext.h \ - ../../aten/src/ATen/cudnn/Descriptors.h \ - ../../aten/src/ATen/cudnn/Handles.h \ - ../../aten/src/ATen/cudnn/Types.h \ - ../../aten/src/ATen/cudnn/Utils.h \ - ../../aten/src/ATen/mkl/Descriptors.h \ - ../../build/aten/src/ATen/Tensor.h \ - ../../build/aten/src/ATen/Functions.h \ - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. 
Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. -# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, -# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. - -FILE_PATTERNS = *.h - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. 
- -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# -# -# where is the value of the INPUT_FILTER tag, and is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. 
- -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see https://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
- -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output -# The default value is: YES. - -GENERATE_HTML = NO - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. -# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# https://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. 
-
-HTML_DYNAMIC_SECTIONS = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: https://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# https://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want to formulas look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/