From 7169906249fb57e0a9be00902abcb26457dcc14c Mon Sep 17 00:00:00 2001 From: zou3519 Date: Wed, 29 Aug 2018 09:32:08 -0700 Subject: [PATCH 01/42] torch.digamma (#10967) Summary: Fixes #10307 cc SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/10967 Differential Revision: D9546748 Pulled By: zou3519 fbshipit-source-id: 764e27b1cc8dd487270b3ffa653b806c86f717dd --- docs/source/torch.rst | 1 + torch/_torch_docs.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/docs/source/torch.rst b/docs/source/torch.rst index fa2f92092758a4..d385ff07d323d5 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -169,6 +169,7 @@ Pointwise Ops .. autofunction:: cos .. autofunction:: cosh .. autofunction:: div +.. autofunction:: digamma .. autofunction:: erf .. autofunction:: erfc .. autofunction:: erfinv diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 6561c7a7c23889..27f111a471fa16 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1168,6 +1168,26 @@ def parse_kwargs(desc): [ 1.0500, 0.7336, -0.3836, -1.1015]]]) """) +add_docstr(torch.digamma, + r""" +digamma(input) -> Tensor + +Computes the logarithmic derivative of the gamma function on `input`. + +.. math:: + \psi(x) = \frac{d}{dx} \ln\left(\Gamma\left(x\right)\right) = \frac{\Gamma'(x)}{\Gamma(x)} + +Args: + input (Tensor): the tensor to compute the digamma function on + +Example:: + + >>> a = torch.tensor([1, 0.5]) + >>> torch.digamma(a) + tensor([-0.5772, -1.9635]) +""") + + add_docstr(torch.dist, r""" dist(input, other, p=2) -> Tensor From b41988c71ed7d40af7a314b2049a4b0d5909fed2 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Wed, 29 Aug 2018 10:02:12 -0700 Subject: [PATCH 02/42] Cleanup BUILD_DOCS cmake section (#11000) Summary: Breaking out of https://github.com/pytorch/pytorch/pull/8338 cc mingzhe09088 Yangqing Pull Request resolved: https://github.com/pytorch/pytorch/pull/11000 Differential Revision: D9557474 Pulled By: orionr fbshipit-source-id: 7d84914b67ff37bdb7738f9b7846dfeb5b975c00 --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1009e5a4ec30f7..75b4bf7b4512d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,7 +306,7 @@ if(BUILD_DOCS) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs) - endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) + endif() file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs) configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY) @@ -323,10 +323,10 @@ if(BUILD_DOCS) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Generating Python API documentation with Doxygen" VERBATIM) - else (DOXYGEN_FOUND) + else() message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation") - endif (DOXYGEN_FOUND) -endif (BUILD_DOCS) + endif() +endif() # ---[ CMake related files # Uninistall option. From a9469c9c8ab046a7961c1c357d84f60063507c4b Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 29 Aug 2018 10:48:04 -0700 Subject: [PATCH 03/42] Fill eigenvector with zeros if not required (#10645) Summary: Fix #10345, which only happens in CUDA case. * Instead of returning some random buffer, we fill it with zeros. * update torch.symeig doc. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10645 Reviewed By: soumith Differential Revision: D9395762 Pulled By: ailzhang fbshipit-source-id: 0f3ed9bb6a919a9c1a4b8eb45188f65a68bfa9ba --- aten/src/THC/generic/THCTensorMathMagma.cu | 8 +++++- test/test_cuda.py | 12 +-------- test/test_torch.py | 30 +++++++++++++++------- torch/_torch_docs.py | 9 +++++++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index aee04a8e22a4e4..3b63c3ae1c7b2f 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -235,7 +235,13 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T else if (info < 0) THError("MAGMA syev : Argument %d : illegal value", -info); } - THCTensor_(freeCopyTo)(state, input, rv_); + if (jobzs[0] == 'N') { + // If eigenvector is not needed, fill the result with zeros. + THCTensor_(zero)(state, rv_); + THCTensor_(free)(state, input); + } else { + THCTensor_(freeCopyTo)(state, input, rv_); + } #else THError(NoMagma(syev)); #endif diff --git a/test/test_cuda.py b/test/test_cuda.py index 73ba3880697b1f..088919ad595a9d 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1734,17 +1734,7 @@ def test(use_double=False): @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") def test_symeig(self): - # Small case - tensor = torch.randn(3, 3).cuda() - tensor = torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) - - # Large case - tensor = torch.randn(257, 257).cuda() - tensor = torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) + TestTorch._test_symeig(self, lambda t: t.cuda()) def test_arange(self): for t in ['IntTensor', 'LongTensor', 'FloatTensor', 'DoubleTensor']: diff --git a/test/test_torch.py b/test/test_torch.py index 3bab1927a5de9c..34b8256763c658 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -4279,13 +4279,12 @@ def test_eig(self): Xhat = torch.mm(torch.mm(v, torch.diag(e.select(1, 0))), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') - @skipIfNoLapack - @skipIfRocm - def test_symeig(self): - xval = torch.rand(100, 3) + @staticmethod + def _test_symeig(self, conv_fn): + xval = conv_fn(torch.rand(100, 3)) cov = torch.mm(xval.t(), xval) - rese = torch.zeros(3) - resv = torch.zeros(3, 3) + rese = conv_fn(torch.zeros(3)) + resv = conv_fn(torch.zeros(3, 3)) # First call to symeig self.assertTrue(resv.is_contiguous(), 'resv is not contiguous') @@ -4299,17 +4298,30 @@ def test_symeig(self): ahat = torch.mm(torch.mm(resv, torch.diag(rese)), resv.t()) self.assertEqual(cov, ahat, 1e-8, 'VeV\' wrong') + # test eigenvectors=False + rese2 = conv_fn(torch.zeros(3)) + resv2 = conv_fn(torch.randn(3, 3)) + expected_resv2 = conv_fn(torch.zeros(3, 3)) + torch.symeig(cov.clone(), False, out=(rese2, resv2)) + self.assertEqual(rese, rese2) + self.assertEqual(resv2, expected_resv2) + # test non-contiguous - X = torch.rand(5, 5) + X = conv_fn(torch.rand(5, 5)) X = X.t() * X - e = torch.zeros(4, 2).select(1, 1) - v = torch.zeros(4, 2, 4)[:, 1] + e = conv_fn(torch.zeros(4, 2)).select(1, 1) + v = conv_fn(torch.zeros(4, 2, 4))[:, 1] self.assertFalse(v.is_contiguous(), 'V is contiguous') self.assertFalse(e.is_contiguous(), 'E is contiguous') 
torch.symeig(X, True, out=(e, v)) Xhat = torch.mm(torch.mm(v, torch.diag(e)), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') + @skipIfNoLapack + @skipIfRocm + def test_symeig(self): + self._test_symeig(self, lambda x: x) + @skipIfNoLapack def test_svd(self): a = torch.Tensor(((8.79, 6.11, -9.15, 9.57, -3.49, 9.84), diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 27f111a471fa16..fd70fd20f5f450 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4459,6 +4459,15 @@ def parse_kwargs(desc): upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region out (tuple, optional): the output tuple of (Tensor, Tensor) +Returns: + (Tensor, Tensor): A tuple containing + + - **e** (*Tensor*): Shape :math:`(m)`. Each element is an eigenvalue of ``input``, + The eigenvalues are in ascending order. + - **V** (*Tensor*): Shape :math:`(m \times m)`. + If ``eigenvectors=False``, it's a tensor filled with zeros. + Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. + Examples:: From 1b0d5e60abe8eae3ebaaa3c16eb387314b455d5c Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 29 Aug 2018 10:59:05 -0700 Subject: [PATCH 04/42] Get rid of some unnecessary includes of Context. (#10951) Summary: This is part of splitting Context from what needs to go in ATen/core. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10951 Differential Revision: D9540369 Pulled By: gchanan fbshipit-source-id: 73b0e8c4493785fbab368a989f46137c51f6ea0b --- aten/src/ATen/Formatting.cpp | 1 - aten/src/ATen/StorageImpl.cpp | 1 - aten/src/ATen/UndefinedTensor.cpp | 1 - aten/src/ATen/UndefinedType.h | 1 - 4 files changed, 4 deletions(-) diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp index ef04cc4bdfd975..dcdf7653f2308b 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/Formatting.cpp @@ -1,6 +1,5 @@ #include "ATen/Formatting.h" #include "ATen/Tensor.h" -#include "ATen/Context.h" #include "ATen/TensorMethods.h" #include diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index 233540bfa06f28..af488472f24b5b 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -1,4 +1,3 @@ -#include #include namespace at { diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index 79f58479e90b52..f50a4e71da9cae 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -1,5 +1,4 @@ #include "ATen/UndefinedTensor.h" -#include "ATen/Context.h" #include "ATen/core/Error.h" namespace at { diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 9ca00cfb516ff7..2cb14a3a652c4f 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -1,7 +1,6 @@ #pragma once #include "ATen/Type.h" -#include "ATen/Context.h" #include "ATen/CheckGenerator.h" #ifdef _MSC_VER From 562fc7631ff8b25487c9a3886f57b74bd7008c97 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Wed, 29 Aug 2018 10:59:29 -0700 Subject: [PATCH 05/42] Add test cases for ONNX unsqueeze (#10924) Summary: PyTorch exporting test and end to end cases. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10924 Reviewed By: Ac2zoom Differential Revision: D9548210 Pulled By: houseroad fbshipit-source-id: 2381d1ad92a4e07f97060eb65c9fd09f60ad3de6 --- .../TestOperators.test_unsqueeze.expect | 54 +++++++++++++++++++ test/onnx/test_operators.py | 4 ++ test/onnx/test_pytorch_onnx_caffe2.py | 12 +++++ 3 files changed, 70 insertions(+) create mode 100644 test/onnx/expect/TestOperators.test_unsqueeze.expect diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect new file mode 100644 index 00000000000000..3a8e01092f8d0b --- /dev/null +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -0,0 +1,54 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + input: "0" + output: "1" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 2 + type: INTS + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 7 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index f476cde7afd935..d8e0b6be0d94a9 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -428,6 +428,10 @@ def test_upsample(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., mode='bilinear'), x) + def test_unsqueeze(self): + x = Variable(torch.randn(3, 4), requires_grad=True) + self.assertONNX(lambda x: x.unsqueeze(len(x.shape)), x) + def test_symbolic_override(self): """Lifted from fast-neural-style: custom implementation of instance norm to be mapped to ONNX operator""" diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 9b31d02d6e385d..349e7fc1eec375 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -798,6 +798,18 @@ def test_convtranspose(self): model = nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, padding=1, output_padding=2) self.run_model_test(model, train=False, batch_size=BATCH_SIZE, atol=1e-7) + def test_unsqueeze(self): + shape = (3, 4, 5) + for dim in range(len(shape) + 1): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.unsqueeze(dim) + x = Variable(torch.randn(*shape)) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + # NB: InstanceNorm model includes unused weights, so skip this in TestCaffe2BackendEmbed # TODO: We should have another pass to eliminate the unused initializers in ONNX models. @skipIfEmbed From 206d52d0e3ad4ef547b5bb566cdf7ca2e7c824ae Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Wed, 29 Aug 2018 11:07:29 -0700 Subject: [PATCH 06/42] Disable smart_tensor_printer_test without glog (#10999) Summary: Breaking out of https://github.com/pytorch/pytorch/pull/8338 This test fails once we start building with `-DUSE_GLOG=OFF` since the non-glog logging case doesn't support flushing or streaming to the right location. For now, we just disable this test in that case. 
cc Yangqing mingzhe09088 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10999 Reviewed By: mingzhe09088 Differential Revision: D9557488 Pulled By: orionr fbshipit-source-id: 8b306f210411dfc8ccc404bdccf77ddcd36a4830 --- caffe2/utils/smart_tensor_printer_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc index 82a59ad60aa950..e207f7c7b05284 100644 --- a/caffe2/utils/smart_tensor_printer_test.cc +++ b/caffe2/utils/smart_tensor_printer_test.cc @@ -39,6 +39,9 @@ void printTensorAndCheck(const std::vector& values) { expect_stderr_contains(values); } +// We need real glog for this test to pass +#ifdef CAFFE2_USE_GOOGLE_GLOG + #if !(__APPLE__) // TODO(janusz): thread_local does not work under mac. TEST(SmartTensorPrinterTest, SimpleTest) { @@ -48,4 +51,6 @@ TEST(SmartTensorPrinterTest, SimpleTest) { #endif // !(__APPLE__) +#endif // CAFFE2_USE_GOOGLE_GLOG + } // namespace caffe2 From e0dbb91060f1d9dbc45ae8f37b8613e487e2e4b0 Mon Sep 17 00:00:00 2001 From: Mingzhe Li Date: Wed, 29 Aug 2018 11:26:56 -0700 Subject: [PATCH 07/42] Windows raw string fix (#10998) Summary: Breaking this out of https://github.com/pytorch/pytorch/pull/8338 mingzhe09088's fix of the docstrings for Windows builds. Unfortunately some versions of Windows seem to try and parse the `#` inside the string as a pre-processor declaration. We might need to change this to something else later, but want to get this landed first. cc mingzhe09088 Yangqing Pull Request resolved: https://github.com/pytorch/pytorch/pull/10998 Reviewed By: mingzhe09088 Differential Revision: D9557480 Pulled By: orionr fbshipit-source-id: c6a6237c27b7cf35c81133fd9faefead675a9f59 --- caffe2/operators/concat_split_op.cc | 4 ++-- caffe2/operators/conv_op.cc | 10 +++++----- caffe2/operators/conv_transpose_op.cc | 10 +++++----- caffe2/operators/counter_ops.cc | 10 +++++----- caffe2/operators/cross_entropy_op.cc | 20 +++++++++---------- caffe2/operators/distance_op.cc | 20 +++++++++---------- caffe2/operators/elementwise_linear_op.cc | 12 +++++------ caffe2/operators/elementwise_logical_ops.cc | 4 ++-- caffe2/operators/elementwise_sum_op.cc | 2 +- caffe2/operators/filler_op.cc | 8 ++++---- caffe2/operators/fully_connected_op.cc | 16 +++++++-------- caffe2/operators/gather_op.cc | 2 +- .../local_response_normalization_op.cc | 2 +- caffe2/operators/lp_pool_op.cc | 2 +- caffe2/operators/lpnorm_op.cc | 2 +- caffe2/operators/pool_op.cc | 4 ++-- caffe2/operators/reduction_ops.cc | 16 +++++++-------- caffe2/operators/relu_op.cc | 2 +- caffe2/operators/sparse_to_dense_mask_op.cc | 4 ++-- caffe2/operators/sparse_to_dense_op.cc | 2 +- caffe2/operators/stats_ops.cc | 4 ++-- caffe2/operators/utility_ops.cc | 12 +++++------ 22 files changed, 84 insertions(+), 84 deletions(-) diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index a8f4c91e7e5404..31256026028dfa 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -311,8 +311,8 @@ op = core.CreateOperator( axis=3 ) -workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW -workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW +workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW +workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW print("X1:", workspace.FetchBlob("X1")) print("X2:", workspace.FetchBlob("X2")) workspace.RunOperatorOnce(op) diff --git 
a/caffe2/operators/conv_op.cc b/caffe2/operators/conv_op.cc index 082c94fb6c18fb..30fb79d3846942 100644 --- a/caffe2/operators/conv_op.cc +++ b/caffe2/operators/conv_op.cc @@ -42,24 +42,24 @@ op = core.CreateOperator( stride=2 ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(1,1,8,8).astype(np.float32) print("Data shape: ",data.shape) -# Create W: (M,C,Kh,Kw) +// Create W: (M,C,Kh,Kw) filters = np.random.randn(3,1,5,5).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.,1.,1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/conv_transpose_op.cc b/caffe2/operators/conv_transpose_op.cc index 57ec02b63ea0dd..7de16afaed9158 100644 --- a/caffe2/operators/conv_transpose_op.cc +++ b/caffe2/operators/conv_transpose_op.cc @@ -44,24 +44,24 @@ op = core.CreateOperator( strides=[2,2] ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(2,3,5,5).astype(np.float32) print("Data shape: ",data.shape) -# Create filter: (M,C,Kh,Kw) +// Create filter: (M,C,Kh,Kw) filters = np.random.randn(3,1,2,2).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/counter_ops.cc b/caffe2/operators/counter_ops.cc index 15cdab5849cc1f..50e4b9448af310 100644 --- a/caffe2/operators/counter_ops.cc +++ b/caffe2/operators/counter_ops.cc @@ -58,22 +58,22 @@ resetcounter_op = core.CreateOperator( ) -# Create counter +// Create counter workspace.RunOperatorOnce(createcounter_op) print("'counter' pointer:", workspace.FetchBlob("counter")) -# Retrieve initial counter value +// Retrieve initial counter value workspace.RunOperatorOnce(retrievecount_op) print("Initial 'count':", workspace.FetchBlob("count")) -# Check if counter is done +// Check if counter is done workspace.RunOperatorOnce(checkcounterdone_op) print("Initial 'done' value:", workspace.FetchBlob("done")) -# Test CountUp operator +// Test CountUp operator print("\nTesting CountUp operator...") for i in range(5): workspace.RunOperatorOnce(countup_op) @@ -83,7 +83,7 @@ workspace.RunOperatorOnce(retrievecount_op) print("'count' value after CountUp test:", workspace.FetchBlob("count")) -# Test CountDown operator +// Test CountDown operator print("\nTesting CountDown operator...") for i in range(11): workspace.RunOperatorOnce(countdown_op) diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc index 584b7abd5a183f..0473e7d4e435b3 100644 --- a/caffe2/operators/cross_entropy_op.cc +++ b/caffe2/operators/cross_entropy_op.cc @@ -401,22 +401,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors 
+// Create label: Sample 1-hot ground truth label vectors label = np.array([4,2]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -635,22 +635,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors +// Create label: Sample 1-hot ground truth label vectors label = np.array([[0.,0.,0.,0.,1.],[0.,0.,1.,0.,0.]]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index d9abfa0e254336..9a38a4a77a0043 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -437,22 +437,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = 5*np.ones((1, 4)) print("X:\n",X) -# Create Y +// Create Y Y = np.ones((1, 4)) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` @@ -645,22 +645,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = np.random.randn(3, 3) print("X:\n",X) -# Create Y +// Create Y Y = np.random.randn(3, 3) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` diff --git a/caffe2/operators/elementwise_linear_op.cc b/caffe2/operators/elementwise_linear_op.cc index d68bfbc5a0eb93..371aae78a25201 100644 --- a/caffe2/operators/elementwise_linear_op.cc +++ b/caffe2/operators/elementwise_linear_op.cc @@ -112,28 +112,28 @@ op = core.CreateOperator( ["Y"] ) -# Create X +// Create X X = np.array([[1,2,3,4,5],[6,8,9,16,10]]) print("X:\n",X) -# Create w +// Create w w = np.array([1,1/2.,1/3.,1/4.,1/5.]) print("w:\n",w) -# Create b +// Create b b = np.array([1.,1.,1.,1.,1.]) print("b:\n",b) -# Feed X & w & b into workspace +// Feed X & w & b into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("w", w.astype(np.float32)) workspace.FeedBlob("b", b.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/elementwise_logical_ops.cc b/caffe2/operators/elementwise_logical_ops.cc index 5ddd4570356e9d..0e2da569dcb11f 100644 --- a/caffe2/operators/elementwise_logical_ops.cc +++ b/caffe2/operators/elementwise_logical_ops.cc @@ -63,7 +63,7 @@ op = core.CreateOperator( value=[0,2,4,6,8], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("X", 
np.array([0,1,2,3,4,5,6,7,8]).astype(np.int32)) print("X:\n", workspace.FetchBlob("X")) @@ -75,7 +75,7 @@ print("Y: \n", workspace.FetchBlob("Y")) **Result** ``` -# value=[0,2,4,6,8] +// value=[0,2,4,6,8] X: [0 1 2 3 4 5 6 7 8] diff --git a/caffe2/operators/elementwise_sum_op.cc b/caffe2/operators/elementwise_sum_op.cc index 861f4f115c0a41..dee3671f5bdc4a 100644 --- a/caffe2/operators/elementwise_sum_op.cc +++ b/caffe2/operators/elementwise_sum_op.cc @@ -86,7 +86,7 @@ workspace.ResetWorkspace() op = core.CreateOperator( "Sum", ["A", "B"], - ["A"], # inplace + ["A"], // inplace ) workspace.FeedBlob("A", np.array([[1,2,5],[8,3,4]]).astype(np.float32)) diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc index ff3eac217390a4..c5a121e3a222d6 100644 --- a/caffe2/operators/filler_op.cc +++ b/caffe2/operators/filler_op.cc @@ -298,11 +298,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13.8, dtype=np.float32)) @@ -389,11 +389,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13, dtype=np.int32)) diff --git a/caffe2/operators/fully_connected_op.cc b/caffe2/operators/fully_connected_op.cc index 6fe95eefbac476..e14fec6f8464b8 100644 --- a/caffe2/operators/fully_connected_op.cc +++ b/caffe2/operators/fully_connected_op.cc @@ -182,9 +182,9 @@ Github Links: ``` -# In this example, our batch size is 1 (M=1), the input observation will have -# 6 features (K=6), and the layer will have one hidden node (N=1). The -# expected output is Y=7. +// In this example, our batch size is 1 (M=1), the input observation will have +// 6 features (K=6), and the layer will have one hidden node (N=1). The +// expected output is Y=7. 
workspace.ResetWorkspace() op = core.CreateOperator( @@ -193,23 +193,23 @@ op = core.CreateOperator( ["Y"] ) -# Create X: MxK +// Create X: MxK data = np.array([1,2,3,4,5,6]).astype(np.float32) data = data[np.newaxis,:] -# Create W: NxK +// Create W: NxK weights = np.array(np.array([1,1/2.,1/3.,1/4.,1/5.,1/6.])).astype(np.float32) weights = weights[np.newaxis,:] -# Create b: N +// Create b: N bias = np.array([1.]).astype(np.float32) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("W", weights) workspace.FeedBlob("b", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/gather_op.cc b/caffe2/operators/gather_op.cc index cee268ddafdcbd..34c42bfc983f84 100644 --- a/caffe2/operators/gather_op.cc +++ b/caffe2/operators/gather_op.cc @@ -37,7 +37,7 @@ print("DATA:\n",data) inds = np.array([[0, 1],[1, 2]]) print("INDICES:\n",inds) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("DATA", data.astype(np.float32)) workspace.FeedBlob("INDICES", inds.astype(np.int32)) diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc index 1cba60e86d9787..81499b4a5d6abf 100644 --- a/caffe2/operators/local_response_normalization_op.cc +++ b/caffe2/operators/local_response_normalization_op.cc @@ -342,7 +342,7 @@ op = core.CreateOperator("LRN", order="NHWC" ) -workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc index f877786648350b..f39aaaa6397a3e 100644 --- a/caffe2/operators/lp_pool_op.cc +++ b/caffe2/operators/lp_pool_op.cc @@ -258,7 +258,7 @@ op = core.CreateOperator( p=2.0 ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc index 6af404d1153588..79c35cd83a2148 100644 --- a/caffe2/operators/lpnorm_op.cc +++ b/caffe2/operators/lpnorm_op.cc @@ -100,7 +100,7 @@ op = core.CreateOperator( X = np.array([5., 2.]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/pool_op.cc b/caffe2/operators/pool_op.cc index eca7978e024aac..87d67b17e2b6ce 100644 --- a/caffe2/operators/pool_op.cc +++ b/caffe2/operators/pool_op.cc @@ -764,7 +764,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) @@ -832,7 +832,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", 
workspace.FetchBlob("Y")) diff --git a/caffe2/operators/reduction_ops.cc b/caffe2/operators/reduction_ops.cc index 0d01d50ca000e3..95f15b56a720e9 100644 --- a/caffe2/operators/reduction_ops.cc +++ b/caffe2/operators/reduction_ops.cc @@ -139,17 +139,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -226,17 +226,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/relu_op.cc b/caffe2/operators/relu_op.cc index 03205241efc3e1..0f1abd82396156 100644 --- a/caffe2/operators/relu_op.cc +++ b/caffe2/operators/relu_op.cc @@ -105,7 +105,7 @@ op = core.CreateOperator( ["Y"] ) -workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc index bea0b43d751ccf..d968112c9ecc2d 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.cc +++ b/caffe2/operators/sparse_to_dense_mask_op.cc @@ -48,8 +48,8 @@ vector and `values` tensor into a compacted tensor where the first dimension corresponds to each id provided in mask argument. Missing values are filled with the value of `default_value`. After running this op: - output[j, :] = values[i] # where mask[j] == indices[i] - output[j, ...] = default_value # when mask[j] doesn't appear in indices + output[j, :] = values[i] // where mask[j] == indices[i] + output[j, ...] = default_value // when mask[j] doesn't appear in indices If `lengths` is provided and not empty, and extra "batch" dimension is prepended to the output. diff --git a/caffe2/operators/sparse_to_dense_op.cc b/caffe2/operators/sparse_to_dense_op.cc index 4f6a49796df826..0c9519e6576122 100644 --- a/caffe2/operators/sparse_to_dense_op.cc +++ b/caffe2/operators/sparse_to_dense_op.cc @@ -23,7 +23,7 @@ representation. After running this op: - output[indices[i], :] += values[i] # sum over all indices[i] equal to the index + output[indices[i], :] += values[i] // sum over all indices[i] equal to the index output[j, ...] 
= 0 if j not in indices )DOC") .Input(0, "indices", "1-D int32/int64 tensor of concatenated ids of data") diff --git a/caffe2/operators/stats_ops.cc b/caffe2/operators/stats_ops.cc index 508dd1ae82060a..d07f9cace13636 100644 --- a/caffe2/operators/stats_ops.cc +++ b/caffe2/operators/stats_ops.cc @@ -290,7 +290,7 @@ timergetandend_op = core.CreateOperator( ["nanos"] ) -# Test TimerBegin/TimerGet/TimerEnd +// Test TimerBegin/TimerGet/TimerEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timerget_op) @@ -298,7 +298,7 @@ print("nanos:", workspace.FetchBlob("nanos")) workspace.RunOperatorOnce(timerend_op) -# Test TimerBegin/TimerGetAndEnd +// Test TimerBegin/TimerGetAndEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timergetandend_op) diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index cc7c037a6d332d..eb771974fbf397 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -103,17 +103,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([2,2,2,2,2,2,2,2,2,2]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -508,14 +508,14 @@ op = core.CreateOperator( ["has_elements"], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("tensor", np.random.randn(2, 2).astype(np.float32)) print("tensor:\n", workspace.FetchBlob("tensor")) workspace.RunOperatorOnce(op) print("has_elements: ", workspace.FetchBlob("has_elements"),"\n") -# Use an empty tensor +// Use an empty tensor workspace.FeedBlob("tensor", np.empty(0)) print("tensor:\n", workspace.FetchBlob("tensor")) From 525548fb64308271bd5248598e398eb4035e25f3 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 29 Aug 2018 11:48:49 -0700 Subject: [PATCH 08/42] Move SparseTensorRef to core, change some includes to core. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10964 Differential Revision: D9545021 Pulled By: gchanan fbshipit-source-id: 8ba7e5e3a7bdf24e5aeb4bbc91957c1a6f14d7f0 --- aten/src/ATen/DeviceGuard.h | 4 ++-- aten/src/ATen/TensorOptions.h | 4 ++-- aten/src/ATen/{ => core}/SparseTensorRef.h | 0 aten/src/ATen/native/LegacyBridge.cpp | 2 +- aten/src/ATen/templates/Tensor.h | 6 +++--- aten/src/ATen/templates/TensorMethods.h | 2 +- aten/src/ATen/templates/Type.cpp | 2 +- aten/src/ATen/templates/Type.h | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) rename aten/src/ATen/{ => core}/SparseTensorRef.h (100%) diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index 7adddfca27c9eb..b51d80d22d350f 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include #include #include #include diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index c8717689833408..a598290485196d 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -2,10 +2,10 @@ #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/core/SparseTensorRef.h similarity index 100% rename from aten/src/ATen/SparseTensorRef.h rename to aten/src/ATen/core/SparseTensorRef.h diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 5b73a09ad9b004..07d7e46ff79a56 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 28e8e5381f2933..f426c6753adc36 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,11 +2,11 @@ // ${generated_comment} -#include "ATen/Device.h" +#include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" -#include "ATen/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/TensorAccessor.h" #include "ATen/TensorBase.h" diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 214a5d18316588..e52c597b99eeb7 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -4,7 +4,7 @@ #include "ATen/Tensor.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Type.h" namespace at { diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 40621a9be6e08b..5e5995b9523ba9 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -5,7 +5,7 @@ #include "ATen/ExpandUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorOptions.h" diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index d4972d87a6dfd9..b000029e789ca6 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -3,13 +3,13 @@ // ${generated_comment} #include "ATen/core/ATenGeneral.h" -#include "ATen/Allocator.h" +#include "ATen/core/Allocator.h" #include 
"ATen/core/Deprecated.h" #include "ATen/core/Generator.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" #include "ATen/core/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" From 396dec0e3740fad00461bc0ebcdfae09708693c6 Mon Sep 17 00:00:00 2001 From: zou3519 Date: Wed, 29 Aug 2018 12:02:34 -0700 Subject: [PATCH 09/42] s/spaerse/sparse (#10968) Summary: cc SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/10968 Differential Revision: D9546746 Pulled By: zou3519 fbshipit-source-id: a6a4bb8bb04eccf89c3d90a90259070beb484500 --- torch/_torch_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index fd70fd20f5f450..a9db54d3117842 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4137,7 +4137,7 @@ def parse_kwargs(desc): Constructs a sparse tensors in COO(rdinate) format with non-zero elements at the given :attr:`indices` with the given :attr:`values`. A sparse tensor can be `uncoalesced`, in that case, there are duplicate coordinates in the indices, and the value at that index is the sum of all duplicate value entries: -`torch.spaerse`_. +`torch.sparse`_. Args: indices (array_like): Initial data for the tensor. Can be a list, tuple, From 4e446b85fb5e0b5db0951cc068e423d9caf5beef Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 29 Aug 2018 12:14:46 -0700 Subject: [PATCH 10/42] Make profiler.build_table() O(n) rather than O(n^2) (#10969) Summary: Fixes #10851 Speeds up profiling results dramatically. For the following script: ``` import torch import time ITER = 2000 x = torch.randn(1, 1, requires_grad=True) with torch.autograd.profiler.profile() as prof: y = x for i in range(ITER): y = 3 * y - 2 * y y.backward() start = time.time() print("Done running. Preparing prof") x = str(prof) print("Done preparing prof results") end = time.time() print("Elapsed: {}".format(end - start)) ``` I get 7s before / 0.13s after these changes. cc apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/10969 Differential Revision: D9556129 Pulled By: zou3519 fbshipit-source-id: 26b421686f8a42cdaace6382567d403e6385dc12 --- torch/autograd/profiler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 75e309ac0faf06..c1be47ad494397 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -554,11 +554,11 @@ def build_table(events, sort_by=None, header=None): header_sep = '-' * max_name_length + (' ' + '-' * col_width) * 5 # Have to use a list because nonlocal is Py3 only... - result = [''] + result = [] def append(s): - result[0] += s - result[0] += '\n' + result.append(s) + result.append('\n') # Yes, newline after the end as well # Actual printing if header is not None: @@ -572,4 +572,4 @@ def append(s): append(row_format.format(evt.key, evt.cpu_time_str, evt.cuda_time_str, evt.count, evt.cpu_time_total_str, evt.cuda_time_total_str)) - return result[0] + return ''.join(result) From bed9d41abd27ec991dd66d1b24da22f1a1323033 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 29 Aug 2018 12:35:51 -0700 Subject: [PATCH 11/42] Generate Type::registerCPU as we do register_cuda_types. (#10947) Summary: The goal here is to separate out the base Type into core; as it was done previously we need all derived Types to be defined when we compile the base Type. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10947 Reviewed By: gchanan Differential Revision: D9540025 Pulled By: ezyang fbshipit-source-id: 49f0b5acb3c378348ef3a55780abb73e4ae27edd --- aten/src/ATen/Context.cpp | 3 ++- aten/src/ATen/Context.h | 1 + aten/src/ATen/gen.py | 9 ++++++++- aten/src/ATen/templates/RegisterCPU.cpp | 20 ++++++++++++++++++++ aten/src/ATen/templates/RegisterCPU.h | 10 ++++++++++ aten/src/ATen/templates/Type.cpp | 12 ------------ aten/src/ATen/templates/Type.h | 1 - 7 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 aten/src/ATen/templates/RegisterCPU.cpp create mode 100644 aten/src/ATen/templates/RegisterCPU.h diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index f85996f74c4b76..a2c3fb40a7d415 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -9,6 +9,7 @@ #include #include "ATen/CPUGenerator.h" +#include "ATen/RegisterCPU.h" #ifdef USE_SSE3 #include @@ -34,7 +35,7 @@ Context::Context() generator_registry[static_cast(DeviceType::CPU)] .reset(new CPUGenerator(this)); - Type::registerCPU(this); + register_cpu_types(this); } // TODO: This could be bad juju if someone calls globalContext() in the diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index f2b3a452cfed57..5584963fefe57f 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -114,6 +114,7 @@ class AT_API Context { std::atomic next_id; std::unique_ptr thc_state; friend struct Type; + friend void register_cpu_types(Context * context); friend void register_cuda_types(Context * context); }; diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f859edd3ede3a..53879e56ffb342 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -109,6 +109,9 @@ def check_all_files_written(self): TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") TYPE_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.cpp") +REGISTER_CPU_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.h") +REGISTER_CPU_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.cpp") + REGISTER_CUDA_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.h") REGISTER_CUDA_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.cpp") @@ -340,7 +343,8 @@ def iterate_types(): def declare_outputs(): files = ['Declarations.yaml', 'Type.h', 'Type.cpp', 'Tensor.h', 'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h'] + 'CPUCopy.cpp', 'NativeFunctions.h', + 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: file_manager.will_write(f) cuda_files = ['CUDACopy.cpp', 'RegisterCUDA.cpp', 'RegisterCUDA.h'] @@ -409,6 +413,9 @@ def generate_outputs(): file_manager.write('Type.h', TYPE_H, top_env) file_manager.write('Type.cpp', TYPE_CPP, top_env) + file_manager.write('RegisterCPU.h', REGISTER_CPU_H, top_env) + file_manager.write('RegisterCPU.cpp', REGISTER_CPU_CPP, top_env) + cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) diff --git a/aten/src/ATen/templates/RegisterCPU.cpp b/aten/src/ATen/templates/RegisterCPU.cpp new file mode 100644 index 00000000000000..184af2c8c014da --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.cpp @@ -0,0 +1,20 @@ +#include + +// ${generated_comment} + +#include +#include +#include +#include + +${cpu_type_headers} + +namespace at { + +void register_cpu_types(Context * context) { + ${cpu_type_registrations} + context->type_registry[static_cast(Backend::Undefined)] + 
[static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); +} + +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCPU.h b/aten/src/ATen/templates/RegisterCPU.h new file mode 100644 index 00000000000000..b923c180aac805 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.h @@ -0,0 +1,10 @@ +#pragma once + +// ${generated_comment} + +namespace at { + +class Context; +void register_cpu_types(Context * context); + +} // namespace at diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 5e5995b9523ba9..90dbbb810ee30d 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -9,22 +9,10 @@ #include "ATen/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorOptions.h" -#include "ATen/UndefinedType.h" #include "ATen/DeviceGuard.h" -#include - -#include -${cpu_type_headers} - namespace at { -void Type::registerCPU(Context * context) { - ${cpu_type_registrations} - context->type_registry[static_cast(Backend::Undefined)] - [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); -} - Tensor & Type::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { Tensor b_src; std::tie(b_src) = expand_inplace(self, src, "copy"); diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index b000029e789ca6..884bd3a3bdff76 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -56,7 +56,6 @@ struct AT_API Type { virtual bool is_distributed() const = 0; bool is_variable() const noexcept { return is_variable_; } bool is_undefined() const noexcept { return is_undefined_; } - static void registerCPU(Context * context); virtual Storage storage(bool resizable = false) const = 0; virtual Storage storage(size_t size, bool resizable = false) const = 0; virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; From dbce1c840f36621fa12bb7917123e475c1345341 Mon Sep 17 00:00:00 2001 From: Yanghan Wang Date: Wed, 29 Aug 2018 12:40:04 -0700 Subject: [PATCH 12/42] exposing net_transformer_fun before add grad (#11003) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11003 Need a interface to re-write the graph after the net is built and after adding gradient ops. Reviewed By: aazzolini, harouwu Differential Revision: D9557827 fbshipit-source-id: 2e082f0321c0776e488a29e18047d950948e7c37 --- caffe2/python/data_parallel_model.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 60e5c39bed1318..ae169eef2e6480 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -44,6 +44,7 @@ def Parallelize( param_update_builder_fun=None, optimizer_builder_fun=None, post_sync_builder_fun=None, + pre_grad_net_transformer_fun=None, net_transformer_fun=None, devices=None, rendezvous=None, @@ -91,6 +92,11 @@ def Parallelize( Signature: net_transformer_fun( model, num_devices, device_prefix, device_type) + pre_grad_net_transformer_fun: + Optional function to transform the network similar to + net_transformer_fun, but happens before gradient ops + been add. 
+ Signature: pre_grad_net_transformer_fun(model) post_sync_builder_fun: Function applied after initial parameter sync has been completed, such as keeping multi-precision parameters @@ -234,6 +240,9 @@ def Parallelize( model_helper_obj._computed_param_names =\ list(viewkeys(computed_params_grouped)) + if pre_grad_net_transformer_fun: + pre_grad_net_transformer_fun(model_helper_obj) + if has_parameter_updates: log.info("Adding gradient operators") _AddGradientOperators(devices, model_helper_obj, losses_by_gpu) From ec519e8a4abf8c327fdceb395e2d718955a44e8f Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 12:50:02 -0700 Subject: [PATCH 13/42] Reduce number of elements within test_abs Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10997 Differential Revision: D9556861 Pulled By: cpuhrsch fbshipit-source-id: 986ef275e94fcffcc04a5c1103b8b7bfb4ae3ba5 --- test/test_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_torch.py b/test/test_torch.py index 34b8256763c658..5167ac618bba75 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6107,7 +6107,7 @@ def _test_abs(tensors_dict): _test_abs(self._make_tensors((3, 5, 7), val_range=(0, max_val))) _test_abs(self._make_tensors((2, 2, 5, 8, 2, 3), val_range=(0, max_val))) _test_abs(self._make_tensors((1000, ), val_range=(0, max_val))) - _test_abs(self._make_tensors((30, 30, 30), val_range=(0, max_val))) + _test_abs(self._make_tensors((10, 10, 10), val_range=(0, max_val))) # Checking that the right abs function is called for LongTensor bignumber = 2 ^ 31 + 1 From fa7c81c6403632153412320754ad51ad3b1f58b0 Mon Sep 17 00:00:00 2001 From: Duc Ngo Date: Wed, 29 Aug 2018 12:54:01 -0700 Subject: [PATCH 14/42] nomnigraph - nit - code style update (#10987) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10987 some code style update to make it consistent with fb cpp style Reviewed By: yinghai Differential Revision: D9550130 fbshipit-source-id: 6aef9878676c08e7d384383c95e7ba8c5c9a1bce --- .../nomnigraph/Representations/Compiler.h | 22 ++++----- .../nomnigraph/Representations/ControlFlow.h | 40 ++++++++-------- .../nomnigraph/Representations/NeuralNet.h | 48 ++++++++++--------- .../include/nomnigraph/Support/Common.h | 20 ++++---- 4 files changed, 66 insertions(+), 64 deletions(-) diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h index 8560ff82374d9a..8c24a2e2cb1076 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h @@ -11,15 +11,15 @@ namespace repr { class CAFFE2_API Value { public: enum class ValueKind { Value, Instruction, Data }; - Value(ValueKind K) : Kind(K) {} - Value() : Kind(ValueKind::Value) {} + Value(ValueKind K) : kind_(K) {} + Value() : kind_(ValueKind::Value) {} ValueKind getKind() const { - return Kind; + return kind_; } virtual ~Value() = default; private: - const ValueKind Kind; + const ValueKind kind_; }; class CAFFE2_API Data : public Value { @@ -30,15 +30,15 @@ class CAFFE2_API Data : public Value { } virtual ~Data() = default; size_t getVersion() const { - return Version; + return version_; } void setVersion(size_t version) { - Version = version; + version_ = version; } private: - size_t Version = 0; + size_t version_ = 0; }; class CAFFE2_API Instruction : public Value { @@ -52,18 +52,18 @@ class CAFFE2_API Instruction : public 
Value { TerminatorEnd, Phi }; - Instruction() : Value(ValueKind::Instruction), Op(Opcode::Generic) {} - Instruction(Opcode op) : Value(ValueKind::Instruction), Op(op) {} + Instruction() : Value(ValueKind::Instruction), op_(Opcode::Generic) {} + Instruction(Opcode op) : Value(ValueKind::Instruction), op_(op) {} CAFFE2_API static bool classof(const Value* V) { return V->getKind() == ValueKind::Instruction; } virtual ~Instruction() = default; Opcode getOpcode() const { - return Op; + return op_; } private: - Opcode Op; + Opcode op_; }; class CAFFE2_API Terminator : public Instruction { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h index 835f187febf15d..1934b1f1b7bad4 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h @@ -19,45 +19,45 @@ class CAFFE2_API BasicBlock { using NodeRef = typename Subgraph::NodeRef; BasicBlock() {} ~BasicBlock() { - for (auto pair : callbacks) { + for (auto pair : callbacks_) { pair.first->deleteDestructorCallback(pair.second); } } void trackNode(NodeRef node) { - callbacks[node] = node->registerDestructorCallback([&](NodeRef n) { + callbacks_[node] = node->registerDestructorCallback([&](NodeRef n) { assert( hasInstruction(n) && "Destructor callback invoked on untracked node in BasicBlock."); deleteInstruction(n); }); - Nodes.addNode(node); + nodes_.addNode(node); } void untrackNode(NodeRef node) { - callbacks.erase(node); - Nodes.removeNode(node); + callbacks_.erase(node); + nodes_.removeNode(node); } void pushInstructionNode(NodeRef node) { assert( isa(node->data()) && "Cannot push non-instruction node to basic block."); - Instructions.emplace_back(node); + instructions_.emplace_back(node); trackNode(node); } const std::vector& getInstructions() { - return Instructions; + return instructions_; } bool hasInstruction(NodeRef instr) const { - return Nodes.hasNode(instr); + return nodes_.hasNode(instr); } void insertInstructionBefore(NodeRef newInstr, NodeRef instr) { auto it = - std::find(std::begin(Instructions), std::end(Instructions), instr); - Instructions.insert(it, newInstr); + std::find(std::begin(instructions_), std::end(instructions_), instr); + instructions_.insert(it, newInstr); trackNode(newInstr); } @@ -65,28 +65,28 @@ class CAFFE2_API BasicBlock { assert(hasInstruction(instr1) && "Instruction not in basic block."); assert(hasInstruction(instr2) && "Instruction not in basic block."); auto it1 = - std::find(std::begin(Instructions), std::end(Instructions), instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr1); auto it2 = - std::find(std::begin(Instructions), std::end(Instructions), instr2); - Instructions.erase(it1); - Instructions.insert(it2, instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr2); + instructions_.erase(it1); + instructions_.insert(it2, instr1); } void deleteInstruction(NodeRef instr) { assert(hasInstruction(instr) && "Instruction not in basic block."); - Instructions.erase( - std::remove(Instructions.begin(), Instructions.end(), instr), - Instructions.end()); + instructions_.erase( + std::remove(instructions_.begin(), instructions_.end(), instr), + instructions_.end()); untrackNode(instr); } private: - Subgraph Nodes; - std::vector Instructions; + Subgraph nodes_; + std::vector instructions_; // Because we reference a dataflow graph, we need to register callbacks 
// for when the dataflow graph is modified. std::unordered_map>::Callback*> - callbacks; + callbacks_; }; using Program = Graph; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 1f7e2c27906c99..b1e9283bc9ccee 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -45,19 +45,19 @@ class CAFFE2_API Annotation { public: enum class AnnotationKind { Generic, Caffe2 }; - Annotation(AnnotationKind K) : Kind(K) {} - Annotation() : Kind(AnnotationKind::Generic) {} + Annotation(AnnotationKind kind) : kind_(kind) {} + Annotation() : kind_(AnnotationKind::Generic) {} virtual ~Annotation() {} AnnotationKind getKind() const { - return Kind; + return kind_; } Annotation(const Annotation&) = delete; Annotation& operator=(Annotation&) = delete; private: - const AnnotationKind Kind; + const AnnotationKind kind_; }; class CAFFE2_API NeuralNetOperator : public Instruction { @@ -75,36 +75,38 @@ class CAFFE2_API NeuralNetOperator : public Instruction { enum class NNLayout { Undefined, NCHW, NHWC }; NeuralNetOperator(NNKind K, Opcode I, NNLayout L) - : Instruction(I), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K, Opcode I) - : Instruction(I), Kind(K), Layout(NNLayout::Undefined) {} - NeuralNetOperator(NNKind K, NNLayout L) : Instruction(), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(NNLayout::Undefined) {} + NeuralNetOperator(NNKind K, NNLayout L) + : Instruction(), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K) - : Instruction(), Kind(K), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(K), layout_(NNLayout::Undefined) {} NeuralNetOperator() - : Instruction(), Kind(NNKind::Undefined), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(NNKind::Undefined), layout_(NNLayout::Undefined) {} NNKind getKind() const { - return Kind; + return kind_; } void setLayout(NNLayout L) { - Layout = L; + layout_ = L; } NNLayout getLayout() const { - return Layout; + return layout_; } void setAnnotation(std::unique_ptr extraAnnotation) { - ExtraAnnotation = std::move(extraAnnotation); + extraAnnotation_ = std::move(extraAnnotation); } const Annotation* getAnnotation() const { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } + Annotation* getMutableAnnotation() { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } const std::string getName() const; @@ -128,9 +130,9 @@ class CAFFE2_API NeuralNetOperator : public Instruction { NeuralNetOperator& operator=(NeuralNetOperator&) = delete; private: - const NNKind Kind; - NNLayout Layout; // Mutable attribute, much like a type cast - std::unique_ptr ExtraAnnotation; + const NNKind kind_; + NNLayout layout_; // Mutable attribute, much like a type cast + std::unique_ptr extraAnnotation_; }; class CAFFE2_API NeuralNetData : public Data { @@ -138,12 +140,12 @@ class CAFFE2_API NeuralNetData : public Data { /// Discriminator for LLVM-style RTTI (isa<>) enum class NNDataKind { Generic, Tensor }; - NeuralNetData(NNDataKind kind) : Kind(kind) {} + NeuralNetData(NNDataKind kind) : kind_(kind) {} - NeuralNetData() : Kind(NNDataKind::Generic) {} + NeuralNetData() : kind_(NNDataKind::Generic) {} NNDataKind getKind() const { - return Kind; + return kind_; } virtual NeuralNetData* clone() = 0; @@ -153,8 +155,8 @@ class CAFFE2_API NeuralNetData : public Data { virtual 
~NeuralNetData() = 0; private: - NNDataKind Kind; - size_t Version = 0; + NNDataKind kind_; + size_t version_ = 0; }; class CAFFE2_API Tensor : public NeuralNetData { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h index cef1bdec522a56..91e4c2f6e01e87 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h @@ -71,13 +71,13 @@ class Notifier { Notifier() {} Callback* registerDestructorCallback(Callback fn) { - DtorCallbacks.emplace_back(fn); - return &DtorCallbacks.back(); + dtorCallbacks_.emplace_back(fn); + return &dtorCallbacks_.back(); } Callback* registerNotificationCallback(Callback fn) { - NotifCallbacks.emplace_back(fn); - return &NotifCallbacks.back(); + notifCallbacks_.emplace_back(fn); + return ¬ifCallbacks_.back(); } void deleteCallback(std::list& callbackList, Callback* toDelete) { @@ -90,11 +90,11 @@ class Notifier { } void deleteDestructorCallback(Callback* c) { - deleteCallback(DtorCallbacks, c); + deleteCallback(dtorCallbacks_, c); } void deleteNotificationCallback(Callback* c) { - deleteCallback(NotifCallbacks, c); + deleteCallback(notifCallbacks_, c); } /// \brief Notifies all listeners (`registerNotificationCallback` @@ -102,20 +102,20 @@ class Notifier { /// is encoded in the state of the derived class, thus only passing /// a pointer of type T* to the callback. void notify() { - for (auto callback : NotifCallbacks) { + for (auto callback : notifCallbacks_) { callback(reinterpret_cast(this)); } } virtual ~Notifier() { - for (auto callback : DtorCallbacks) { + for (auto callback : dtorCallbacks_) { callback(reinterpret_cast(this)); } } private: - std::list DtorCallbacks; - std::list NotifCallbacks; + std::list dtorCallbacks_; + std::list notifCallbacks_; }; #endif /* NOM_SUPPORT_COMMON_H */ From 56539f5fe1618ea93f733e196710e8c424f549db Mon Sep 17 00:00:00 2001 From: Teng Li Date: Wed, 29 Aug 2018 12:54:55 -0700 Subject: [PATCH 15/42] PT1 Distributed Release MileStone No.1 - Completed Distributed Package and CI tests (#10871) Summary: The PR includes: (1) torch.distributed.c10d, which now includes the complete backward compatible frontend API for `torch.distributed` (2) `env://` init method functionality (3) Minor change to `test_distributed.py`, which is now a test for `torch.distributed.c10d`. (4) The old `test_distributed.py' is now moved to `test_distributed_thd` (5) Miscellaneous bug fixes. (6) DDP CPU test is removed since c10d doesn't have this support yet, but this is a very easy test after moving DDP CPU's dependency to torch.distributed.c10d. (7) CI config to test MPI, NCCL, and Gloo backend of c10d **Now all the distributed test including c10d DDP can pass with the c10d frontend API** TODO: (in a separate PR) MPI subgroup support, once this is added, CI group test will be enabled. 
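As a quick illustration of item (2), here is a minimal sketch of how a worker process comes up through the `env://` init method, assuming the harness has already exported the usual MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE variables (this mirrors the call made in test/test_distributed.py below; the backend string is a placeholder for whichever backend was built in):

import os
import torch.distributed.c10d as dist

# Assumes MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE were exported by the
# launcher before spawning this process, as the CI harness in this patch does.
dist.init_process_group(
    backend="gloo",                               # or "nccl" / "mpi" when compiled in
    init_method="env://",
    world_size=int(os.environ["WORLD_SIZE"]),
    rank=int(os.environ["RANK"]),
)

Collectives are then reached through the same module (the tests below call, e.g., dist.all_reduce(tensor, op, group_id)), which is what makes the frontend backward compatible with the old torch.distributed surface.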
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10871 Differential Revision: D9554514 Pulled By: teng-li fbshipit-source-id: fb686ad42258526c8b4372148e82969fac4f42dd --- .jenkins/pytorch/build.sh | 25 +- test/common.py | 10 + test/run_test.py | 33 +- test/test_c10d.py | 36 +- test/test_distributed.py | 108 +- test/test_thd_distributed.py | 1148 ++++++++++++++++++++ torch/csrc/distributed/c10d/init.cpp | 4 +- torch/distributed/c10d/__init__.py | 16 +- torch/distributed/c10d/distributed_c10d.py | 1054 ++++++++++++++++++ torch/distributed/c10d/rendezvous.py | 51 +- torch/lib/c10d/ProcessGroupMPI.cpp | 17 +- torch/nn/parallel/distributed_c10d.py | 20 +- 12 files changed, 2404 insertions(+), 118 deletions(-) create mode 100644 test/test_thd_distributed.py create mode 100644 torch/distributed/c10d/distributed_c10d.py diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index f1eda3103a24af..0f26005f74cb22 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -1,15 +1,28 @@ #!/bin/bash +# For distributed, four environmental configs: +# (1) build with only NCCL +# (2) build with NCCL and MPI +# (3) build with only MPI +# (4) build with neither +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 +fi + +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install openmpi-bin libopenmpi-dev + sudo apt-get install -y --no-install-recommends openssh-client openssh-server + sudo mkdir -p /var/run/sshd +fi + if [[ "$BUILD_ENVIRONMENT" == "pytorch-linux-xenial-py3-clang5-asan" ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" $* fi -# TODO: move this to Docker -# TODO: add both NCCL and MPI in CI test by fixing these test first -sudo apt-get update -sudo apt-get install libnccl-dev libnccl2 -# sudo apt-get install openmpi-bin libopenmpi-dev - # Required environment variable: $BUILD_ENVIRONMENT # (This is set by default in the Docker images we build, so you don't # need to set it yourself. 
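The build matrix above only controls which backends get compiled in; at run time the new package exposes availability probes, which test/run_test.py below uses to decide which backend configurations to launch. A small sketch of that probing, using only the functions already shown in this patch:

import torch.distributed.c10d as c10d

# Mirrors the four build configurations: NCCL only, NCCL + MPI, MPI only, neither.
if c10d.is_available():
    backends = ["gloo"]                 # gloo is listed unconditionally, as run_test.py assumes
    if c10d.is_mpi_available():
        backends.append("mpi")
    if c10d.is_nccl_available():
        backends.append("nccl")
    print("distributed backends available:", backends)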
diff --git a/test/common.py b/test/common.py index 1c86bcd7fe24b8..545ba4f1f0dd22 100644 --- a/test/common.py +++ b/test/common.py @@ -17,6 +17,7 @@ import warnings import random import contextlib +import socket from functools import wraps from itertools import product from copy import deepcopy @@ -550,3 +551,12 @@ def download_file(url, binary=True): msg = "could not download test file '{}'".format(url) warnings.warn(msg, RuntimeWarning) raise unittest.SkipTest(msg) + + +def find_free_port(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(('localhost', 0)) + sockname = sock.getsockname() + sock.close() + return sockname[1] diff --git a/test/run_test.py b/test/run_test.py index 4d33d34d407476..8fd32b7e75c07f 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -14,6 +14,7 @@ import torch from torch.utils import cpp_extension from common import TEST_WITH_ROCM +import torch.distributed.c10d as c10d TESTS = [ 'autograd', @@ -31,12 +32,14 @@ 'nn', 'optim', 'sparse', + 'thd_distributed', 'torch', 'utils', ] WINDOWS_BLACKLIST = [ 'distributed', + 'thd_distributed', ] ROCM_BLACKLIST = [ @@ -50,10 +53,29 @@ 'multiprocessing', 'nccl', 'nn', + 'thd_distributed', 'utils', ] DISTRIBUTED_TESTS_CONFIG = { + 'gloo': { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + }, +} + + +if c10d.is_available(): + if c10d.is_mpi_available(): + DISTRIBUTED_TESTS_CONFIG['mpi'] = { + 'WORLD_SIZE': '3' + } + if c10d.is_nccl_available(): + DISTRIBUTED_TESTS_CONFIG['nccl'] = { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + } + + +THD_DISTRIBUTED_TESTS_CONFIG = { 'tcp': { 'WORLD_SIZE': '3' }, @@ -126,7 +148,10 @@ def test_distributed(python, test_module, test_directory, options): if options.verbose and not mpi_available: print_to_stderr( 'MPI not available -- MPI backend tests will be skipped') - for backend, env_vars in DISTRIBUTED_TESTS_CONFIG.items(): + config = DISTRIBUTED_TESTS_CONFIG + if test_module == "test_thd_distributed": + config = THD_DISTRIBUTED_TESTS_CONFIG + for backend, env_vars in config.items(): if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: @@ -141,7 +166,10 @@ def test_distributed(python, test_module, test_directory, options): os.environ['INIT_METHOD'] = 'env://' os.environ.update(env_vars) if with_init_file: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + if test_module == "test_distributed": + init_method = 'file://{}/'.format(tmp_dir) + else: + init_method = 'file://{}/shared_init_file'.format(tmp_dir) os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) @@ -170,6 +198,7 @@ def test_distributed(python, test_module, test_directory, options): CUSTOM_HANDLERS = { 'cpp_extensions': test_cpp_extensions, 'distributed': test_distributed, + 'thd_distributed': test_distributed, } diff --git a/test/test_c10d.py b/test/test_c10d.py index c448eba1349972..13f7b779d04736 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -1,7 +1,6 @@ import copy import math import multiprocessing -import socket import sys import tempfile import unittest @@ -10,6 +9,7 @@ from collections import namedtuple import torch +import common from torch import nn import torch.nn.functional as F from torch.distributed import c10d @@ -60,15 +60,6 @@ def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) -def find_free_port(): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - 
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(('localhost', 0)) - sockname = sock.getsockname() - sock.close() - return sockname[1] - - def gpus_for_rank(world_size): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. @@ -126,14 +117,14 @@ def _create_store(self): class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() return c10d.TCPStore(addr, port, True) class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() self.tcpstore = c10d.TCPStore(addr, port, True) self.prefix = "test_prefix" @@ -150,10 +141,10 @@ def test_unknown_handler(self): class RendezvousFileTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'path missing'): - gen = c10d.rendezvous('file://?rank=0&size=1') + gen = c10d.rendezvous('file://?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('file:///tmp/foo?size=1') + gen = c10d.rendezvous('file:///tmp/foo?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('file:///tmp/foo?rank=0') @@ -161,7 +152,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile() as file: - url = 'file://%s?size=%d' % (file.name, 2) + url = 'file://%s?world_size=%d' % (file.name, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -183,10 +174,10 @@ def test_nominal(self): class RendezvousTCPTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'port number missing'): - gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&size=1') + gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('tcp://127.0.0.1:23456?size=1') + gen = c10d.rendezvous('tcp://127.0.0.1:23456?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('tcp://127.0.0.1:23456?rank=0') @@ -194,8 +185,8 @@ def test_common_errors(self): def test_nominal(self): addr = 'localhost' - port = find_free_port() - url = 'tcp://%s:%d?size=%d' % (addr, port, 2) + port = common.find_free_port() + url = 'tcp://%s:%d?world_size=%d' % (addr, port, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -245,7 +236,7 @@ def setUpClass(cls): def setUp(self): self.rank = self.MAIN_PROCESS_RANK self.file = tempfile.NamedTemporaryFile() - self.port = find_free_port() + self.port = common.find_free_port() self.processes = [self._spawn_process(rank) for rank in range(int(self.world_size))] def tearDown(self): @@ -529,8 +520,9 @@ def _test_ddp_with_process_group(self, process_group): model = Net() ddp_model = distributed_c10d._DistributedDataParallelC10d( copy.deepcopy(model).cuda(gpus[0]), - process_group, - device_ids=gpus) + device_ids=gpus, + process_group=process_group) + model.cuda(gpus[0]) local_batch_size = len(gpus) diff --git a/test/test_distributed.py b/test/test_distributed.py index 47dbe9d056f154..38a32d69ef7c64 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -5,29 +5,32 @@ import os import sys import time +import tempfile import unittest 
from contextlib import contextmanager from functools import reduce, wraps import torch import torch.cuda -import torch.distributed as dist +import torch.distributed.c10d as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from common import TestCase from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR from torch.autograd import Variable - +import common BACKEND = os.environ["BACKEND"] TEMP_DIR = os.environ["TEMP_DIR"] INIT_METHOD = os.getenv("INIT_METHOD", "env://") -MASTER_PORT = "29500" DEFAULT_TIMEOUT = 300 CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} +if INIT_METHOD.startswith("file://"): + FOLDER = INIT_METHOD[7:] + def get_timeout(test_id): test_name = test_id.split(".")[-1] @@ -361,8 +364,9 @@ def test_broadcast_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_broadcast_group(self): group, group_id, rank = self._init_group_test() self._test_broadcast_helper(group, group_id, rank) @@ -454,7 +458,8 @@ def test_reduce_max(self): self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_sum(self): group, group_id, rank = self._init_group_test() @@ -469,7 +474,8 @@ def test_reduce_group_sum(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_product(self): group, group_id, rank = self._init_group_test() @@ -484,14 +490,16 @@ def test_reduce_group_product(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_max(self): group, group_id, rank = self._init_group_test() @@ -540,8 +548,8 @@ def test_all_reduce_sum(self): ) @unittest.skipIf( - BACKEND != "gloo" and BACKEND != "nccl", - "Only Gloo & Nccl backend support CUDA allReduce", + BACKEND != "gloo", + "Only Gloo backend will have CUDA allReduce tested", ) @skip_if_no_cuda_distributed @skip_if_no_gpu @@ -587,8 +595,9 @@ def test_all_reduce_max(self): group, group_id, rank, dist.reduce_op.MAX, 
-1, 10, 10 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_sum(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -601,8 +610,9 @@ def test_all_reduce_group_sum(self): 2 + (10 * (len(group) - 1)), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_product(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -615,16 +625,18 @@ def test_all_reduce_group_product(self): reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_max(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -652,6 +664,7 @@ def test_scatter(self): @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_scatter_group(self): group, group_id, rank = self._init_group_test() @@ -679,7 +692,8 @@ def test_gather(self): self._test_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_gather_group(self): group, group_id, rank = self._init_group_test() @@ -703,12 +717,13 @@ def _test_all_gather_helper( self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports CPU all gather") def test_all_gather(self): group, group_id, rank = self._init_global_test() self._test_all_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @unittest.skipIf(BACKEND == "nccl", "CUDA all gather skipped for NCCL") @skip_if_no_cuda_distributed @skip_if_no_gpu def test_all_gather_cuda(self): @@ -716,8 +731,10 @@ def test_all_gather_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not 
support group") def test_all_gather_group(self): group, group_id, rank = self._init_group_test() self._test_all_gather_helper(group, group_id, rank) @@ -740,13 +757,14 @@ def _test_barrier_helper(self, group, group_id, rank): self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") def test_barrier(self): group, group_id, rank = self._init_global_test() self._test_barrier_helper(group, group_id, rank) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_barrier_group(self): group, group_id, rank = self._init_group_test() self._test_barrier_helper(group, group_id, rank) @@ -765,7 +783,8 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): self.assertEqual(tensor, expected_tensor) self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() @@ -802,7 +821,8 @@ def _test_all_reduce_multigpu_helper( self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL") @skip_if_no_gpu def test_all_reduce_multigpu(self): group, group_id, rank = self._init_global_test() @@ -985,7 +1005,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel.DistributedDataParallel( + model_DDP = nn.parallel._DistributedDataParallelC10d( model_DDP, device_ids=gpu_subset ) @@ -1006,33 +1026,8 @@ def test_DistributedDataParallel(self): ) self._barrier() - @unittest.skipIf( - BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" - ) - def test_DistributedDataParallelCPU(self): - # Run a simple end to end DDP-CPU model, use result of single node - # model as baseline - group, group_id, rank = self._init_global_test() - - # cpu training setup - model_base = self._create_Net() - - # DDP-CPU training setup - model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) - - # dummy data initialization - local_bs = 2 - global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) - # check two model parameters over 2 iterations - self._test_DDP_2iter( - model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs - ) - self._barrier() - - -if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": +if BACKEND == "gloo" or BACKEND == "nccl": WORLD_SIZE = os.environ["WORLD_SIZE"] class TestDistBackend(TestCase, _DistTestBase): @@ -1052,7 +1047,6 @@ def wrapper(self): @classmethod def setUpClass(cls): os.environ["MASTER_ADDR"] = MASTER_ADDR - os.environ["MASTER_PORT"] = MASTER_PORT os.environ["WORLD_SIZE"] = WORLD_SIZE for attr in dir(cls): if attr.startswith("test"): @@ -1060,6 +1054,17 @@ def setUpClass(cls): setattr(cls, attr, cls.manager_join(fn)) def setUp(self): + # Adding this hack until we fix the FileStore to delete its + # content at the end + 
global INIT_METHOD + if INIT_METHOD.startswith("file://"): + _, filename = tempfile.mkstemp(prefix=FOLDER) + INIT_METHOD = "file://{}".format(filename) + + if INIT_METHOD.startswith("env://"): + port = common.find_free_port() + os.environ["MASTER_PORT"] = str(port) + self.processes = [] self.rank = self.MANAGER_PROCESS_RANK Barrier.init() @@ -1081,7 +1086,10 @@ def _run(self, rank): self.rank = rank try: dist.init_process_group( - init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + init_method=INIT_METHOD, + backend=BACKEND, + world_size=int(WORLD_SIZE), + rank=self.rank ) except RuntimeError as e: if "recompile" in e.args[0]: diff --git a/test/test_thd_distributed.py b/test/test_thd_distributed.py new file mode 100644 index 00000000000000..47dbe9d056f154 --- /dev/null +++ b/test/test_thd_distributed.py @@ -0,0 +1,1148 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import fcntl +import multiprocessing +import os +import sys +import time +import unittest +from contextlib import contextmanager +from functools import reduce, wraps + +import torch +import torch.cuda +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from common import TestCase +from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR +from torch.autograd import Variable + + +BACKEND = os.environ["BACKEND"] +TEMP_DIR = os.environ["TEMP_DIR"] +INIT_METHOD = os.getenv("INIT_METHOD", "env://") +MASTER_PORT = "29500" + +DEFAULT_TIMEOUT = 300 +CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} + + +def get_timeout(test_id): + test_name = test_id.split(".")[-1] + if test_name in CUSTOMIZED_TIMEOUT: + return CUSTOMIZED_TIMEOUT[test_name] + else: + return DEFAULT_TIMEOUT + + +if not dist.is_available(): + print("Distributed not available, skipping tests") + sys.exit(0) + +SKIP_IF_NO_CUDA_EXIT_CODE = 75 +SKIP_IF_NO_GPU_EXIT_CODE = 76 +SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77 +SKIP_IF_BACKEND_UNAVAILABLE = 78 + + +def skip_if_no_cuda_distributed(func): + func.skip_if_no_cuda_distributed = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_no_gpu(func): + """ Nccl multigpu tests requires at least 2 GPUS. Skip if this is not met""" + func.skip_if_no_gpu = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + if torch.cuda.device_count() < int(os.environ["WORLD_SIZE"]): + sys.exit(SKIP_IF_NO_GPU_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_small_worldsize(func): + func.skip_if_small_worldsize = True + + @wraps(func) + def wrapper(*args, **kwargs): + if (os.environ["BACKEND"] != "mpi") and int(os.environ["WORLD_SIZE"]) <= 2: + sys.exit(SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def apply_hack_for_nccl(): + # This is a hack for a known NCCL issue using multiprocess + # in conjunction with multiple threads to manage different GPUs which + # may cause ncclCommInitRank to fail. + # http://docs.nvidia.com/deeplearning/sdk/nccl-release-notes/rel_2.1.4.html#rel_2.1.4 + # It slows down the performance of collective operations. + # Without this setting NCCL might throw unhandled error. 
+ os.environ["NCCL_MAX_NRINGS"] = "1" + + +@contextmanager +def _lock(): + lockfile = os.path.join(TEMP_DIR, "lockfile") + with open(lockfile, "w") as lf: + try: + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield + finally: + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + lf.close() + + +def _build_tensor(size, value=None): + if value is None: + value = size + return torch.FloatTensor(size, size, size).fill_(value) + + +class Barrier(object): + barrier_id = 0 + + @classmethod + def init(cls): + cls.barrier_id = 0 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + for f_name in os.listdir(barrier_dir): + os.unlink(os.path.join(barrier_dir, f_name)) + + @classmethod + def sync(cls, timeout=5): + cls.barrier_id += 1 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + pid = str(os.getpid()) + barrier_file = os.path.join(barrier_dir, pid) + with _lock(): + with open(barrier_file, "w") as f: + f.write(str(cls.barrier_id)) + + start_time = time.time() + while True: + arrived = 0 + with _lock(): + for f_name in os.listdir(barrier_dir): + with open(os.path.join(barrier_dir, f_name), "r") as f: + data = f.read() + if int(data) >= cls.barrier_id: + arrived += 1 + if arrived == dist.get_world_size(): + break + + if time.time() - start_time > timeout: + raise RuntimeError("barrier timeout") + time.sleep(0.1) + + +class _DistTestBase(object): + def _barrier(self, *args, **kwargs): + Barrier.sync(*args, **kwargs) + + def _init_group_test(self): + group = [1, 2] + group_id = dist.new_group(group) + rank = dist.get_rank() + if rank not in group: + return ([], None, rank) + + return (group, group_id, rank) + + def _init_global_test(self): + group = [i for i in range(0, dist.get_world_size())] + group_id = dist.group.WORLD + rank = dist.get_rank() + return (group, group_id, rank) + + # HELPER FOR MULTIGPU TESTS + def _init_multigpu_helper(self): + """Multigpu tests are designed to simulate the multi nodes with multi + GPUs on each node. Nccl backend requires equal #GPUs in each process. + On a single node, all visible GPUs are evenly + divided to subsets, each process only uses a subset. 
+ """ + nGPUs = torch.cuda.device_count() + world_size = dist.get_world_size() + visible_devices = range(nGPUs) + + if BACKEND == "nccl": + apply_hack_for_nccl() + + nGPUs_per_process = nGPUs // world_size + rank_to_GPU = { + i: list( + visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process] + ) + for i in range(world_size) + } + return rank_to_GPU + + # GET RANK + def test_get_rank(self): + test_dir = os.path.join(TEMP_DIR, "test_dir") + pid = str(os.getpid()) + num_processes = dist.get_world_size() + with open(os.path.join(test_dir, pid), "w") as f: + f.write(str(dist.get_rank())) + + self._barrier() + + all_ranks = set() + for f_name in os.listdir(test_dir): + with open(os.path.join(test_dir, f_name), "r") as f: + all_ranks.add(int(f.read())) + self.assertEqual(len(all_ranks), num_processes) + + self._barrier() + + if dist.get_rank() == 0: + for f_name in os.listdir(test_dir): + os.unlink(os.path.join(test_dir, f_name)) + + self._barrier() + + # SEND RECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support send/recv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") + def test_send_recv(self): + rank = dist.get_rank() + tensor = _build_tensor(rank + 1) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(src + 1, value=-1) + expected_tensor = _build_tensor(src + 1) + dist.recv(tensor, src) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + # SEND RECV ANY SOURCE + @unittest.skipIf( + BACKEND == "gloo", "Gloo does not support send/recv from any source" + ) + @unittest.skipIf( + BACKEND == "nccl", "Nccl does not support send/recv from any source" + ) + def test_send_recv_any_source(self): + rank = dist.get_rank() + tensor = _build_tensor(10, rank) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + recv_ranks = set() + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(10, value=-1) + sender = dist.recv(tensor) + self.assertTrue(tensor.eq(sender).all()) + recv_ranks.add(sender) + + self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) + self._barrier() + + # ISEND + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support isend") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support isend") + def test_isend(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + requests = [ + dist.isend(_build_tensor(dest, 10), dest) + for dest in range(1, world_size) + ] + for request in requests: + request.wait() + self.assertTrue(request.is_completed()) + else: + tensor = _build_tensor(rank, -1) + dist.recv(tensor, 0) + self.assertEqual(tensor, _build_tensor(rank, 10)) + + self._barrier() + + # IRECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support irecv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support irecv") + def test_irecv(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)] + requests = [ + dist.irecv(expected_tensors[src - 1], src) + for src in range(1, world_size) + ] + + for src in range(1, world_size): + requests[src - 1].wait() + self.assertTrue(requests[src - 1].is_completed()) + self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10)) + else: + tensor = _build_tensor(rank, 10) + 
dist.send(tensor, 0) + + self._barrier() + + # BROADCAST + def _test_broadcast_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for ttype, value, requires_cuda in [ + ("torch.FloatTensor", -1e-10, False), + ("torch.DoubleTensor", -1e-100, False), + ("torch.HalfTensor", -0.1, True), + ("torch.CharTensor", -2, False), + ("torch.ByteTensor", 129, False), + ("torch.IntTensor", -1e5, False), + ("torch.LongTensor", -1e15, False), + ]: + if requires_cuda and not cuda: + continue + for src in group: + expected_tensor = _build_tensor(src + 1, value).type(ttype) + if cuda: + expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0]) + if rank == src: + dist.broadcast(expected_tensor, src, group_id) + else: + tensor = _build_tensor(src + 1, -1).type(ttype) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.broadcast(tensor, src, group_id) + self.assertEqual(tensor.size(), expected_tensor.size()) + self.assertEqual(tensor.ne(expected_tensor).max(), 0) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_broadcast(self): + group, group_id, rank = self._init_global_test() + self._test_broadcast_helper(group, group_id, rank) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo and Nccl backend supports CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_broadcast_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_broadcast_group(self): + group, group_id, rank = self._init_group_test() + self._test_broadcast_helper(group, group_id, rank) + + # REDUCE + def _test_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA reduce") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + 10 * (len(group) - 1), + True, + rank_to_GPU, + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] 
* (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + # ALL REDUCE + def _test_all_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo & Nccl backend support CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + True, + rank_to_GPU, + ) + + 
@unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + # SCATTER + def _test_scatter_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, -1) + expected_tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, i) for i in group] if rank == dest else [] + ) + dist.scatter(tensor, src=dest, scatter_list=tensors, group=group_id) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + def test_scatter(self): + group, group_id, rank = self._init_global_test() + self._test_scatter_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @skip_if_small_worldsize + def test_scatter_group(self): + group, group_id, rank = self._init_group_test() + self._test_scatter_helper(group, group_id, rank) + + # GATHER + def _test_gather_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, -1) for i in group] if rank == dest else [] + ) + dist.gather(tensor, dst=dest, gather_list=tensors, group=group_id) + if rank == dest: + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == 
"gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_gather(self): + group, group_id, rank = self._init_global_test() + self._test_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_gather_helper(group, group_id, rank) + + # ALL GATHER + def _test_all_gather_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = [_build_tensor(dest + 1, -1) for i in group] + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] + dist.all_gather(tensors, tensor, group_id) + + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_gather(self): + group, group_id, rank = self._init_global_test() + self._test_all_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_gather_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_all_gather_helper(group, group_id, rank) + + # BARRIER + def _test_barrier_helper(self, group, group_id, rank): + WAIT_TIME = 0.3 # seconds + + for dest in group: + expected_time = torch.DoubleTensor(1).fill_(0.0) + if dest == rank: + expected_time.fill_(time.time() + WAIT_TIME) + dist.broadcast(expected_time, dest, group_id) + time.sleep(WAIT_TIME + 0.1) # sleep a little bit longer + dist.barrier(group_id) + else: + dist.broadcast(expected_time, dest, group_id) + dist.barrier(group_id) + self.assertGreaterEqual(time.time(), expected_time[0]) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_barrier(self): + group, group_id, rank = self._init_global_test() + self._test_barrier_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_barrier_group(self): + group, group_id, rank = self._init_group_test() + self._test_barrier_helper(group, group_id, rank) + + def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for src in group: + expected_tensor = _build_tensor(src + 1) + tensors = [ + _build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank] + ] + if rank == src: + tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0]) + + dist.broadcast_multigpu(tensors, src, group_id) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @skip_if_no_gpu + def test_broadcast_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + 
self._test_broadcast_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _test_all_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + + dist.all_reduce_multigpu(tensors, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @skip_if_no_gpu + def test_all_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + self.assertEqual(tensors[0], expected_tensor) + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports reduce multigpu") + @skip_if_no_gpu + def test_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_all_gather_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for dest in group: + tensors = [ + _build_tensor(dest + 1).cuda(device=i) for i in rank_to_GPU[rank] + ] + + # construct expected output along with + # a place holder to receive all gather results + output_tensors = [] + expected_output = [] + output_per_gpu = ( + [_build_tensor(dest + 1, -1)] * len(rank_to_GPU[0]) * len(group) + ) + expected_per_gpu = ( + [_build_tensor(dest + 1)] * len(rank_to_GPU[0]) * len(group) + ) + for gpu in rank_to_GPU[rank]: + output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu]) + expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu]) + + dist.all_gather_multigpu(output_tensors, tensors, group_id) + self.assertEqual(output_tensors, expected_output) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allgather multigpu") + @skip_if_no_gpu + def test_all_gather_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _create_Net(self): + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 50, bias=False) + self.fc3 = nn.Linear(50, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = 
self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return F.softmax(x, dim=1) + + return Net() + + def _model_step(self, model): + for param in model.parameters(): + param.data += param.grad + param.grad = None + + def _prepare_dummy_data(self, local_bs): + # global_bs for DDP should be divisible by WORLD_SIZE + global_bs = int(WORLD_SIZE) * local_bs + input_cpu = torch.randn(global_bs, 2) + target = torch.randn(global_bs, 4) + loss = nn.MSELoss() + return global_bs, input_cpu, target, loss + + # END TO END TEST FOR DISTRIBUTEDDATAPARALLEL + def _test_DDP_helper(self, model, input_var, target, loss): + model.train() + output = model(input_var) + l = loss(output, target) + l.backward() + + def _assert_equal_param(self, param_gpu, param_DDP): + self.assertEqual(len(param_gpu), len(param_DDP)) + for p_gpu, p_DDP in zip(param_gpu, param_DDP): + self.assertEqual(p_gpu, p_DDP) + + def _test_DDP_2iter( + self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size + ): + for _ in range(2): + # single cpu/gpu training + self._test_DDP_helper(model_base, input, target, loss) + + # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs + self._test_DDP_helper( + model_DDP, + input[rank * local_bs: (rank + 1) * local_bs], + target[rank * local_bs: (rank + 1) * local_bs], + loss, + ) + + # Update weights and run a second iteration to shake out errors + self._model_step(model_base) + self._model_step(model_DDP) + self._assert_equal_param( + list(model_base.parameters()), list(model_DDP.module.parameters()) + ) + + # Shuffle the input so that DDP input is different + input = input[torch.randperm(batch_size)] + + @unittest.skipIf( + BACKEND != "nccl" and BACKEND != "gloo", + "Only Nccl & Gloo backend support DistributedDataParallel", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_DistributedDataParallel(self): + # Run a simple end to end DDP model, use result of single node model + # as baseline + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + + # cpu training setup + model = self._create_Net() + + # single gpu training setup + model_gpu = copy.deepcopy(model) + gpu_subset = list(rank_to_GPU[rank]) + model_gpu.cuda(gpu_subset[0]) + + # DDP training setup + model_DDP = copy.deepcopy(model) + model_DDP.cuda(gpu_subset[0]) + model_DDP = nn.parallel.DistributedDataParallel( + model_DDP, device_ids=gpu_subset + ) + + # dummy data initialization + local_bs = len(gpu_subset) + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_gpu, + model_DDP, + input_cpu.cuda(gpu_subset[0]), + target.cuda(gpu_subset[0]), + loss, + local_bs, + rank, + global_bs, + ) + self._barrier() + + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" + ) + def test_DistributedDataParallelCPU(self): + # Run a simple end to end DDP-CPU model, use result of single node + # model as baseline + group, group_id, rank = self._init_global_test() + + # cpu training setup + model_base = self._create_Net() + + # DDP-CPU training setup + model_DDP = copy.deepcopy(model_base) + model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + + # dummy data initialization + local_bs = 2 + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_base, model_DDP, input_cpu, target, loss, local_bs, rank, 
global_bs + ) + self._barrier() + + +if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": + WORLD_SIZE = os.environ["WORLD_SIZE"] + + class TestDistBackend(TestCase, _DistTestBase): + MANAGER_PROCESS_RANK = -1 + + @staticmethod + def manager_join(fn): + @wraps(fn) + def wrapper(self): + if self.rank == self.MANAGER_PROCESS_RANK: + self._join_and_reduce(fn) + else: + fn(self) + + return wrapper + + @classmethod + def setUpClass(cls): + os.environ["MASTER_ADDR"] = MASTER_ADDR + os.environ["MASTER_PORT"] = MASTER_PORT + os.environ["WORLD_SIZE"] = WORLD_SIZE + for attr in dir(cls): + if attr.startswith("test"): + fn = getattr(cls, attr) + setattr(cls, attr, cls.manager_join(fn)) + + def setUp(self): + self.processes = [] + self.rank = self.MANAGER_PROCESS_RANK + Barrier.init() + for rank in range(int(WORLD_SIZE)): + self.processes.append(self._spawn_process(rank)) + + def tearDown(self): + for p in self.processes: + p.terminate() + + def _spawn_process(self, rank): + os.environ["RANK"] = str(rank) + name = "process " + str(rank) + process = multiprocessing.Process(target=self._run, name=name, args=(rank,)) + process.start() + return process + + def _run(self, rank): + self.rank = rank + try: + dist.init_process_group( + init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + ) + except RuntimeError as e: + if "recompile" in e.args[0]: + sys.exit(SKIP_IF_BACKEND_UNAVAILABLE) + # sys.exit(0) + raise + # self.id() == e.g. '__main__.TestDistributed.test_get_rank' + # We're retreiving a corresponding test and executing it. + getattr(self, self.id().split(".")[2])() + sys.exit(0) + + def _join_and_reduce(self, fn): + skip_ok = ( + getattr(fn, "skip_if_no_cuda_distributed", False) or + getattr(fn, "skip_if_no_gpu", False) or + getattr(fn, "skip_if_small_worldsize", False) + ) + self.JOIN_TIMEOUT = get_timeout(self.id()) + for p in self.processes: + p.join(self.JOIN_TIMEOUT) + + first_process = self.processes[0] + for p in self.processes: + self.assertEqual(p.exitcode, first_process.exitcode) + + if first_process.exitcode == SKIP_IF_BACKEND_UNAVAILABLE: + raise unittest.SkipTest("Compiled without the " + BACKEND + " backend") + + if skip_ok: + # do this first so we don't give an error message about + # mismatched exit codes if the first isn't valid + assert ( + first_process.exitcode == 0 or + first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE or + first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE or + first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE + ) + + if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE: + raise unittest.SkipTest("cuda is not available") + if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE: + raise unittest.SkipTest( + "One unique gpu per process is not available" + ) + if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE: + raise unittest.SkipTest("worldsize is too small to run group tests") + + self.assertEqual(first_process.exitcode, 0) + + +elif BACKEND == "mpi": + WORLD_SIZE = os.environ["WORLD_SIZE"] + dist.init_process_group(init_method=INIT_METHOD, backend="mpi") + + class TestMPI(TestCase, _DistTestBase): + pass + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + unittest.main() diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index fdf88bc0704a47..a67d009e024360 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -346,8 +346,8 @@ 
PyObject* c10d_init(PyObject* _unused) { #endif shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") - .def("isCompleted", &::c10d::ProcessGroup::Work::isCompleted) - .def("isSuccess", &::c10d::ProcessGroup::Work::isSuccess) + .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted) + .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) .def("exception", &::c10d::ProcessGroup::Work::exception) .def("synchronize", &::c10d::ProcessGroup::Work::synchronize) .def( diff --git a/torch/distributed/c10d/__init__.py b/torch/distributed/c10d/__init__.py index 3b98424e891479..5356097743aa3c 100644 --- a/torch/distributed/c10d/__init__.py +++ b/torch/distributed/c10d/__init__.py @@ -6,20 +6,8 @@ def is_available(): if is_available() and not torch._C._c10d_init(): - raise RuntimeError("c10d initialization failed") + raise RuntimeError("Failed to initialize PyTorch distributed support") if is_available(): - from .rendezvous import rendezvous, register_rendezvous_handler - from . import BroadcastOptions, AllreduceOptions - - DEFAULT_REDUCE_OPTIONS = AllreduceOptions() - - def broadcast(tensor, src, process_group): - opts = BroadcastOptions() - opts.rootRank = src - opts.rootTensor = 0 - return process_group.broadcast([tensor], opts) - - def all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): - return process_group.allreduce([tensor], opts) + from .distributed_c10d import * diff --git a/torch/distributed/c10d/distributed_c10d.py b/torch/distributed/c10d/distributed_c10d.py new file mode 100644 index 00000000000000..dc341f99427552 --- /dev/null +++ b/torch/distributed/c10d/distributed_c10d.py @@ -0,0 +1,1054 @@ +import torch + +from .rendezvous import rendezvous, register_rendezvous_handler +from . import BroadcastOptions, AllreduceOptions, ReduceOptions, \ + ScatterOptions, GatherOptions +from . import ReduceOp as reduce_op +from . import PrefixStore +from . import ProcessGroupGloo + + +_MPI_AVAILBLE = True +_NCCL_AVAILBLE = True + + +try: + from. import ProcessGroupMPI +except ImportError: + _MPI_AVAILBLE = False + +try: + from. 
import ProcessGroupNCCL +except ImportError: + _NCCL_AVAILBLE = False + + +class DistBackend: + UNDEFINED = -1 + GLOO = 0 + NCCL = 2 + MPI = 3 + + +class group(object): + WORLD = object() + + +class GroupMember(object): + # Alias to group.WORLD for backward compatibility + WORLD = group.WORLD + NON_GROUP_MEMBER = object() + + +# Cached process groups, map from ProcessGroup to (DistBackend, Store) +_pg_map = {} +# Process group's names, map from ProcessGroup to str +_pg_names = {} +# Process group's global rank to local rank mapping +_pg_group_ranks = {} + +# Default process group state +_default_pg = None +_default_pg_init_method = None + +# Process group count for default naming +_group_count = 0 + + +def _rank_not_in_group(group): + """ + Helper that checks if the current process's rank is not in a given group + + """ + return group == GroupMember.NON_GROUP_MEMBER + + +def _get_group_rank(group, rank): + """ + Helper that gets a given group's local rank in the group from a given global + rank + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank = _pg_group_ranks[group][rank] + if group_rank is None: + raise RuntimeError("The global rank is not part of the group") + return group_rank + + +def _get_global_rank(group, group_rank): + """ + Helper that gets a given group's global rank from a given local rank in the + group + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank_map = _pg_group_ranks[group] + for rank, grp_rank in group_rank_map.items(): + if grp_rank == group_rank: + return rank + raise RuntimeError("The group rank is not part of the group") + + +def _check_default_pg(): + """ + Helper that checks if the default ProcessGroup has been initializd, with + assertion + + """ + assert _default_pg is not None, \ + "Default process group is not initialized" + + +def is_mpi_available(): + """ + Checks if MPI is available + + """ + return _MPI_AVAILBLE + + +def is_nccl_available(): + """ + Checks if NCCL is available + + """ + return _NCCL_AVAILBLE + + +def is_initialized(): + """ + Checking if the default process group has been initialized + + """ + return _default_pg is not None + + +def get_default_group(): + """ + Getting the default process group created by init_process_group + + """ + if not is_initialized(): + raise RuntimeError("Default process group has not been initialized, " + "please make sure to call init_process_group.") + return _default_pg + + +def init_process_group(backend, + init_method="env://", + **kwargs): + """ + Initializes the default distributed process group, and this will also + initialize the distributed package + + Arguments: + backend (str): Name of the backend to use. Depending on build-time + configuration valid values include: + ``mpi`` and ``gloo``. + init_method (str, optional): URL specifying how to initialize the + process group. + world_size (int, optional): Number of processes participating in + the job. + rank (int, optional): Rank of the current process. + group_name (str, optional, deprecated): Group name. + + To enable ``backend == mpi``, PyTorch needs to built from source on + a system that supports MPI. The same applies to NCCL as well. 
+ + """ + global _pg_map + global _pg_names + global _default_pg + global _default_pg_init_method + + if _default_pg is not None: + raise RuntimeError("trying to initialize the default process group " + "twice!") + + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, \ + "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if backend == "mpi": + if not is_mpi_available(): + raise RuntimeError("Distributed package doesn't have MPI built in") + + _default_pg = ProcessGroupMPI() + _pg_map[_default_pg] = (DistBackend.MPI, None) + else: + # backward compatible API + if init_method != "env://" and world_size != -1 and rank != -1: + url = "{}?rank={}&world_size={}".format(init_method, + rank, + world_size) + store, _, _ = next(rendezvous(url)) + else: + store, rank, world_size = next(rendezvous(init_method)) + + if backend == "gloo": + _default_pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.GLOO, store) + _pg_names[_default_pg] = group_name + elif backend == "nccl": + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + _default_pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.NCCL, store) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Invalid distributed backend name: " + backend) + + _default_pg_init_method = init_method + + +def _new_process_group_helper(world_size, rank, group_name=""): + """ + Create a new distributed process group. And the new process group can be + used to perform collective operations. + + """ + global _pg_map + global _group_count + global _pg_names + + if not group_name: + group_name = str(_group_count) + _group_count += 1 + + if group_name in _pg_names.values(): + raise RuntimeError("The specified group name has already been " + "created, please use a different group name") + + default_backend, default_store = _pg_map[_default_pg] + + # Create the prefix store + store = PrefixStore(group_name, default_store) + + if default_backend == DistBackend.GLOO: + pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[pg] = (DistBackend.GLOO, store, group_name) + _pg_names[_default_pg] = group_name + elif default_backend == DistBackend.NCCL: + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[pg] = (DistBackend.NCCL, store, group_name) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Unsupported distributed backend by group") + return pg + + +def destroy_process_group(group=group.WORLD): + """ + Destroy a given process group, and deinitialize the distributed package + + Arguments: + group (ProcessGroup, optional): The process group to be destroyed, if + group.WORLD is given, all process + groups including the default one will + be destroyed. 
+ """ + if _rank_not_in_group(group): + return + + global _pg_map + global _pg_names + global _pg_group_ranks + global _default_pg + global _default_pg_init_method + + if group == GroupMember.WORLD: + pg = _default_pg + + if _pg_map.get(pg, None) is None: + raise RuntimeError("Invalid process group specified") + + if group == GroupMember.WORLD: + _default_pg = None + _default_pg_init_method = None + _pg_map.clear() + _pg_names.clear() + _pg_group_ranks.clear() + else: + del _pg_map[pg] + del _pg_names[pg] + del _pg_group_ranks[pg] + + +def get_rank(group=group.WORLD): + """ + Returns the rank of currrent process group + + Rank is a unique identifier assigned to each process within a distributed + process group. They are always consecutive integers ranging from 0 to + ``world_size``. + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The rank of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.rank() + + return group.rank() + + +def get_world_size(group=group.WORLD): + """ + Returns the number of processes in the current process group + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The world size of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.size() + + return group.size() + + +def isend(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.send([tensor], dst) + else: + group_dst_rank = _get_group_rank(group, dst) + return group.send([tensor], group_dst_rank) + + +def irecv(tensor, + src, + group=group.WORLD): + """ + Receives a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.recv([tensor], src) + else: + group_src_rank = _get_group_rank(group, src) + return group.recv([tensor], group_src_rank) + + +def send(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + _default_pg.send([tensor], dst).wait() + else: + group_dst_rank = _get_group_rank(group, dst) + group.send([tensor], group_dst_rank).wait() + + +def recv(tensor, + src=None, + group=group.WORLD): + """ + Receives a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int, optional): Source rank. Will receive from any + process if unspecified. 
+ group (ProcessGroup, optional): The process group to work on + + Returns: + Sender rank + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + pg = _default_pg + else: + pg = group + + if src is None: + rank_tensor = torch.IntTensor([-1]) + pg.recv_anysource([tensor], rank_tensor).wait() + src_rank = rank_tensor[0].item() + if group == GroupMember.WORLD: + return src_rank + else: + return _get_global_rank(pg, src_rank) + else: + if group == GroupMember.WORLD: + pg.recv([tensor], src).wait() + else: + group_src_rank = _get_group_rank(pg, src) + pg.recv([tensor], group_src_rank).wait() + return src + + +def broadcast_multigpu(tensor_list, + src, + group=group.WORLD, + async_op=False, + src_tensor=0): + """ + Broadcasts the tensor to the whole group with multiple GPU tensors + per node. + + ``tensor`` must have the same number of elements in all the GPUs from + all processes participating in the collective. each tensor in the list must + be on a different GPU + + Only nccl and gloo backend are currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Tensors that participate in the collective + operation. if ``src`` is the rank, then ``src_tensor``th element of + ``tensor_list`` (``tensor_list[src_tensor]``) will be broadcasted + to all other tensors (on different GPUs) in the src process and + all tensors in ``tensor_list`` of other non-src processes. + You also need to make sure that ``len(tensor_list)`` is the same + for all the distributed processes calling this function. + + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + src_tensor (int, optional): Source tensor rank within ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = src_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast(tensor_list, opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast(tensor_list, opts) + if async_op: + return work + else: + work.wait() + + +def broadcast(tensor, + src, + group=group.WORLD, + async_op=False): + """ + Broadcasts the tensor to the whole group. + + ``tensor`` must have the same number of elements in all processes + participating in the collective. + + Arguments: + tensor (Tensor): Data to be sent if ``src`` is the rank of current + process, and tensor to be used to save received data otherwise. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. 
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast([tensor], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast([tensor], opts) + if async_op: + return work + else: + work.wait() + + +def all_reduce_multigpu(tensor_list, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. This function reduces a number of tensors on every node, + while each tensor resides on different GPUs. + Therefore, the input tensor in the tensor list needs to be GPU tensors. + Also, each tensor in the tensor list needs to reside on a different GPU. + + After the call, all ``tensor`` in ``tensor_list`` is going to be bitwise + identical in all processes. + + Only nccl and gloo backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor list (List[Tensor]): List of input and output tensors of + the collective. The function operates in-place and requires that + each tensor to be a GPU tensor on different GPUs. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce(tensor_list, opts) + else: + work = group.allreduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def all_reduce(tensor, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. + + After the call ``tensor`` is going to be bitwise identical in all processes. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce([tensor], opts) + else: + work = group.allreduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def reduce_multigpu(tensor_list, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False, + dst_tensor=0): + """ + Reduces the tensor data on multiple GPUs across all machines. 
Each tensor + in ``tensor_list`` should reside on a separate GPU + + Only the GPU of ``tensor_list[dst_tensor]`` on the process with rank ``dst`` + is going to receive the final result. + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Input and output GPU tensors of the + collective. The function operates in-place. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + dst_tensor (int, optional): Destination tensor rank within + ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, otherwise + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + opts.rootTensor = dst_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce(tensor_list, opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def reduce(tensor, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines. + + Only the process with rank ``dst`` is going to receive the final result. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce([tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def all_gather_multigpu(output_tensor_lists, + input_tensor_list, + group=group.WORLD, + async_op=False): + """ + Gathers tensors from the whole group in a list. + Each tensor in ``tensor_list`` should reside on a separate GPU + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + output_tensor_lists (List[List[Tensor]]): Output lists. It should + contain correctly-sized tensors on each GPU to be used for output of + the collective. + e.g. ``output_tensor_lists[i]`` contains the all_gather + result that resides on the GPU of ``input_tensor_list[i]``. + Note that each element of ``output_tensor_lists[i]`` has the size of + ``world_size * len(input_tensor_list)``, since the function all + gathers the result from every single GPU in the group. 
To interpret + each element of ``output_tensor_list[i]``, note that + ``input_tensor_list[j]`` of rank k will be appear in + ``output_tensor_list[i][rank * world_size + j]`` + Also note that ``len(output_tensor_lists)``, and the size of each + element in ``output_tensor_lists`` (each element is a list, + therefore ``len(output_tensor_lists[i])``) need to be the same + for all the distributed processes calling this function. + + input_tensor_list (List[Tensor]): List of tensors(on different GPUs) to + be broadcast from current process. + Note that ``len(input_tensor_list)`` needs to be the same for + all the distributed processes calling this function. + + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allgather(output_tensor_lists, input_tensor_list) + else: + work = group.allgather(output_tensor_lists, input_tensor_list) + + if async_op: + return work + else: + work.wait() + + +def all_gather(tensor_list, + tensor, + group=group.WORLD, + async_op=False): + """ + Gathers tensors from the whole group in a list. + + Arguments: + tensor_list (list[Tensor]): Output list. It should contain + correctly-sized tensors to be used for output of the collective. + tensor (Tensor): Tensor to be broadcast from current process. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allgather([tensor_list], [tensor]) + else: + work = group.allgather([tensor_list], [tensor]) + + if async_op: + return work + else: + work.wait() + + +def gather(tensor, + gather_list, + dst, + group=group.WORLD, + async_op=False): + """ + Gathers a list of tensors in a single process. + + Arguments: + tensor (Tensor): Input tensor. + gather_list (list[Tensor]): List of appropriately-sized tensors to + use for received data. Required only in the receiving process. + dst (int): Destination rank. Required in all processes except the one + that is receiveing the data. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. 
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if dst == my_rank: + if gather_list is None: + raise RuntimeError("gather_list is a required argument in gather " + "destination") + else: + if gather_list: + raise RuntimeError("non-empty gather_list can be given only " + "to gather destination") + + opts = GatherOptions() + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.gather([gather_list], [tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.gather([gather_list], [tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def scatter(tensor, + scatter_list, + src, + group=group.WORLD, + async_op=False): + """ + Scatters a list of tensors to all processes in a group. + + Each process will receive exactly one tensor and store its data in the + ``tensor`` argument. + + Arguments: + tensor (Tensor): Output tensor. + scatter_list (list[Tensor]): List of tensors to scatter. Required only + in the process that is sending the data. + src (int): Source rank. Required in all processes except the one that + is sending the data. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if src == my_rank: + if scatter_list is None: + raise RuntimeError("scatter_list is a required argument in " + "scatter source") + else: + if scatter_list: + raise RuntimeError("non-empty can be given only to scatter " + "source") + + opts = ScatterOptions() + opts.rootRank = src + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.scatter([tensor], [scatter_list], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.scatter([tensor], [scatter_list], opts) + + if async_op: + return work + else: + work.wait() + + +def barrier(group=group.WORLD, + async_op=False): + """ + Synchronizes all processes. + + This collective blocks processes until the whole group enters this function, + if async_op is False, or if async work handle is called on wait(). + + Arguments: + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.barrier() + else: + work = group.barrier() + + if async_op: + return work + else: + work.wait() + + +def new_group(ranks=None): + """ + Creates a new distributed group. + + This function requires that all processes in the main group (i.e. all + processes that are part of the distributed job) enter this function, even + if they are not going to be members of the group. Additionally, groups + should be created in the same order in all processes. + + Arguments: + ranks (list[int]): List of ranks of group members. + + Returns: + A handle of distributed group that can be given to collective calls. 
+ """ + + _check_default_pg() + + global _pg_group_ranks + + default_backend, _ = _pg_map[_default_pg] + if default_backend == DistBackend.MPI: + raise RuntimeError("Only NCCL and Gloo backend currently support " + "new_group function") + + global_rank = _default_pg.rank() + global_world_size = _default_pg.size() + + # checks the input ranks + if ranks is not None: + group_world_size = len(ranks) + if group_world_size > global_world_size: + raise RuntimeError("the new group's world size should be less or " + "equal to the world size set by " + "init_process_group") + # check ranks' sanity + for rank in ranks: + if rank < 0 or rank >= global_world_size: + raise RuntimeError("The new group's rank should be within the " + "the world_size set by init_process_group") + + if global_rank in ranks: + group_rank = ranks.index(global_rank) + else: + group_rank = None + else: + group_world_size = global_world_size + group_rank = global_rank + + # Release ranks not in the group + if global_rank not in ranks: + return GroupMember.NON_GROUP_MEMBER + + pg = _new_process_group_helper(group_world_size, group_rank) + + # Create the global rank to group rank mapping + _pg_group_ranks[pg] = {} + for rank in range(global_world_size): + if rank in ranks: + _pg_group_ranks[pg][rank] = ranks.index(rank) + else: + _pg_group_ranks[pg][rank] = None + + return pg + + +# TODO: delete these functions and replace DDP with public functions +DEFAULT_REDUCE_OPTIONS = AllreduceOptions() + + +def _broadcast(tensor, src, process_group): + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + return process_group.broadcast([tensor], opts) + + +def _all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): + return process_group.allreduce([tensor], opts) diff --git a/torch/distributed/c10d/rendezvous.py b/torch/distributed/c10d/rendezvous.py index 062443f87abfec..30c9f2dfe7dd3b 100644 --- a/torch/distributed/c10d/rendezvous.py +++ b/torch/distributed/c10d/rendezvous.py @@ -3,6 +3,7 @@ except ImportError: from urlparse import urlparse +import os from . 
import FileStore, TCPStore @@ -59,13 +60,13 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) store = FileStore(path) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using file:// method") @@ -81,18 +82,52 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) start_daemon = rank == 0 store = TCPStore(result.hostname, result.port, start_daemon) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using tcp:// method") +def _env_rendezvous_handler(url): + def _error(msg): + return ValueError("env:// rendezvous: " + msg) + + if url != "env://": + raise _error("Only `env://` is expected for the env init method") + world_size = os.environ["WORLD_SIZE"] + if world_size is None: + raise _error("world size is missing") + rank = os.environ["RANK"] + if rank is None: + raise _error("rank is missing") + master_addr = os.environ["MASTER_ADDR"] + if master_addr is None: + raise _error("master addr is missing") + master_port = os.environ["MASTER_PORT"] + if master_port is None: + raise _error("master port is missing") + + # Converting before creating the store + rank = int(rank) + world_size = int(world_size) + master_port = int(master_port) + + # Now start the TCP store daemon on the rank 0 + start_daemon = rank == 0 + store = TCPStore(master_addr, master_port, start_daemon) + yield (store, rank, world_size) + + # If this configuration is invalidated, there is nothing we can do about it + raise RuntimeError("Unable to perform rerendezvous using env:// method") + + register_rendezvous_handler("file", _file_rendezvous_handler) register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 3d2bad9191a1fb..3afa33c7536bac 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -386,16 +386,17 @@ std::shared_ptr ProcessGroupMPI::gather( const GatherOptions& opts) { checkSingleTensor(inputTensors); + if (outputTensors.size() != 1) { + throw std::runtime_error("Gather: multi-GPU collective is not supported"); + } + if (rank_ != opts.rootRank) { - if (outputTensors.size() > 0) { + if (outputTensors[0].size() > 0) { throw std::runtime_error( "Gather: number of output tensors should be 0 " "for non-root"); } } else { - if (outputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != outputTensors[0].size()) { throw std::runtime_error( "Gather: number of output tensors should equal " @@ -449,17 
+450,17 @@ std::shared_ptr ProcessGroupMPI::scatter( std::vector>& inputTensors, const ScatterOptions& opts) { checkSingleTensor(outputTensors); + if (inputTensors.size() != 1) { + throw std::runtime_error("Scatter: multi-GPU collective is not supported"); + } if (rank_ != opts.rootRank) { - if (inputTensors.size() > 0) { + if (inputTensors[0].size() > 0) { throw std::runtime_error( "Scatter: number of input tensors should be 0 " "for non-root"); } } else { - if (inputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != inputTensors[0].size()) { throw std::runtime_error( "Scatter: number of input tensors should equal " diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 1310d2d748c89c..daa03f9f585114 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -91,13 +91,14 @@ class _DistributedDataParallelC10d(Module): Args: module: module to be parallelized - process_group: the c10d process group to be used for distributed data - all-reduction device_ids: CUDA devices (default: all devices) output_device: device location of output (default: device_ids[0]) broadcast_buffers: flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function. (default: True) + process_group: the c10d process group to be used for distributed data + all-reduction. If None, the default process group will + be used bucket_cap_mb: DistributedDataParallelC10d will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. @@ -112,9 +113,9 @@ class _DistributedDataParallelC10d(Module): >>> pg = torch.distributed.c10d.ProcessGroupGloo(store, rank, world_size) >>> net = torch.nn._DistributedDataParallelC10d(model, pg) """ - def __init__(self, module, process_group, device_ids=None, + def __init__(self, module, device_ids=None, output_device=None, dim=0, broadcast_buffers=True, - bucket_cap_mb=25): + process_group=None, bucket_cap_mb=25): super(_DistributedDataParallelC10d, self).__init__() @@ -125,13 +126,19 @@ def __init__(self, module, process_group, device_ids=None, if output_device is None: output_device = device_ids[0] + if process_group is None: + self.process_group = c10d.get_default_group() + else: + self.process_group = process_group + self.dim = dim self.module = module - self.process_group = process_group self.device_ids = device_ids self.output_device = output_device self.broadcast_buffers = broadcast_buffers + self.allreduce_opts = c10d.AllreduceOptions() + MB = 1024 * 1024 # used for intra-node param sync and inter-node sync as well @@ -341,7 +348,8 @@ def _queue_reduction(self, bucket_idx): nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) # now work on the first gpu - reduction_work = c10d.all_reduce(grads_batch_coalesced[0], self.process_group) + reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], + self.allreduce_opts) self.reduction_works[bucket_idx] = reduction_work self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] From c99a143eea5224272e52b58a7714eec805671cab Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Wed, 29 Aug 2018 13:27:03 -0700 Subject: [PATCH 16/42] Update blackbox predictor with new constructor (#10920) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10920 Update the black box predictor and the related code to use the constructor with 
PredictorConfig. Reviewed By: highker Differential Revision: D9516972 fbshipit-source-id: fbd7ece934d527e17dc6bcc740b4e67e778afa1d --- caffe2/predictor/predictor_config.cc | 58 ++++++++++++++++++++++------ caffe2/predictor/predictor_config.h | 3 +- caffe2/predictor/predictor_utils.cc | 51 ++++++++++++++++++++++++ caffe2/predictor/predictor_utils.h | 10 +++++ 4 files changed, 109 insertions(+), 13 deletions(-) diff --git a/caffe2/predictor/predictor_config.cc b/caffe2/predictor/predictor_config.cc index aabff0daffcd73..0ca120d0121da5 100644 --- a/caffe2/predictor/predictor_config.cc +++ b/caffe2/predictor/predictor_config.cc @@ -10,7 +10,7 @@ namespace { // We don't use the getNet() from predictor_utils.cc here because that file // has additional dependencies that we want to avoid bringing in, to keep the // binary size as small as possible. -const NetDef& getNet(const MetaNetDef& def, const std::string& name) { +static const NetDef& getNet(const MetaNetDef& def, const std::string& name) { for (const auto& n : def.nets()) { if (n.key() == name) { return n.value(); @@ -19,7 +19,7 @@ const NetDef& getNet(const MetaNetDef& def, const std::string& name) { CAFFE_THROW("Net not found: ", name); } -const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( +static const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( const MetaNetDef& def, const std::string& name) { for (const auto& b : def.blobs()) { @@ -30,26 +30,60 @@ const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( CAFFE_THROW("Blob not found: ", name); } +static std::string combine(const std::string& str, const std::string& name) { + if (name.empty()) { + return std::string(str); + } + return str + "_" + name; +} + +static std::string getNamedPredictNet(const string& name) { + return combine(PredictorConsts::default_instance().predict_net_type(), name); +} + +static std::string getNamedInitNet(const string& name) { + return combine( + PredictorConsts::default_instance().predict_init_net_type(), name); +} + +static std::string getNamedInputs(const string& name) { + return combine(PredictorConsts::default_instance().inputs_blob_type(), name); +} + +static std::string getNamedOutputs(const string& name) { + return combine(PredictorConsts::default_instance().outputs_blob_type(), name); +} + +static std::string getNamedParams(const string& name) { + return combine( + PredictorConsts::default_instance().parameters_blob_type(), name); +} + } // namespace -PredictorConfig -makePredictorConfig(const MetaNetDef& def, Workspace* parent, bool run_init) { - const auto& init_net = - getNet(def, PredictorConsts::default_instance().global_init_net_type()); - const auto& run_net = - getNet(def, PredictorConsts::default_instance().predict_net_type()); +PredictorConfig makePredictorConfig( + const MetaNetDef& def, + Workspace* parent, + bool run_init, + const std::string& net_name) { + const auto& init_net = getNet(def, getNamedInitNet(net_name)); + const auto& run_net = getNet(def, getNamedPredictNet(net_name)); auto config = makePredictorConfig(init_net, run_net, parent, run_init); - const auto& inputs = - getBlobs(def, PredictorConsts::default_instance().inputs_blob_type()); + const auto& inputs = getBlobs(def, getNamedInputs(net_name)); for (const auto& input : inputs) { config.input_names.emplace_back(input); } - const auto& outputs = - getBlobs(def, PredictorConsts::default_instance().outputs_blob_type()); + const auto& outputs = getBlobs(def, getNamedOutputs(net_name)); for (const auto& output : outputs) { 
config.output_names.emplace_back(output); } + + const auto& params = getBlobs(def, getNamedParams(net_name)); + for (const auto& param : params) { + config.parameter_names.emplace_back(param); + } + return config; } diff --git a/caffe2/predictor/predictor_config.h b/caffe2/predictor/predictor_config.h index eda1c9d03ca2ba..b1555addfa6f08 100644 --- a/caffe2/predictor/predictor_config.h +++ b/caffe2/predictor/predictor_config.h @@ -45,7 +45,8 @@ CAFFE2_API Workspace makeWorkspace(std::shared_ptr paramete CAFFE2_API PredictorConfig makePredictorConfig( const MetaNetDef& net, Workspace* parent = nullptr, - bool run_init = true); + bool run_init = true, + const std::string& net_name = ""); CAFFE2_API PredictorConfig makePredictorConfig( const NetDef& init_net, diff --git a/caffe2/predictor/predictor_utils.cc b/caffe2/predictor/predictor_utils.cc index 4af83d0bea8c25..f5acd4f936010b 100644 --- a/caffe2/predictor/predictor_utils.cc +++ b/caffe2/predictor/predictor_utils.cc @@ -1,4 +1,5 @@ #include "caffe2/predictor/predictor_utils.h" +#include "caffe2/predictor/predictor_config.h" #include "caffe2/core/blob.h" #include "caffe2/core/logging.h" @@ -6,6 +7,13 @@ #include "caffe2/proto/predictor_consts.pb.h" #include "caffe2/utils/proto_utils.h" +CAFFE2_DEFINE_bool( + caffe2_predictor_claim_tensor_memory, + true, + "If false, then predictor will not claim tensor memory" + "otherwise when tensor is shrinked to a size smaller than current size " + "by FLAGS_caffe2_max_keep_on_shrink_memory, the memory will be claimed."); + namespace caffe2 { namespace predictor_utils { @@ -79,4 +87,47 @@ std::unique_ptr runGlobalInitialization( } } // namespace predictor_utils + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws) { + for (const auto& blob : input_blobs) { + ws->RemoveBlob(blob); + } + for (const auto& blob : output_blobs) { + ws->RemoveBlob(blob); + } +} + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path) { + // TODO: Remove this flags once Predictor accept PredictorConfig as + // constructors. These comes are copied temporarly from the Predictor. + if (FLAGS_caffe2_predictor_claim_tensor_memory) { + if (FLAGS_caffe2_max_keep_on_shrink_memory == LLONG_MAX) { + FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 1024 * 1024; + } + } + auto dbReader = + make_unique(db::CreateDB(db_type, db_path, db::READ)); + auto ws = std::make_shared(); + auto net_def = + predictor_utils::runGlobalInitialization(std::move(dbReader), ws.get()); + auto config = makePredictorConfig(*net_def, ws.get()); + config.ws = ws; + const auto& init_net = predictor_utils::getNet( + *net_def, PredictorConsts::default_instance().predict_init_net_type()); + CAFFE_ENFORCE(config.ws->RunNetOnce(init_net)); + config.ws->RemoveBlob( + PredictorConsts::default_instance().predictor_dbreader()); + // Input and output blobs should never be allocated in the master workspace + // since we'll end up with race-conditions due to these being shared among + // predictor threads / TL workspaces. Safely handle against globalInitNet + // creating them in the master. 
+ removeExternalBlobs(config.input_names, config.output_names, config.ws.get()); + return config; +} + } // namespace caffe2 diff --git a/caffe2/predictor/predictor_utils.h b/caffe2/predictor/predictor_utils.h index 8c9cb4a5792d48..af7799b039c8b7 100644 --- a/caffe2/predictor/predictor_utils.h +++ b/caffe2/predictor/predictor_utils.h @@ -24,4 +24,14 @@ CAFFE2_API std::unique_ptr runGlobalInitialization( Workspace* master); } // namespace predictor_utils + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path); + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws); + } // namespace caffe2 From cd9416317d4c66c9a15caed3e47bbaa0469f40e0 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 13:29:34 -0700 Subject: [PATCH 17/42] Minor copy-edit on setup.py Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10933 Reviewed By: cpuhrsch Differential Revision: D9526650 fbshipit-source-id: 8ad1c989bee7009b3f95a2641189f55cf6c1979f --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index a2dbff45a29025..e2446a1494d9f8 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,8 @@ # TORCH_CUDA_ARCH_LIST # specify which CUDA architectures to build for. # ie `TORCH_CUDA_ARCH_LIST="6.0;7.0"` +# These are not CUDA versions, instead, they specify what +# classes of NVIDIA hardware we should generate PTX for. # # ONNX_NAMESPACE # specify a namespace for ONNX built here rather than the hard-coded From b644d5e74a7a70dfbcb6be83e06d0288c8769c3d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 13:50:54 -0700 Subject: [PATCH 18/42] Delete context and get_context from Type. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11001 Reviewed By: cpuhrsch Differential Revision: D9557315 fbshipit-source-id: b9862b8dda49194298bb1a4fbc214d466f3c8350 --- aten/src/ATen/UndefinedType.cpp | 4 ++-- aten/src/ATen/UndefinedType.h | 2 +- aten/src/ATen/gen.py | 2 +- aten/src/ATen/native/cuda/Gesv.cu | 2 +- aten/src/ATen/templates/RegisterCPU.cpp | 2 +- aten/src/ATen/templates/SparseTypeDerived.cpp | 6 +++--- aten/src/ATen/templates/Type.cpp | 4 ++-- aten/src/ATen/templates/Type.h | 7 ++----- aten/src/ATen/templates/TypeDerived.cpp | 6 +++--- aten/src/ATen/templates/TypeDerived.h | 2 +- tools/autograd/templates/VariableType.cpp | 2 +- 11 files changed, 18 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 60d9c884b8aef2..2bc3965c6d33ae 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -3,8 +3,8 @@ namespace at { -UndefinedType::UndefinedType(Context* context) - : Type(context, UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} +UndefinedType::UndefinedType() + : Type(UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 2cb14a3a652c4f..d216e3131dd693 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -12,7 +12,7 @@ namespace at { struct UndefinedType final : public Type { - explicit UndefinedType(Context* context); + explicit UndefinedType(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py 
index 53879e56ffb342..bb6d71f54c2d1a 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -125,7 +125,7 @@ def check_all_files_written(self): TYPE_REGISTER = CodeTemplate("""\ context->type_registry[static_cast(Backend::${backend})] [static_cast(ScalarType::${scalar_type})] - .reset(new ${type_name}(context)); + .reset(new ${type_name}()); detail::getVariableHooks().registerVariableTypeFor(context, Backend::${backend}, ScalarType::${scalar_type}); """) diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 0692dd0fea2901..bc37e83990e192 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -48,7 +48,7 @@ void magmaGesvBatched( } static magma_queue_t createMagmaQueue(const Tensor& tensor) { - auto& context = tensor.type().get_context(); + auto& context = at::globalContext(); magma_queue_t magma_queue; magma_queue_create_from_cuda( tensor.get_device(), diff --git a/aten/src/ATen/templates/RegisterCPU.cpp b/aten/src/ATen/templates/RegisterCPU.cpp index 184af2c8c014da..0c1eeb4818fbbc 100644 --- a/aten/src/ATen/templates/RegisterCPU.cpp +++ b/aten/src/ATen/templates/RegisterCPU.cpp @@ -14,7 +14,7 @@ namespace at { void register_cpu_types(Context * context) { ${cpu_type_registrations} context->type_registry[static_cast(Backend::Undefined)] - [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); + [static_cast(ScalarType::Undefined)].reset(new UndefinedType()); } } // namespace at diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 4a17004bb5ff8c..2ef9dbf398fa2f 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -27,8 +27,8 @@ namespace at { -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -58,7 +58,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { AT_ERROR("unsafeTensorFromTH not supported on sparse"); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 90dbbb810ee30d..ff154971e7bffb 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -38,10 +38,10 @@ Tensor Type::copy(const Tensor & src, bool non_blocking) const { } Type & Type::toBackend(Backend b) const { - return context->getType(b,scalarType()); + return at::globalContext().getType(b,scalarType()); } Type & Type::toScalarType(ScalarType s) const { - return context->getType(backend(),s); + return at::globalContext().getType(backend(),s); } static std::vector defaultStrides(IntList sizes) { std::vector strides(sizes.size()); diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 884bd3a3bdff76..10c52ac14b6975 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -45,8 +45,8 @@ enum class TypeID { }; struct AT_API Type { - explicit Type(Context* context, TensorTypeId type_id, bool is_variable, bool is_undefined) - : context(context), type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} 
+ explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} virtual ~Type() {} virtual ScalarType scalarType() const = 0; virtual Backend backend() const = 0; @@ -79,8 +79,6 @@ struct AT_API Type { Type & cuda() const { return this->toBackend(at::backendToCUDA(this->backend())); } - Context& get_context() const { return *context; } - // contiguous IDs for all types in the system // for external dispatch virtual TypeID ID() const = 0; @@ -111,7 +109,6 @@ struct AT_API Type { // virtual Tensor * add(Tensor & a, Tensor & b) = 0; ${type_method_declarations} protected: - Context* context; TensorTypeId type_id_; bool is_variable_; bool is_undefined_; diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index fbafed82b57e02..4335a8f2209a20 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -38,8 +38,8 @@ static int getPointerDevice(void* ptr) { } #endif -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -99,7 +99,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { return Storage((${THStorage}*) th_pointer); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index e8613b62a333be..ec08e1a336daf6 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -16,7 +16,7 @@ namespace at { struct ${Type} final : public Type { - explicit ${Type}(Context* context); + explicit ${Type}(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 89101a24714b72..244606ca7938d7 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -43,7 +43,7 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { VariableType::VariableType(Context* context, Type* baseType) - : Type(context, baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) + : Type(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) , baseType(baseType) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; From f687ff5a59f18120617b6ce0c45d4335f0ff65ab Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 13:58:53 -0700 Subject: [PATCH 19/42] Delete unnecessary includes from TensorImpl.h (#11005) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11005 Reviewed By: smessmer Differential Revision: D9558300 Pulled By: ezyang fbshipit-source-id: ebebb3c6d3a1a2f7cc3da9fe9d3c56310ead46e1 --- aten/src/ATen/TensorImpl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 30b34cabec769f..8976acb6a40904 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -3,8 +3,6 @@ #include #include -#include "ATen/Retainable.h" -#include "ATen/StorageImpl.h" #include "ATen/Storage.h" #include "ATen/core/optional.h" #include "ATen/core/TensorTypeId.h" From e9eed8edb438ec5ed6b950225a6d315e30da4b70 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 29 Aug 2018 14:09:24 -0700 Subject: [PATCH 20/42] Add doc for Tensor.digamma_? (#11008) Summary: follow up for #10967 zou3519 vishwakftw Pull Request resolved: https://github.com/pytorch/pytorch/pull/11008 Differential Revision: D9559889 Pulled By: SsnL fbshipit-source-id: a05d8fbad92a54bcdb93de6e62a7f94180da1d99 --- test/test_torch.py | 2 -- torch/_tensor_docs.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index 5167ac618bba75..167a400ec91473 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -179,8 +179,6 @@ def test_namespace(ns, *skips): 'as_strided_', re.compile('^clamp_(min|max)_?$'), 'coalesce', - 'digamma', - 'digamma_', 'index_put', 'is_coalesced', 'is_distributed', diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 39d14668958c94..0a76a89a20d55a 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -650,6 +650,20 @@ def add_docstr_all(method, docstr): See :func:`torch.diagonal` """) +add_docstr_all('digamma', + r""" +digamma() -> Tensor + +See :func:`torch.digamma` +""") + +add_docstr_all('digamma_', + r""" +digamma_() -> Tensor + +In-place version of :meth:`~Tensor.digamma` +""") + add_docstr_all('dim', r""" dim() -> int From 0b1de747329250fbb290411d819bf10edbc1b858 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 14:22:23 -0700 Subject: [PATCH 21/42] Documentation improvement in caffe2/core/tensor.h (#11006) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11006 Reviewed By: smessmer Differential Revision: D9558383 Pulled By: ezyang fbshipit-source-id: 7d36fb69a6e8a7d064da2c8796dc263a9fd4e094 --- caffe2/core/tensor.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 23740cfc5772e5..21dc126c7f2c0b 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -25,7 +25,7 @@ inline vector ToVectorTIndex(const std::vector& src) { } /** - * Return product of all dimensions starting from K + * Return product of all dimensions starting from k */ inline TIndex size_from_dim_(int k, const vector& dims) { TIndex r = 1; @@ -35,7 +35,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { return r; } -// Product of all dims up to +// Product of all dims up to k (not including dims[k]) inline TIndex size_to_dim_(int k, const vector& dims) { CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; @@ -61,6 +61,7 @@ inline TIndex size_between_dim_(int k, int l, const vector& dims) { return r; } +// Wrap around axis_index if it is negative, s.t., -1 is the last dim inline int canonical_axis_index_(int axis_index, int ndims) { CAFFE_ENFORCE_GE(axis_index, -ndims); CAFFE_ENFORCE_LT(axis_index, ndims); From 6a8bc3804ac72f7c946038289b088e50a2672891 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Wed, 29 Aug 2018 14:25:49 -0700 Subject: [PATCH 22/42] Add flush to logging messages higher than INFO. (#10983) Summary: This probably fixes the logging test error that orionr is encountering - haven't tested locally but wanted to send out a PR to kick off CI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10983 Reviewed By: ezyang Differential Revision: D9552607 Pulled By: Yangqing fbshipit-source-id: 9ac019031ffd9c03972144df04a836e5dcdafe02 --- caffe2/core/logging.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index ec7518630c9525..a394f91c729b87 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -236,6 +236,12 @@ MessageLogger::~MessageLogger() { if (severity_ >= FLAGS_caffe2_log_level) { // If not building on Android, log all output to std::cerr. std::cerr << stream_.str(); + // Simulating the glog default behavior: if the severity is above INFO, + // we flush the stream so that the output appears immediately on std::cerr. + // This is expected in some of our tests. 
+ if (severity_ > INFO) { + std::cerr << std::flush; + } } #endif // ANDROID if (severity_ == FATAL) { From 22e3b2c9c369c5fb44476eb538fa0a308df94eff Mon Sep 17 00:00:00 2001 From: Zhanibek Datbayev Date: Wed, 29 Aug 2018 14:37:39 -0700 Subject: [PATCH 23/42] Revert D9413150: [New Checkpoint] Kill the dummy TaskOutput when task.get_step() Differential Revision: D9413150 Original commit changeset: 51aaf3201e26 fbshipit-source-id: ac7c4c0960db03f344fe3eb2ad7f0e034db2371a --- caffe2/python/checkpoint_test.py | 4 +-- caffe2/python/core_test.py | 4 +-- caffe2/python/task.py | 48 +++++++++++++++++++------------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py index afba3dddcd5aae..a91bbf9910e29a 100644 --- a/caffe2/python/checkpoint_test.py +++ b/caffe2/python/checkpoint_test.py @@ -161,9 +161,9 @@ def test_ckpt_name_and_load_model_from_ckpts(self): num_epochs = job_runner.train(session) self.assertEquals(num_epochs, len(EXPECTED_TOTALS)) - # There are 15 global blobs after finishing up the job runner. + # There are 17 global blobs after finishing up the job runner. # (only blobs on init_group are checkpointed) - self.assertEquals(len(ws.blobs), 15) + self.assertEquals(len(ws.blobs), 17) ws = workspace.C.Workspace() session = LocalSession(ws) diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index d989471a16bab2..7120843f33152d 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -533,8 +533,8 @@ def test_create_plan_from_proto_correctly(self): self.assertEqual(len(plan.Steps()), 1) self.assertEqual(len(test_plan.Steps()), 1) - self.assertEqual(len(plan.Proto().network), 8) - self.assertEqual(len(test_plan.Proto().network), 8) + self.assertEqual(len(plan.Proto().network), 9) + self.assertEqual(len(test_plan.Proto().network), 9) self.assertEqual(len(plan.Proto().execution_step), 1) self.assertEqual(len(test_plan.Proto().execution_step), 1) self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name()) diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 5aafdf63c3b28a..311211dfdff3ee 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -150,7 +150,7 @@ def add_setup_steps(step, init_nets, exit_nets, name): if init_nets: steps.append(core.execution_step('%s:init' % name, init_nets)) steps.append(step) - if exit_nets: + if len(exit_nets) > 0: steps.append(core.execution_step('%s:exit' % name, exit_nets)) return core.execution_step(name, steps) @@ -215,11 +215,10 @@ def add(self, task): self._tasks.append(task) def tasks(self): - if not self._already_used: - for task in self._tasks_to_add: - self.add(task) - self._tasks_to_add = [] - self._already_used = True + for task in self._tasks_to_add: + self.add(task) + self._tasks_to_add = [] + self._already_used = True return self._tasks def num_registered_tasks(self): @@ -228,7 +227,7 @@ def num_registered_tasks(self): def used_nodes(self): # use list to keep order used = [] - for task in self.tasks(): + for task in self._tasks + self._tasks_to_add: if task.node not in used: used.append(task.node) return used @@ -260,8 +259,9 @@ def tasks_by_node(self, node_remap=None): # tasks_by_node can't be called twice because the setup won't # work properly a second time. 
node_map = {} - for node in self.used_nodes(): - node_map[node] = node_remap(node) if node_remap else node + for task in self.tasks(): + node_map[task.node] =\ + node_remap(task.node) if node_remap else task.node if self._tasks_by_node is not None: tasks_by_node, prev_node_map = self._tasks_by_node assert prev_node_map == node_map, ( @@ -285,7 +285,11 @@ def tasks_by_node(self, node_remap=None): grouped_by_node = TaskGroup() for node, tasks in viewitems(tasks_by_node): report_steps = report_steps_by_node[node] - + node_inits, node_exits = get_setup_nets( + TaskGroup.LOCAL_SETUP, + [t.get_step() for t in tasks] + report_steps, + self) + # shortcut for single task with no queue steps = report_steps outputs = [] grouped_workspace_type = WorkspaceType.PRIVATE @@ -307,15 +311,16 @@ def tasks_by_node(self, node_remap=None): else: step = core.execution_step( '%s:body' % node, steps, concurrent_substeps=True) - - # Prepend and append setup nets. - node_inits, node_exits = get_setup_nets( - TaskGroup.LOCAL_SETUP, - [t.get_step() for t in tasks] + report_steps, - self, - ) - step = add_setup_steps(step, node_inits, node_exits, node) - + if len(node_inits) > 0 or len(node_exits) > 0: + steps = [] + if len(node_inits) > 0: + steps.append( + core.execution_step('%s:init' % node, node_inits)) + steps.append(step) + if len(node_exits) > 0: + steps.append( + core.execution_step('%s:exit' % node, node_exits)) + step = core.execution_step(node, steps) Task( node=node, step=step, outputs=outputs, name='grouped_by_node', @@ -577,6 +582,11 @@ def get_step(self): Task.TASK_SETUP, [self._step] + report_steps, self) instance_init_nets, instance_exit_nets = get_setup_nets( Task.TASK_INSTANCE_SETUP, [self._step] + report_steps, self) + if len(self._outputs) == 0: + output_net = core.Net('%s:output' % self.name) + self.add_output(output_net.ConstantFill( + [], 1, dtype=core.DataType.INT32, value=0)) + task_exit_nets.append(output_net) # Add instance-level report steps body = self._step if not report_steps else core.execution_step( From 89834dfe647d246f5bd3549a884e31ae602a25bd Mon Sep 17 00:00:00 2001 From: Tommy Yu Date: Wed, 29 Aug 2018 14:47:36 -0700 Subject: [PATCH 24/42] Add GPU version of HardSigmoid Op to Caffe2 (#10955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10955 Add GPU version of HardSigmoid Op to Caffe2. Updated test file to include GPU tests. 
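
For reference, a rough Python sketch of exercising the new CUDA kernel through the usual Caffe2 workflow (this assumes a CUDA build; the blob names and the alpha/beta values are made up for illustration, not taken from the op's defaults):

```python
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

# HardSigmoid computes Y = max(0, min(1, alpha * X + beta)); run it on GPU 0.
device = core.DeviceOption(caffe2_pb2.CUDA, 0)
op = core.CreateOperator(
    "HardSigmoid", ["X"], ["Y"],
    alpha=0.2, beta=0.5,  # illustrative values
    device_option=device)

workspace.FeedBlob("X", np.random.randn(4, 8).astype(np.float32), device)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y"))
```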
Reviewed By: enosair Differential Revision: D9499353 fbshipit-source-id: fcb51902063d0c3e4b10354533a8a42cf827c545 --- caffe2/operators/hard_sigmoid_op.cu | 91 +++++++++++++++++++ .../operator_test/elementwise_ops_test.py | 2 +- 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 caffe2/operators/hard_sigmoid_op.cu diff --git a/caffe2/operators/hard_sigmoid_op.cu b/caffe2/operators/hard_sigmoid_op.cu new file mode 100644 index 00000000000000..ed3a4ec8286888 --- /dev/null +++ b/caffe2/operators/hard_sigmoid_op.cu @@ -0,0 +1,91 @@ +#include "caffe2/operators/hard_sigmoid_op.h" + +#include +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +__global__ void HardSigmoidCUDAKernel( + const int N, + const T alpha, + const T beta, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + Y[i] = max(T(0), min(T(1), alpha * __ldg(X + i) + beta)); +#else + Y[i] = max(T(0), min(T(1), alpha * X[i] + beta)); +#endif + } +} + +template +__global__ void HardSigmoidGradientCUDAKernel( + const int N, + const T alpha, + const T* dY, + const T* Y, + T* dX) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = (__ldg(Y + i) > T(0) && __ldg(Y + i) < T(1)) ? __ldg(dY + i) * alpha + : T(0); +#else + dX[i] = (Y[i] > T(0) && Y[i] < T(1)) ? dY[i] * alpha : T(0); +#endif + } +} + +} // namespace + +template <> +template +bool HardSigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + HardSigmoidCUDAKernel + <<cuda_stream()>>>(N, alpha, beta, X, Y); + return true; +} + +template <> +template +bool HardSigmoidGradientFunctor::Forward( + const std::vector& Y_dims, + const std::vector& /* dY_dims */, + const T* Y, + const T* dY, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies()); + HardSigmoidGradientCUDAKernel + <<cuda_stream()>>>(size, alpha, dY, Y, dX); + return true; +} + +REGISTER_CUDA_OPERATOR( + HardSigmoid, + UnaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidFunctor>); +REGISTER_CUDA_OPERATOR( + HardSigmoidGradient, + BinaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidGradientFunctor>); + +} // namespace caffe2 diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 0e590307a88858..c20aad4218f17e 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -338,7 +338,7 @@ def sigmoid_ref(X): alpha=st.floats(min_value=-100.0, max_value=100.0), beta=st.floats(min_value=-100.0, max_value=100.0), engine=st.sampled_from([""]), - **hu.gcs_cpu_only) + **hu.gcs) def test_hard_sigmoid(self, X, inplace, alpha, beta, engine, gc, dc): # Prevent alpha and beta from mutually being 0 to avoid a division # error when adjusting our inputs From c755616e006efe011726105e2d7a1d7502c989b9 Mon Sep 17 00:00:00 2001 From: jgong5 Date: Wed, 29 Aug 2018 14:56:55 -0700 Subject: [PATCH 25/42] Enable Detectron model inference for CPU and MKL-DNN paths (#10157) Summary: 1. Support ops needed for inference of Faster-RCNN/Mask-RCNN needed in Detectron, mostly direct fallbacks. 2. Use CPU device to hold 0-dim tensors and integer tensors in both fallback op and blob feeder, needed by Detectron models. 3. Ignore 0-dim tensor in MKL-DNN concat operator. 4. Generate dynamic library of Detectron module for CPU device. This PR obsoletes #9164. 
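
As a sketch of how the new fallbacks get exercised from Python (assuming a build with MKL-DNN/IDEEP enabled; the blob names and shapes are illustrative and loosely mirror the new operator_fallback_op_test):

```python
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

# AffineChannel has no native IDEEP kernel, so it runs through the CPU
# fallback while its inputs/outputs stay on the IDEEP device. It is run
# in place here, as in the new test, to cover the fallback in-place path.
device = core.DeviceOption(caffe2_pb2.IDEEP)
op = core.CreateOperator(
    "AffineChannel", ["X", "scale", "bias"], ["X"],
    is_learnable=False, device_option=device)

workspace.FeedBlob("X", np.random.rand(1, 3, 8, 8).astype(np.float32), device)
workspace.FeedBlob("scale", np.random.rand(3).astype(np.float32), device)
workspace.FeedBlob("bias", np.random.rand(3).astype(np.float32), device)
workspace.RunOperatorOnce(op)
```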
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10157 Differential Revision: D9276837 Pulled By: yinghai fbshipit-source-id: dc364932ae4a2e7fcefdee70b5fce3c0cee91b6f --- caffe2/ideep/operators/concat_split_op.cc | 12 +- .../operators/operator_fallback_ideep.cc | 8 + .../ideep/operators/operator_fallback_ideep.h | 63 ++++---- .../python/ideep/operator_fallback_op_test.py | 99 ++++++++++++ caffe2/python/pybind_state_ideep.cc | 141 ++++++++++-------- modules/detectron/CMakeLists.txt | 4 + modules/detectron/batch_permutation_op.cc | 10 ++ modules/detectron/upsample_nearest_op.cc | 9 ++ modules/detectron/upsample_nearest_op.h | 46 +++++- 9 files changed, 299 insertions(+), 93 deletions(-) create mode 100644 caffe2/python/ideep/operator_fallback_op_test.py diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index eb2d5b6acf1a61..25d4e16d2f9e7a 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -25,13 +25,21 @@ class IDEEPConcatOp final : public IDEEPOperator { virtual ~IDEEPConcatOp() {} bool RunOnDevice() override { - const auto& input_zero = Input(INPUT0); auto* output = Output(OUTPUT); TensorCPU* axis_info = OperatorBase::Output(AXIS_INFO, CPU); vector inputs; for (int i = 0; i < InputSize(); ++i) { - inputs.emplace_back(Input(i)); + if (OperatorBase::InputBlob(i).template IsType()) { + inputs.emplace_back(Input(i)); + } else { + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + "Expect cpu tensor if not itensor"); + auto& tensor_cpu = OperatorBase::Input(i, CPU); + CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || + tensor_cpu.size_from_dim(0) == 0, + "Expect zero dim tensor"); + } } auto axis_vdata = ideep::concat::compute(inputs, axis_, add_axis_, *output); diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index 8251b386eeb3c7..75895c5d844345 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -116,6 +118,12 @@ REGISTER_IDEEP_OPERATOR( REGISTER_IDEEP_OPERATOR( BBoxTransform, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + AffineChannel, + IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + StopGradient, + IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( PadImage, diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index ae4f903c23c2fc..31df729a217850 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -53,6 +53,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { // then forward output blobs to local workspace. std::unordered_map forwarded_output_blobs; for (int i = 0; i < base_def_.output_size(); i++) { + // For in-place case, the in/output tensor for local_ws must be + // re-created, instead of forwarding from current workspace. 
string parent_name(base_def_.output(i)); if (!SkipOutputCopy::Contains(i)) { parent_name += "_cpu_output_blob_" + base_def_.type(); @@ -60,6 +62,13 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_output_blobs_.push_back(ws->CreateBlob(parent_name)); CHECK_NOTNULL(local_output_blobs_.back()); forwarded_output_blobs[base_def_.output(i)] = parent_name; + output_inplace_.push_back(false); + for (const string &input_name : base_def_.input()) { + if (input_name == base_def_.output(i)) { + output_inplace_[i] = true; + break; + } + } } local_ws_.reset(new Workspace(ws, forwarded_output_blobs)); // Set up the symbols for the local workspace. @@ -67,31 +76,26 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_.push_back(local_ws_->CreateBlob(name)); CHECK_NOTNULL(local_input_blobs_.back()); } + input_share_.resize(local_input_blobs_.size(), false); base_op_.reset(new CPUOp(base_def_, local_ws_.get())); } bool RunOnDevice() override { for (int i = 0; i < InputSize(); ++i) { - if (InputIsType(i) && Input(i).get_data_type() == itensor::data_type::f32) { + if (InputIsType(i) && + Input(i).get_data_type() == itensor::data_type::f32) { auto& input = Input(i); - auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); - dtensor->Resize(input.get_dims()); - if (input.is_public_format()) { - dtensor->ShareExternalPointer(static_cast(input.get_data_handle())); - } else { - input.reorder_to(dtensor->template mutable_data()); + if (input_share_[i]) { + local_input_blobs_[i]->Reset(); } - } else if ( - InputIsType(i) && - Input(i).get_data_type() == itensor::data_type::s32) { - auto& input = Input(i); + input_share_[i] = false; auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( - static_cast(input.get_data_handle())); + static_cast(input.get_data_handle())); } else { - input.reorder_to(dtensor->template mutable_data()); + input.reorder_to(dtensor->template mutable_data()); } } else { VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy."; @@ -99,8 +103,9 @@ class IDEEPFallbackOp final : public IDEEPOperator { // local_input_blobs will only be used as const blob input for the // base op so we are still fine. local_input_blobs_[i]->ShareExternal( - const_cast(OperatorBase::Inputs()[i]->GetRaw()), + const_cast(OperatorBase::Inputs()[i]->GetRaw()), OperatorBase::Inputs()[i]->meta()); + input_share_[i] = true; } } @@ -120,21 +125,16 @@ class IDEEPFallbackOp final : public IDEEPOperator { "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); - auto src_dims = src.dims(); - if (src.ndim() == 0) { - VLOG(1) << "Copy output: index " << i << " skipped."; + if (src.template IsType() && + src.dims().size() != 0 && src.size_from_dim(0) != 0 && + base_op_->type() != "Python") { Blob* dst = OperatorBase::OutputBlob(i); - dst->Reset(new Tensor(CPU)); - auto dtensor = dst->GetMutableTensor(CPU); - dtensor->Resize(src_dims); - dtensor->ShareData(src); - continue; - } - - if (src.template IsType()) { - Blob* dst = OperatorBase::OutputBlob(i); - if (!dst->template IsType()) { + // The output tensor must be ideep tensor with public format. + // If reusing ideep tensor with non-public format, the tensor buffer + // will be interpreted incorrectly. 
+ if (!dst->template IsType() || + !dst->template Get().is_public_format()) { dst->Reset(new itensor()); } @@ -143,7 +143,12 @@ class IDEEPFallbackOp final : public IDEEPOperator { if (dtensor->get_dims() != dst_dims) { dtensor->resize(dst_dims, itensor::data_type::f32); } - dtensor->set_data_handle(const_cast(src.raw_data())); + if (output_inplace_[i]) { + dtensor->reorder_from(dst_dims, itensor::data_type::f32, + const_cast(src.raw_data())); + } else { + dtensor->set_data_handle(const_cast(src.raw_data())); + } } else { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); @@ -159,6 +164,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { protected: vector local_input_blobs_; vector local_output_blobs_; + vector output_inplace_; + vector input_share_; std::unique_ptr base_op_; std::unique_ptr local_ws_; OperatorDef base_def_; diff --git a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py new file mode 100644 index 00000000000000..19bdbaac8a217e --- /dev/null +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -0,0 +1,99 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import hypothesis.strategies as st +from hypothesis import given +import numpy as np +from caffe2.python import core, workspace +from caffe2.proto import caffe2_pb2 +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class TestFallbackOps(hu.HypothesisTestCase): + @given(stride=st.integers(1, 3), + pad=st.integers(0, 3), + kernel=st.integers(3, 5), + size=st.integers(8, 10), + input_channels=st.integers(1, 3), + output_channels=st.integers(1, 5), + batch_size=st.integers(1, 3), + use_bias=st.booleans(), + **mu.gcs) + def test_in_place(self, stride, pad, kernel, size, + input_channels, output_channels, + batch_size, use_bias, gc, dc): + # To expose fallback in-place potential issue, the fallback op + # following ideep op must be run at least two iterations. 
+ conv = core.CreateOperator( + "Conv", + ["X", "w", "b"] if use_bias else ["X", "w"], + ["Y"], + stride=stride, + pad=pad, + kernel=kernel, + device_option=dc[0] + ) + X = np.random.rand( + batch_size, input_channels, size, size).astype(np.float32) - 0.5 + w = np.random.rand(output_channels, input_channels, kernel, kernel) \ + .astype(np.float32) - 0.5 + b = np.random.rand(output_channels).astype(np.float32) - 0.5 + + old_ws_name = workspace.CurrentWorkspace() + workspace.SwitchWorkspace("_device_check_", True) + workspace.FeedBlob('X', X, dc[0]) + workspace.FeedBlob('w', w, dc[0]) + workspace.FeedBlob('b', b, dc[0]) + workspace.RunOperatorOnce(conv) + Y = workspace.FetchBlob('Y') + + scale = np.random.randn(Y.shape[1]).astype(np.float32) + bias = np.random.randn(Y.shape[1]).astype(np.float32) + ac = core.CreateOperator( + "AffineChannel", + ["Y", "scale", "bias"], + ["Y"], + is_learnable=False, + device_option=dc[0] + ) + workspace.FeedBlob('scale', scale, dc[0]) + workspace.FeedBlob('bias', bias, dc[0]) + workspace.RunOperatorOnce(ac) + workspace.RunOperatorOnce(conv) + workspace.RunOperatorOnce(ac) + Y0 = workspace.FetchBlob('Y') + + workspace.ResetWorkspace() + dev_net = caffe2_pb2.NetDef() + conv_dev = caffe2_pb2.OperatorDef() + conv_dev.CopyFrom(conv) + conv_dev.device_option.CopyFrom(dc[1]) + ac_dev = caffe2_pb2.OperatorDef() + ac_dev.CopyFrom(ac) + ac_dev.device_option.CopyFrom(dc[1]) + dev_net.op.extend([conv_dev, ac_dev]) + workspace.FeedBlob('X', X, dc[1]) + workspace.FeedBlob('w', w, dc[1]) + workspace.FeedBlob('b', b, dc[1]) + workspace.FeedBlob('scale', scale, dc[1]) + workspace.FeedBlob('bias', bias, dc[1]) + workspace.RunNetOnce(dev_net) + workspace.RunNetOnce(dev_net) + Y1 = workspace.FetchBlob('Y') + + if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): + print(Y1.flatten()) + print(Y0.flatten()) + print(np.max(np.abs(Y1 - Y0))) + self.assertTrue(False) + + workspace.SwitchWorkspace(old_ws_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index 668c812cd8e1a8..056558c9a73335 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -9,6 +9,7 @@ #include #include +#include "caffe2/ideep/operators/operator_fallback_ideep.h" #include namespace caffe2 { @@ -19,42 +20,42 @@ USE_IDEEP_DEF_ALIASES(); class IDeepFetcher; class IDeepFeeder; -REGISTER_BLOB_FETCHER((TypeMeta::Id()),IDeepFetcher); +REGISTER_IDEEP_OPERATOR(Python, IDEEPFallbackOp>); + +REGISTER_BLOB_FETCHER((TypeMeta::Id()), IDeepFetcher); REGISTER_BLOB_FEEDER(IDEEP, IDeepFeeder); class IDeepFetcher : public BlobFetcherBase { TypeMeta type_transform(const itensor &atensor) { - switch(atensor.get_data_type()) { - case itensor::data_type::f32: - return TypeMeta::Make(); - case itensor::data_type::s16: - return TypeMeta::Make(); - case itensor::data_type::s32: - return TypeMeta::Make(); - case itensor::data_type::s8: - return TypeMeta::Make(); - case itensor::data_type::u8: - return TypeMeta::Make(); - default: - // Should we throw exception? - return TypeMeta(); + switch (atensor.get_data_type()) { + case itensor::data_type::f32: + return TypeMeta::Make(); + case itensor::data_type::s32: + return TypeMeta::Make(); + case itensor::data_type::s8: + return TypeMeta::Make(); + case itensor::data_type::u8: + return TypeMeta::Make(); + default: + // Should we throw exception? 
+ return TypeMeta(); } } - public: - pybind11::object Fetch(const Blob& blob) override { +public: + pybind11::object Fetch(const Blob &blob) override { try { return FetchTensor(blob.Get(), true).obj; - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; throw; } } - FetchedBlob FetchTensor(const itensor& atensor, bool force_copy) { + FetchedBlob FetchTensor(const itensor &atensor, bool force_copy) { FetchedBlob result; CAFFE_ENFORCE(atensor.materialized(), - "Trying to fetch uninitialized tensor"); + "Trying to fetch uninitialized tensor"); const int numpy_type = CaffeToNumpyType(type_transform(atensor)); CAFFE_ENFORCE( numpy_type != -1, @@ -64,17 +65,16 @@ class IDeepFetcher : public BlobFetcherBase { std::vector npy_dims(dims.begin(), dims.end()); result.copied = force_copy || atensor.need_reorder(); - void* outPtr; + void *outPtr; if (result.copied) { result.obj = py::reinterpret_steal( PyArray_SimpleNew(atensor.ndims(), npy_dims.data(), numpy_type)); outPtr = static_cast( - PyArray_DATA(reinterpret_cast(result.obj.ptr()))); + PyArray_DATA(reinterpret_cast(result.obj.ptr()))); } else { outPtr = atensor.get_data_handle(); - result.obj = py::reinterpret_steal( - PyArray_SimpleNewFromData( - atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); + result.obj = py::reinterpret_steal(PyArray_SimpleNewFromData( + atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); } if (numpy_type == NPY_OBJECT) { @@ -95,8 +95,6 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::f32; else if (meta == TypeMeta::Make()) return itensor::data_type::s32; - else if (meta == TypeMeta::Make()) - return itensor::data_type::s16; else if (meta == TypeMeta::Make()) return itensor::data_type::s8; else if (meta == TypeMeta::Make()) @@ -105,53 +103,74 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::data_undef; } - public: - void FeedTensor( - const DeviceOption& option, - PyArrayObject *original_array, - itensor *tensor) { - PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); - auto g = MakeGuard([&]() {Py_XDECREF(array); }); - - const auto npy_type = PyArray_TYPE(array); - const TypeMeta& meta = NumpyTypeToCaffe(npy_type); - CAFFE_ENFORCE( - meta.id() != TypeIdentifier::uninitialized(), +public: + void FeedTensor( + const DeviceOption &option, + PyArrayObject *original_array, + itensor *tensor) { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + CAFFE_ENFORCE_NE( + meta.id(), + TypeIdentifier::uninitialized(), "This numpy data type is not supported: ", - PyArray_TYPE(array), - "."); + PyArray_TYPE(array), "."); - int ndim = PyArray_NDIM(array); - npy_intp* npy_dims = PyArray_DIMS(array); + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); - itensor::dims adims; - for (int i = 0; i < ndim; i++) { - adims.push_back(static_cast( - npy_dims[i])); - } + itensor::dims adims; + for (int i = 0; i < ndim; i++) { + adims.push_back(static_cast(npy_dims[i])); + } - switch (npy_type) { + switch (npy_type) { case NPY_OBJECT: case NPY_UNICODE: CAFFE_THROW("IDeep doesn't support string"); break; default: auto type = type_transform(meta); - tensor->resize(adims, type); + if (tensor->get_dims() != adims || type != tensor->get_data_type()) { + tensor->resize(adims, type); + } 
tensor->reorder_from(adims, type, - static_cast(PyArray_DATA(array))); - } - } + static_cast(PyArray_DATA(array))); + } + } - void Feed(const DeviceOption& option, PyArrayObject* original_array, - Blob* blob) { - try { + bool ZeroDim(PyArrayObject *array) { + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); + return ndim == 0 || + std::find(npy_dims, npy_dims + ndim, 0) != npy_dims + ndim; + } + + void Feed(const DeviceOption &option, PyArrayObject *original_array, + Blob *blob) { + try { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + // TODO: if necessary, use dispatcher. + if (meta.Match() && !ZeroDim(original_array)) { FeedTensor(option, original_array, blob->GetMutable()); - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; - throw; + } else { + DeviceOption cpu_option(option); + cpu_option.set_device_type(DeviceType::CPU); + TensorFeeder cpu_tensor_feeder; + cpu_tensor_feeder.FeedTensor(cpu_option, original_array, + blob->GetMutableTensor(CPU)); } - } + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; + throw; + } + } }; } // namespace python diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt index f18077b829427b..1791ca27a98590 100644 --- a/modules/detectron/CMakeLists.txt +++ b/modules/detectron/CMakeLists.txt @@ -11,4 +11,8 @@ if (USE_CUDA) target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) +elseif(NOT IOS_PLATFORM) + add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) + target_link_libraries(caffe2_detectron_ops caffe2) + install(TARGETS caffe2_detectron_ops DESTINATION lib) endif() diff --git a/modules/detectron/batch_permutation_op.cc b/modules/detectron/batch_permutation_op.cc index f92d7dd236d758..032288f811de08 100644 --- a/modules/detectron/batch_permutation_op.cc +++ b/modules/detectron/batch_permutation_op.cc @@ -15,9 +15,19 @@ */ #include "batch_permutation_op.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + BatchPermutation, + IDEEPFallbackOp>); +#endif + REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp); REGISTER_CPU_OPERATOR( BatchPermutationGradient, diff --git a/modules/detectron/upsample_nearest_op.cc b/modules/detectron/upsample_nearest_op.cc index b668701b4ce4f4..4fc4d6dcd93a31 100644 --- a/modules/detectron/upsample_nearest_op.cc +++ b/modules/detectron/upsample_nearest_op.cc @@ -15,8 +15,17 @@ */ #include "upsample_nearest_op.h" +#ifdef CAFFE2_USE_IDEEP +#include "caffe2/ideep/operators/operator_fallback_ideep.h" +#include "caffe2/ideep/utils/ideep_operator.h" +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + UpsampleNearest, + IDEEPFallbackOp>); +#endif REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp); REGISTER_CPU_OPERATOR( diff --git a/modules/detectron/upsample_nearest_op.h b/modules/detectron/upsample_nearest_op.h index e24d705bc14afd..17f77855509e67 100644 --- a/modules/detectron/upsample_nearest_op.h +++ b/modules/detectron/upsample_nearest_op.h @@ -35,8 +35,50 @@ class UpsampleNearestOp final : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; + auto translate_idx = [](int ii, int d1, int d2, int 
d3, int scale_factor) { + int x, y, z, w; + w = ii % d3; + ii = ii/d3; + z = ii % d2; + ii = ii/d2; + y = ii % d1; + ii = ii/d1; + x = ii; + w = w/scale_factor; + z = z/scale_factor; + d2 /= scale_factor; + d3 /= scale_factor; + return (((x*d1+y)*d2)+z)*d3+w; + }; + + auto& X = Input(0); + auto* Y = Output(0); + auto out_shape = X.dims(); + out_shape[X.ndim() - 1] *= scale_; + out_shape[X.ndim() - 2] *= scale_; + Y->Resize(out_shape); + + int d1; + int d2; + int d3; + if (X.ndim() == 3) { + d1 = Y->dim32(0); + d2 = Y->dim32(1); + d3 = Y->dim32(2); + } else { + d1 = Y->dim32(1); + d2 = Y->dim32(2); + d3 = Y->dim32(3); + } + + const T *input_data = X.template data(); + T *output_data = Y->template mutable_data(); + + for (int ii = 0; ii < Y->size(); ii++) { + int ipidx = translate_idx(ii, d1, d2, d3, scale_); + output_data[ii] = input_data[ipidx]; + } + return true; } protected: From d9b74f6540abd8e969a9abae279cbc9055140709 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Wed, 29 Aug 2018 14:58:13 -0700 Subject: [PATCH 26/42] Make it possible to disable JIT using env variables (#10867) Summary: zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/10867 Differential Revision: D9556882 Pulled By: apaszke fbshipit-source-id: 04c0ca875d15d37dd9ac05ac7b515cd899ddb7e4 --- test/test_jit.py | 22 +++++++ torch/jit/__init__.py | 130 ++++++++++++++++++++++++++---------------- 2 files changed, 102 insertions(+), 50 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index d9d345b0e8fce0..e99203333dc386 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -396,6 +396,28 @@ def fn(x, y): self.assertEqual(fn(x, y), fn_traced(x, y)) + def test_disabled(self): + torch.jit._enabled = False + try: + def f(x, y): + return x + y + + self.assertIs(torch.jit.trace(torch.randn(2, 2), torch.randn(2, 2))(f), f) + self.assertIs(torch.jit.script(f), f) + + class MyModule(torch.jit.ScriptModule): + @torch.jit.script_method + def method(self, x): + return x + + # XXX: Unfortunately ScriptModule won't simply become Module now, + # because that requires disabling the JIT at startup time, which + # we can't do in here. + # We need to or those two conditions to make it work with all versions of Python + self.assertTrue(inspect.ismethod(MyModule.method) or inspect.isfunction(MyModule.method)) + finally: + torch.jit._enabled = True + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 30904ac7adff7d..e0314acea4a173 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -21,6 +21,25 @@ import collections import re + +def _parse_env(name, default, true_message, false_message): + value = os.environ.get(name) + if value is None: + return default + if value.lower() in {'1', 'true', 'yes'}: + return True + elif value.lower() in {'0', 'false', 'no'}: + return False + if value == '1v': + print(true_message) + return True + elif value == '0v': + print(false_message) + return False + raise ValueError('Unknown setting of {}. Try using 0 or 1.'.format(name)) + + +_enabled = _parse_env('PYTORCH_JIT', True, "> Using PyTorch JIT", "> PyTorch JIT DISABLED") _flatten = torch._C._jit_flatten _unflatten = torch._C._jit_unflatten _jit_script_compile = torch._C._jit_script_compile @@ -431,6 +450,8 @@ def trace(*args, **kwargs): ... 
return x * 2 """ def wrapper(func): + if not _enabled: + return func executor_options = {'optimize': True} for name in executor_options: executor_options[name] = kwargs.pop(name, executor_options[name]) @@ -509,6 +530,8 @@ def __getattr__(self, attr): def script(fn, optimize=True, _frames_up=0): + if not _enabled: + return fn rcb = createResolutionCallback(_frames_up + 1) ast = get_jit_ast(fn, is_method=False) graph = _jit_script_compile(ast, rcb) @@ -528,6 +551,8 @@ def script(fn, optimize=True, _frames_up=0): def script_method(fn): + if not _enabled: + return fn # NOTE: we need to traverse two frames here because the meta-class frame # for ScriptModule will be present, as opposed to invoking @script on a # a function or invoking define() on a CompilationUnit. @@ -547,6 +572,8 @@ def script_method(fn): def batch(batch_size=1, optimize=True, _frames_up=0): def decorator(fn): + if not _enabled: + return fn import torch.jit.batchop mod = script(fn, optimize, _frames_up) res_graph = torch.to_batch_graph(mod.graph) @@ -757,57 +784,60 @@ def init_then_register(self, *args, **kwargs): return super(ScriptMeta, cls).__init__(name, bases, attrs) -class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): - def __init__(self, optimize=True): - # must be before Module.init since the field is used in __getattr__ - Module.__init__(self) - self._set_optimized(optimize) - self._parameters = OrderedParameterDict(self) - self._buffers = OrderedBufferDict(self) - self._modules = OrderedModuleDict(self) - - def __getattr__(self, attr): - if self._has_method(attr): - if attr in self.__class__._original_methods: - original_method = self.__class__._original_methods[attr] - script_method = self._get_method(attr) - return functools.wraps(original_method)(script_method) +if _enabled: + class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): + def __init__(self, optimize=True): + # must be before Module.init since the field is used in __getattr__ + Module.__init__(self) + self._set_optimized(optimize) + self._parameters = OrderedParameterDict(self) + self._buffers = OrderedBufferDict(self) + self._modules = OrderedModuleDict(self) + + def __getattr__(self, attr): + if self._has_method(attr): + if attr in self.__class__._original_methods: + original_method = self.__class__._original_methods[attr] + script_method = self._get_method(attr) + return functools.wraps(original_method)(script_method) + else: + return self._get_method(attr) + if attr == 'graph' and self._has_method('forward'): + return self.__getattr__('forward').graph + return Module.__getattr__(self, attr) + + def __setattr__(self, attr, value): + if attr not in self._constants_set: + return super(ScriptModule, self).__setattr__(attr, value) + if hasattr(self, attr): + raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) + if isinstance(value, ModuleList): + # special case for list of modules. Modules need to be registered with their + # parent module. To do this, we create a ConstModuleList, which is itself a module, that + # contains each of these modules as submodules. The ConstModuleList then + # is set as an attribute of the parent module. 
+ super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) + elif isinstance(value, Sequential): + super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) else: - return self._get_method(attr) - if attr == 'graph' and self._has_method('forward'): - return self.__getattr__('forward').graph - return Module.__getattr__(self, attr) - - def __setattr__(self, attr, value): - if attr not in self._constants_set: - return super(ScriptModule, self).__setattr__(attr, value) - if hasattr(self, attr): - raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) - if isinstance(value, ModuleList): - # special case for list of modules. Modules need to be registered with their - # parent module. To do this, we create a ConstModuleList, which is itself a module, that - # contains each of these modules as submodules. The ConstModuleList then - # is set as an attribute of the parent module. - super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) - elif isinstance(value, Sequential): - super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) - else: - super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) - - def __dir__(self): - return sorted(Module.__dir__(self) + self._method_names()) - - def define(self, lang): - # We use frames_up=1 to get to the proper surrounding scope. The stack - # will look like: - # 0. createResolutionCallback - # 1. define() - # 2. surrounding scope. - # - # createResolutionCallback internally adds 1 to get us to our frame, then - # we add 1 to get to the proper surrounding scope. - rcb = createResolutionCallback(frames_up=1) - self._define(lang, rcb, True) + super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) + + def __dir__(self): + return sorted(Module.__dir__(self) + self._method_names()) + + def define(self, lang): + # We use frames_up=1 to get to the proper surrounding scope. The stack + # will look like: + # 0. createResolutionCallback + # 1. define() + # 2. surrounding scope. + # + # createResolutionCallback internally adds 1 to get us to our frame, then + # we add 1 to get to the proper surrounding scope. 
+ rcb = createResolutionCallback(frames_up=1) + self._define(lang, rcb, True) +else: + ScriptModule = torch.nn.Module def _get_methods(cls): From 6b87198245c29a73a4203576c5b8cb33bd71418d Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 15:28:03 -0700 Subject: [PATCH 27/42] Devirtualize StorageImpl deconstructor (#11018) Summary: Further align at::StorageImpl with caffe2::StorageImpl Pull Request resolved: https://github.com/pytorch/pytorch/pull/11018 Reviewed By: ezyang Differential Revision: D9562256 Pulled By: cpuhrsch fbshipit-source-id: d929317f6226a1e2550b78034b723afbae343aaa --- aten/src/ATen/StorageImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index a9394d53935636..a18318790eec2b 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -21,7 +21,7 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl() = delete; - virtual ~StorageImpl() {}; + ~StorageImpl() {}; StorageImpl( at::DataType data_type, ptrdiff_t size, From ef7fc2a3e15e9ff4dd242e8137306418bfb52c06 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 16:00:34 -0700 Subject: [PATCH 28/42] Remove at::StorageImpl::finalizer_ (#11022) Summary: Unused member variable Pull Request resolved: https://github.com/pytorch/pytorch/pull/11022 Reviewed By: ezyang Differential Revision: D9562520 Pulled By: cpuhrsch fbshipit-source-id: af190b3ba06d33d65fa0fabffb34a0df769f38d0 --- aten/src/ATen/StorageImpl.cpp | 3 +-- aten/src/ATen/StorageImpl.h | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index af488472f24b5b..bc2d69a7aa8f5d 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,8 +12,7 @@ StorageImpl::StorageImpl( data_ptr_(std::move(data_ptr)), size_(size), resizable_(resizable), - allocator_(allocator), - finalizer_(nullptr) {} + allocator_(allocator) {} StorageImpl::StorageImpl( at::DataType data_type, diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index a18318790eec2b..f484cadbdac973 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -61,10 +61,6 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { } void release_resources() override { - if (finalizer_) { - (*finalizer_)(); - } - finalizer_ = nullptr; data_ptr_.clear(); } @@ -135,6 +131,5 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { ptrdiff_t size_; bool resizable_; at::Allocator* allocator_; - std::unique_ptr finalizer_; }; } // namespace at From 98d85b1790fb2dd7600de7b6f18b00e20d9750d5 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Wed, 29 Aug 2018 16:13:10 -0700 Subject: [PATCH 29/42] Debugging help + test Summary: When conversion fails, dump more information to help fix up the netdef Reviewed By: hyuen, yinghai Differential Revision: D9558667 fbshipit-source-id: 8917cc61c9be6285697e4f8395a9dbc7135f618e --- caffe2/opt/converter.cc | 22 +++++++++++++++++----- caffe2/python/transformations_test.py | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 6a8d22253444a5..80e2308eabf3cd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -322,14 +322,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mappushInstructionNode(opNode); } - CAFFE_ENFORCE( - 
externalInputNames.size() == 0, - "Attempting to convert an ill-formed network: \ - external_input contains unused blobs"); + if (externalInputNames.size()) { + std::ostringstream os; + for (const auto& inputName : externalInputNames) { + os << "\"" << inputName << "\" "; + } + + CAFFE_ENFORCE( + externalInputNames.size() == 0, + "Attempting to convert an ill-formed network: external_input contains ", + externalInputNames.size(), + " unused blobs: ", + os.str()); + } for (const auto& outputName : net.external_output()) { CAFFE_ENFORCE( - blobMap.count(outputName), "NetDef has ill-formed external_output"); + blobMap.count(outputName), + "NetDef has ill-formed external_output: \"", + outputName, + "\""); module.outputs.insert(blobMap[outputName]); } diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 2437933ae624eb..1a579b519fe09c 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -391,3 +391,17 @@ def test_transformer_FuseConv3DBN( rtol=1e-02, atol=1e-04 ) + + def test_converterEnforceUnusedInputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_input.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter + + def test_converterEnforceUnusedOutputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_output.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter From 2cc98d8df7365aa26eca555028035aef20da3088 Mon Sep 17 00:00:00 2001 From: pbialecki Date: Wed, 29 Aug 2018 16:24:16 -0700 Subject: [PATCH 30/42] Adds `dim` argument to `torch.unique` (#10423) Summary: Initial version of `unique` supporting a `dim` argument. As discussed in [this issue](https://github.com/pytorch/pytorch/issues/9997) I added the `dim` argument to `torch.unique` with the same behavior like [numpy](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.unique.html). Since the implementation is based on `std/thrust::unique`, the `tensor` always needs to be sorted. The `sorted` argument in `torch.unique` does not have any function, as in the CUDA version of the plain `torch.unique`. To check the performance and equal behavior between `torch.unique` and `np.unique`, I've used [this gist](https://gist.github.com/ptrblck/ac0dc862f4e1766f0e1036c252cdb105). Currently we achieve the following timings for an input of `x = torch.randint(2, (1000, 1000))`: (The values are calculated by taking the average of the times for both dimension) | Device | PyTorch (return_inverse=False) | Numpy (return_inverse=False) | PyTorch (return_inverse=True) | Numpy (return_inverse=True) | | --- | --- | --- | --- | --- | | CPU | ~0.007331s | ~0.022452s | ~0.011139s | ~0.044800s | | GPU | ~0.006154s | - | ~0.105373s | - | Many thanks to colesbury for the awesome mentoring and the valuable advices on the general implementation and performance issues! 
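
For illustration, a minimal usage sketch of the new keyword (the values in the comments are what this row-wise case produces; as noted above, the `dim` path always sorts):

```python
import torch

x = torch.tensor([[1, 3],
                  [2, 3],
                  [1, 3]])

# Deduplicate whole rows rather than individual elements.
rows = torch.unique(x, dim=0)
# tensor([[1, 3],
#         [2, 3]])

rows, inverse = torch.unique(x, return_inverse=True, dim=0)
# `inverse` maps each original row to its row in `rows`: tensor([0, 1, 0])
```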
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10423 Differential Revision: D9517289 Pulled By: soumith fbshipit-source-id: a4754f805223589c2847c98b8e4e39d8c3ddb7b5 --- aten/src/ATen/native/Unique.cpp | 84 +++++++++++++++++++ aten/src/ATen/native/cuda/Unique.cu | 97 ++++++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 5 ++ test/test_torch.py | 61 ++++++++++++++ torch/functional.py | 20 +++-- torch/tensor.py | 15 +++- 6 files changed, 273 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index d9bd94e1f7810b..d5ff300c0dd9e2 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -47,6 +47,82 @@ std::tuple _unique_cpu_template( } return std::make_tuple(output, inverse_indices); } + +template +ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last, + std::vector& indices, Tensor inverse_indices_vec) { + if (first == last) { + return last; + } + // save to calculate distance to iterators + ForwardIt begin = first; + + // set first inverse index + inverse_indices_vec[indices[0]] = 0; + + ForwardIt result = first; + while (++first != last) { + if (!at::equal(*result, *first) && ++result != first) { + *result = std::move(*first); + } + int64_t idx_result = std::distance(begin, result); + int64_t idx_first = std::distance(begin, first); + inverse_indices_vec[indices[idx_first]] = idx_result; + } + + return ++result; + } + +template +std::tuple _unique_dim_cpu_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + // reshape tensor as [dim, -1] + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + std::vector indices(input_flat.size(0)); + std::iota(indices.begin(), indices.end(), 0); + int64_t numel = input_flat.size(1); + scalar_t* input_flat_ptr = ((scalar_t*)input_flat.data_ptr()); + + // sort indices using data + std::sort(indices.begin(), indices.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = at::empty(input_flat.sizes(), input_flat.type()); + for (int i = 0; i < indices.size(); ++i) { + input_sorted[i] = input_flat[indices[i]]; + } + + Tensor inverse_indices = at::empty(indices.size(), self.type().toScalarType(kLong)); + std::vector input_unbind = at::unbind(input_sorted, 0); + auto last = _unique_dim_cpu_impl( + input_unbind.begin(), input_unbind.end(), indices, inverse_indices); + input_unbind.erase(last, input_unbind.end()); + + // reshape back + auto output = at::stack(input_unbind, 0); + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + return std::make_tuple(output, inverse_indices); +} } // namespace std::tuple @@ -56,5 +132,13 @@ _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { }); } +std::tuple +_unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + // The current implementation using `dim` always sorts due to unhashable tensors + return _unique_dim_cpu_template(self, dim, return_inverse); + }); +} + } // namespace native } 
// namespace at diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index f2e13b4c708b62..c29337f90f1347 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -69,6 +69,92 @@ template return std::tuple(output, inverse_indices); } + +template + std::tuple _unique_dim_cuda_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + scalar_t* input_flat_ptr = input_flat.data(); + + Tensor indices = at::arange(0, input_flat.size(0), self.type().toScalarType(kLong)); + int64_t* indices_ptr = indices.data(); + int64_t numel = input_flat.size(1); + + // sort indices using data + thrust::sort(policy, indices_ptr, indices_ptr + indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = input_flat.index_select(0, indices); + + // get unique tensors + scalar_t* input_sorted_ptr = input_sorted.data(); + Tensor input_sorted_indices = at::arange(0, input_sorted.size(0), self.type().toScalarType(kLong)); + int64_t* input_sorted_indices_ptr = input_sorted_indices.data(); + auto last = thrust::unique(policy, input_sorted_indices_ptr, input_sorted_indices_ptr + input_sorted_indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_sorted_ptr[i + a * numel]; + scalar_t rhs = input_sorted_ptr[i + b * numel]; + if (lhs != rhs) { + return false; + } + } + return true; + }); + input_sorted_indices.resize_(last - input_sorted_indices_ptr); + Tensor output = input_sorted.index_select(0, input_sorted_indices); + + // reshape back + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + // calculate inverse indices + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + if (return_inverse) { + int64_t size = self.size(dim); + inverse_indices.resize_(size); + Tensor mask = at::empty(input_sorted.size(0), self.type().toScalarType(kLong)); + mask[0] = 1; + for (int i = 0; i < input_sorted.size(0) - 1; ++i) { + if (!at::equal(input_sorted[i], input_sorted[i+1])) { + mask[i+1] = 1; + } else { + mask[i+1] = 0; + } + } + + Tensor imask = at::cumsum(mask, 0) - 1; + for (int i = 0; i < indices.size(0); ++i) { + inverse_indices[indices[i]] = imask[i]; + } + } + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, inverse_indices); + } } // namespace #endif @@ -86,5 +172,16 @@ _unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { #endif } +std::tuple +_unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + #ifndef __HIP_PLATFORM_HCC__ + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + return _unique_dim_cuda_template(self, dim, return_inverse); + }); + #else + AT_ERROR("unique_dim_cuda: HIP not supported"); + #endif +} + } // 
namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 466fe6c3134e84..cb194cd0c7bdee 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1748,6 +1748,11 @@ CPU: _unique_cpu CUDA: _unique_cuda +- func: _unique_dim(Tensor self, int64_t dim, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + dispatch: + CPU: _unique_dim_cpu + CUDA: _unique_dim_cuda + - func: _unsafe_view(Tensor self, IntList size) -> Tensor variants: function diff --git a/test/test_torch.py b/test/test_torch.py index 167a400ec91473..863f97ff1d20e3 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8485,6 +8485,67 @@ def test_unique(self): self.assertEqual(torch.ByteTensor([7, 42, 128, 133]), byte_unique) self.assertEqual(torch.LongTensor([3, 0, 0, 0, 1, 2]), byte_inverse) + def test_unique_dim(self): + def run_test(dtype=torch.float): + x = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_unique_dim0 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim0 = torch.tensor([0, 0]) + expected_unique_dim1 = torch.tensor([[[0., 1.], + [1., 1.], + [2., 1.]], + [[0., 1.], + [1., 1.], + [2., 1.]]], dtype=dtype) + expected_inverse_dim1 = torch.tensor([1, 0, 2, 0]) + expected_unique_dim2 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim2 = torch.tensor([0, 1]) + + # dim0 + x_unique = torch.unique(x, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + self.assertEqual(expected_inverse_dim0, x_inverse) + + # dim1 + x_unique = torch.unique(x, dim=1) + self.assertEqual(expected_unique_dim1, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=1) + self.assertEqual(expected_unique_dim1, x_unique) + self.assertEqual(expected_inverse_dim1, x_inverse) + + # dim2 + x_unique = torch.unique(x, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + self.assertEqual(expected_inverse_dim2, x_inverse) + + run_test(torch.float) + run_test(torch.double) + run_test(torch.long) + run_test(torch.uint8) + @staticmethod def _test_bincount(self, device): # negative input throws diff --git a/torch/functional.py b/torch/functional.py index 055141b7469a20..8c78b6efe9f80f 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -389,7 +389,7 @@ def isnan(tensor): return tensor != tensor -def unique(input, sorted=False, return_inverse=False): +def unique(input, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the input tensor as a 1-D tensor. 
Arguments: @@ -431,11 +431,19 @@ def unique(input, sorted=False, return_inverse=False): [ 1, 2]]) """ - output, inverse_indices = torch._unique( - input, - sorted=sorted, - return_inverse=return_inverse, - ) + if dim is not None: + output, inverse_indices = torch._unique_dim( + input, + dim, + sorted=sorted, + return_inverse=return_inverse + ) + else: + output, inverse_indices = torch._unique( + input, + sorted=sorted, + return_inverse=return_inverse, + ) if return_inverse: return output, inverse_indices else: diff --git a/torch/tensor.py b/torch/tensor.py index ed2f7f0c10a565..904d3a5eeb3760 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -319,13 +319,22 @@ def masked_fill(self, mask, value): """ return self.clone().masked_fill_(mask, value) - def unique(self, sorted=False, return_inverse=False): + def unique(self, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the tensor as a 1-D tensor. See :func:`torch.unique` """ - output, inverse_indices = self._unique( - sorted=sorted, return_inverse=return_inverse) + if dim is not None: + output, inverse_indices = self._unique_dim( + sorted=sorted, + return_inverse=return_inverse, + dim=dim + ) + else: + output, inverse_indices = self._unique( + sorted=sorted, + return_inverse=return_inverse + ) if return_inverse: return output, inverse_indices else: From c4e1adf29d0b22fa5ff0ea2206a22f4d035c36cb Mon Sep 17 00:00:00 2001 From: Roy Li Date: Wed, 29 Aug 2018 16:26:51 -0700 Subject: [PATCH 31/42] Remove THHalf type Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11010 Reviewed By: ezyang Differential Revision: D9561325 Pulled By: li-roy fbshipit-source-id: 053cf2925ec1fc458db31e92bd31ffd23389f3e8 --- aten/src/ATen/StorageImpl.h | 3 +-- aten/src/ATen/gen.py | 12 ---------- aten/src/TH/CMakeLists.txt | 1 - aten/src/TH/THHalf.h | 36 ++++++++---------------------- aten/src/TH/THStorageFunctions.hpp | 1 - aten/src/TH/THTypeConversion.hpp | 24 -------------------- aten/src/TH/generic/THStorage.cpp | 10 ++++----- 7 files changed, 15 insertions(+), 72 deletions(-) delete mode 100644 aten/src/TH/THTypeConversion.hpp diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index f484cadbdac973..68c5012777edd7 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -3,7 +3,6 @@ #include #include #include -#include #include @@ -44,7 +43,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { auto data_type_T = - at::scalarTypeToDataType(at::CTypeToScalarType>::to()); + at::scalarTypeToDataType(at::CTypeToScalarType::to()); if (dtype() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index bb6d71f54c2d1a..f7a4deb58dc941 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -283,19 +283,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['to_th_type'] = 'HalfFix<__half,Half>' - env['to_at_type'] = 'HalfFix' env['AS_REAL'] = 'convert' - env['THScalarType'] = 'half' - else: - env['to_th_type'] = 'HalfFix' - env['to_at_type'] = 'HalfFix' - elif scalar_name == 'Long': - env['to_th_type'] = 'long' - env['to_at_type'] = 'int64_t' - else: - env['to_th_type'] = '' - env['to_at_type'] = '' declarations, definitions = function_wrapper.create_derived( env, declarations) diff --git a/aten/src/TH/CMakeLists.txt 
b/aten/src/TH/CMakeLists.txt index ab9f5343eddad9..9fe22beb0dc54e 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -102,7 +102,6 @@ INSTALL(FILES THTensor.hpp THStorageFunctions.hpp THGenerator.hpp - THTypeConversion.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") INSTALL(FILES diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 5ff85eb2c8f40b..fb68639ec44752 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -2,40 +2,22 @@ #define TH_HALF_H #include -#include -/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ -#if defined(__GNUC__) -#define __thalign__(n) __attribute__((aligned(n))) -#elif defined(_WIN32) -#define __thalign__(n) __declspec(align(n)) -#else -#define __thalign__(n) +#ifdef __cplusplus +#include #endif -typedef struct __thalign__(2){ - unsigned short x; -} __THHalf; - -typedef struct __thalign__(4) { - unsigned int x; -} __THHalf2; - -typedef __THHalf THHalf; -typedef __THHalf2 THHalf2; +#ifdef __cplusplus +#define THHalf at::Half +#else +typedef struct at_Half at_Half; +#define THHalf at_Half +#endif TH_API void TH_float2halfbits(float*, unsigned short*); TH_API void TH_halfbits2float(unsigned short*, float*); TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - -#ifndef TH_HALF_BITS_TO_LITERAL -# define TH_HALF_BITS_TO_LITERAL(n) { n } -#endif - -#define TH_HALF_ZERO 0x0U -#define TH_HALF_INF 0x7C00U +TH_API float TH_half2float(THHalf); -#undef __thalign__ #endif diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 9fe0db5e5497f9..362fa6e2c83de5 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -8,7 +8,6 @@ #include #include -#include "THTypeConversion.hpp" #include // Note [Weak references for intrusive refcounting] diff --git a/aten/src/TH/THTypeConversion.hpp b/aten/src/TH/THTypeConversion.hpp deleted file mode 100644 index d40169e7180e58..00000000000000 --- a/aten/src/TH/THTypeConversion.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include -#include "THHalf.h" - -// Type traits to convert types to TH-specific types. Used primarily to -// convert at::Half to TH's half type. This makes the conversion explicit. 
-// FIXME: we should just use the same type - -namespace th { - -template -struct FromTypeConversion { - using type = T; -}; - -template <> -struct FromTypeConversion { - using type = at::Half; -}; - -template -using from_type = typename FromTypeConversion::type; -} diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 21431ef778d5a0..384ce9c632e22b 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType>::to()); + return THStorage_new(at::CTypeToScalarType::to()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, allocator, true).release(); @@ -48,7 +48,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType>::to(); + auto scalar_type = at::CTypeToScalarType::to(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( at::scalarTypeToDataType(scalar_type), @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, std::move(data), allocator, From ae635b16f76ee4e0abf5f4848cca810bfaf17ba9 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Wed, 29 Aug 2018 17:08:31 -0700 Subject: [PATCH 32/42] Record tensor factory functions in trace (#10935) Summary: Things like torch.zeros now appear in traces rather than constants. To continue to support our current level of ONNX export, we run constant prop to turn these back into constants where possible before export. 
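
A small sketch of the observable change, using the decorator form of `torch.jit.trace` (the graph comment is paraphrased; where shapes allow it, constant prop still folds the factory call back into a constant before ONNX export):

```python
import torch

@torch.jit.trace(torch.rand(3, 4))
def traced_fn(x):
    # torch.zeros is now recorded as an aten::zeros node in the trace
    # instead of being baked in as a tensor constant.
    return x.mm(torch.zeros(4, 3)) + 1

print(traced_fn.graph)  # shows aten::zeros rather than a prim::Constant tensor
```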
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10935 Differential Revision: D9527427 Pulled By: zdevito fbshipit-source-id: 552a8bcc01b911251dab7d7026faafdd7a3c758a --- ...it.test_constant_prop_loop_constant.expect | 20 ++++++ ...test_call_traced_mod_from_script_fn.expect | 18 +++-- ...cript.test_onnx_export_speculate-f1.expect | 23 +++--- ...cript.test_onnx_export_speculate-f2.expect | 25 +++---- test/test_jit.py | 28 ++++++-- tools/autograd/gen_variable_factories.py | 13 +++- tools/autograd/gen_variable_type.py | 71 ++++++++++--------- tools/autograd/templates/variable_factories.h | 2 +- tools/jit/gen_jit_dispatch.py | 3 +- torch/csrc/jit/constants.cpp | 4 +- .../csrc/jit/passes/constant_propagation.cpp | 24 ++++--- torch/csrc/jit/python_ir.cpp | 21 +++++- torch/csrc/jit/tracer.cpp | 10 +++ torch/csrc/jit/tracer.h | 21 +++--- torch/csrc/jit/type.cpp | 2 + torch/jit/__init__.py | 8 +-- torch/onnx/utils.py | 35 +++++---- 17 files changed, 218 insertions(+), 110 deletions(-) create mode 100644 test/expect/TestJit.test_constant_prop_loop_constant.expect diff --git a/test/expect/TestJit.test_constant_prop_loop_constant.expect b/test/expect/TestJit.test_constant_prop_loop_constant.expect new file mode 100644 index 00000000000000..5bdca2f2c47890 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_loop_constant.expect @@ -0,0 +1,20 @@ +graph() { + %b.1 : int = prim::Constant[value=0]() + %1 : int = prim::Constant[value=2147483647]() + %2 : int = prim::Constant[value=1]() + %b.3 : int = prim::Loop(%1, %2, %b.1) + block0(%4 : int, %5 : int) { + %b.2 : int = prim::Constant[value=1]() + %7 : int = prim::Constant[value=1]() + -> (%7, %b.2) + } + %8 : int = prim::Constant[value=2147483647]() + %9 : int = prim::Constant[value=0]() + %b : int = prim::Loop(%8, %9, %b.3) + block0(%11 : int, %12 : int) { + %b.4 : int = prim::Constant[value=2]() + %14 : int = prim::Constant[value=0]() + -> (%14, %b.4) + } + return (%b); +} diff --git a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect index 6a9a3a571967a2..078091d52268e2 100644 --- a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect @@ -1,8 +1,14 @@ graph(%x : Dynamic) { - %1 : Double(4, 3) = prim::Constant[value=]() - %2 : Double(3, 3) = aten::mm(%x, %1) - %3 : int = prim::Constant[value=1]() - %4 : int = prim::Constant[value=1]() - %5 : Dynamic = aten::add(%2, %3, %4) - return (%5); + %1 : int = prim::Constant[value=4]() + %2 : int = prim::Constant[value=3]() + %3 : int[] = prim::ListConstruct(%1, %2) + %4 : int = prim::Constant[value=7]() + %5 : int = prim::Constant[value=0]() + %6 : int[] = prim::Constant[value=[0, -1]]() + %7 : Double(4, 3) = aten::zeros(%3, %4, %5, %6) + %8 : Double(3, 3) = aten::mm(%x, %7) + %9 : int = prim::Constant[value=1]() + %10 : int = prim::Constant[value=1]() + %11 : Dynamic = aten::add(%8, %9, %10) + return (%11); } diff --git a/test/expect/TestScript.test_onnx_export_speculate-f1.expect b/test/expect/TestScript.test_onnx_export_speculate-f1.expect index 47f55eb41ccdaa..4e8e51552ea4ac 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f1.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f1.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10}] - outputs: [{name: "6", type:Tensor dims: 10 1}] + outputs: [{name: "8", type:Tensor dims: 10 1}] 
initializers: [] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [1], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Transpose", inputs: [1], outputs: [3], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "Transpose", inputs: [1], outputs: [4], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "ReduceSum", inputs: [1], outputs: [2], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [2,3], outputs: [4], attributes: []}, Node {type: "Transpose", inputs: [1], outputs: [5], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "If", inputs: [2], outputs: [6], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "Transpose", inputs: [1], outputs: [6], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "Transpose", inputs: [1], outputs: [7], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "If", inputs: [4], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "8", type:Tensor dims: }] + outputs: [{name: "9", type:Tensor dims: }] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [7], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [7], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [4], outputs: [9], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "3", type:Tensor dims: }] + outputs: [{name: "5", type:Tensor dims: }] initializers: [] nodes: [ @@ -37,7 +38,7 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "4", type:Tensor dims: }] + outputs: [{name: "6", type:Tensor dims: }] initializers: [] nodes: [ @@ -52,7 +53,7 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "5", type:Tensor dims: }] + outputs: [{name: "7", type:Tensor dims: }] initializers: [] nodes: [ diff --git a/test/expect/TestScript.test_onnx_export_speculate-f2.expect b/test/expect/TestScript.test_onnx_export_speculate-f2.expect index e7d04f54309b05..2820ce5f639ecb 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f2.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f2.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10},{name: "1", type:Tensor dims: 20 10},{name: "2", type:Tensor dims: 20}] - outputs: [{name: "5", type:Tensor dims: 1 20}] + outputs: [{name: "7", type:Tensor dims: 1 20}] initializers: [TensorProto shape: [20 10],TensorProto shape: [20]] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [3], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [4], outputs: [5], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "ReduceSum", inputs: [3], outputs: [4], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [5], attributes: [{ name: 'value', type: tensor, 
value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [4,5], outputs: [6], attributes: []}, + Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "7", type:Tensor dims: 1 20}] + outputs: [{name: "8", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [6], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "8", type:Tensor dims: 1 20}] + outputs: [{name: "9", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [8], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -34,10 +35,10 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "9", type:Tensor dims: 1 20}] + outputs: [{name: "10", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -49,10 +50,10 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "10", type:Tensor dims: 1 20}] + outputs: [{name: "11", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [11], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } diff --git a/test/test_jit.py b/test/test_jit.py index e99203333dc386..a3e3ebed0e5af5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -975,6 +975,24 @@ def fn(x, y): self.assertExpectedGraph(traced_fn.graph) self.assertExportImport(traced_fn.graph, (x, y)) + def test_trace_tensor_factory(self): + def run(**kwargs): + inputs_require_grads = kwargs.pop('inputs_require_grads', True) + + def fn(x): + return x + torch.ones(2, 3, **kwargs) + input = torch.ones(2, 3, **kwargs) + self.checkTrace(fn, (input,), inputs_require_grads=inputs_require_grads) + # check we recorded 'ones' and did not just record a constant + tfn = torch.jit.trace(input)(fn) + self.assertTrue("ones" in str(tfn.graph)) + run() + run(dtype=torch.int, inputs_require_grads=False) + if RUN_CUDA: + run(device="cuda:0") + if RUN_CUDA_MULTI_GPU: + run(device="cuda:1") + # TODO: implement @unittest.expectedFailure def test_output_unflatten(self): @@ -1403,8 +1421,6 @@ def constant_prop(a, b): self.run_pass('constant_propagation', constant_prop.graph) self.assertExpected(canonical(constant_prop.graph)) - # TODO: implement - 
@unittest.expectedFailure def test_constant_prop_loop_constant(self): @torch.jit.script def constant_prop(): @@ -4716,8 +4732,12 @@ def __init__(self, m): @torch.jit.script_method def forward(self, x): x += x - if True: - if True: + # because we are testing if we emit `if` statement correctly + # we cannot use `True` as the condition. Constant prop + # would remove the `if` statements. + c = sum(x) > 4 + if c: + if c: y = self.m(x) else: y = self.m(x) diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index c963650933cf25..ac3e8782eb355d 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -5,11 +5,16 @@ import re from .utils import CodeTemplate, write +from .gen_variable_type import format_trace + FUNCTION_TEMPLATE = CodeTemplate("""\ inline at::Tensor ${name}(${formals}) { + ${pre_record_trace} at::Tensor tensor = at::${name}(${actuals}); - return autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + auto result = autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + ${post_record_trace} + return result; } """) @@ -53,6 +58,10 @@ def process_function(decl, has_tensor_options): requires_grad = "options.requires_grad()" if has_tensor_options else "false" if decl['name'].endswith('_like') and not has_tensor_options: actuals.append('at::TensorOptions({}, /*discard_runtime_type=*/true)'.format(actuals[0])) + + pre_record_trace, post_record_trace = format_trace(decl) + return FUNCTION_TEMPLATE.substitute( - name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad + name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad, + pre_record_trace=pre_record_trace, post_record_trace=post_record_trace ) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 0fe32115da314e..caa6744bb38542 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -141,7 +141,7 @@ POST_RECORD_TRACE = CodeTemplate("""\ if (jit::tracer::isTracing()) { - jit::tracer::postRecordTrace(node, ArrayRef(${trace_outputs}) ); + jit::tracer::postRecordTrace(node, at::ArrayRef(${trace_outputs}) ); } """) @@ -183,6 +183,41 @@ def should_trace(declaration): return True +def get_trace_outputs(declaration): + if declaration['return_type'] == 'std::vector': + return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) + elif declaration['name'].endswith('_out'): + output_args = [arg['name'] for arg in declaration['arguments'] + if arg.get('output', False)] + return '{' + ', '.join(output_args) + '}' + trace_outs = [r['name'] for r in declaration['returns']] + if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): + return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) + else: + return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) + + +def format_trace(declaration): + local = {} + + add_trace_inputs = [] + for argument in declaration['arguments']: + add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) + local['add_trace_inputs'] = '\n'.join(add_trace_inputs) + + # Record inplace operations as out-of-place operations (e.g., + # not add_ but add) + # TODO: Add a proper concept of side effects to the IR, and + # properly record inplace operations. 
+ local['trace_name'] = uninplace_api_name(declaration['api_name']) + if local['trace_name'] in RENAME_TRACE: + local['trace_name'] = RENAME_TRACE[local['trace_name']] + + local['trace_outputs'] = get_trace_outputs(declaration) + + return (PRE_RECORD_TRACE.substitute(local), POST_RECORD_TRACE.substitute(local)) + + def gen_variable_type(out, aten_declarations, template_path): """VariableType.h and VariableType.cpp body @@ -361,42 +396,10 @@ def reference_args(args): res.append(arg['name']) return res - def get_trace_outputs(declaration): - if declaration['return_type'] == 'std::vector': - return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) - elif name.endswith('_out'): - output_args = [arg['name'] for arg in arguments - if arg.get('output', False)] - return '{' + ', '.join(output_args) + '}' - trace_outs = [r['name'] for r in declaration['returns']] - if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): - return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) - else: - return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) - def emit_record_trace(env): if not should_trace(declaration): return ('', '') - - local = {} - - add_trace_inputs = [] - for argument in declaration['arguments']: - add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) - local['add_trace_inputs'] = '\n'.join(add_trace_inputs) - - # Record inplace operations as out-of-place operations (e.g., - # not add_ but add) - # TODO: Add a proper concept of side effects to the IR, and - # properly record inplace operations. - local['trace_name'] = uninplace_api_name(declaration['api_name']) - if local['trace_name'] in RENAME_TRACE: - local['trace_name'] = RENAME_TRACE[local['trace_name']] - - local['trace_outputs'] = get_trace_outputs(declaration) - - combined = nested_dict(local, nested_dict(env, declaration)) - return (PRE_RECORD_TRACE.substitute(combined), POST_RECORD_TRACE.substitute(combined)) + return format_trace(declaration) def declare_returned_variables(): if modifies_arguments: diff --git a/tools/autograd/templates/variable_factories.h b/tools/autograd/templates/variable_factories.h index bc2fa21385777f..bf74abc9138c65 100644 --- a/tools/autograd/templates/variable_factories.h +++ b/tools/autograd/templates/variable_factories.h @@ -3,7 +3,7 @@ // ${generated_comment} #include - +#include #include #include diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index d337143dd8b09e..ff7fce56e91552 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -262,7 +262,8 @@ def declkey(decl): arguments.extend([ # XXX - until we actually have first-class interpreter types for these # concepts, the default values to be encoded in Tensors - + # If you change this, you also need to update [TensorOptions in script] + # in the tracer code. 
# dtype is specified as an int64_t of at::ScalarType {'name': 'dtype', 'simple_type': 'ScalarType', 'default': 'float', 'kwarg_only': True}, # layout is specified as an int64_t of at::Layout diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index f51a735acea1b5..d7876411c687a6 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -13,7 +13,9 @@ Value* insertConstant( Node * n = g.create(prim::Constant); if(val.isTensor()) { at::Tensor ref = std::move(val).toTensor(); - JIT_ASSERT(ref.defined()); + if(!ref.defined()) { + throw constant_not_supported_error("undefined tensors cannot become constants"); + } n->output()->inferTypeFrom(ref); // note: before t_ because of std::move(ref) n->t_(attr::value, std::move(ref)); } else if(val.isInt()) { diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index 6855002d4fd9cb..bfd8ec9b9f1764 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -31,6 +31,10 @@ std::unordered_set skip_list = { aten::randn_like, aten::randperm, aten::randperm_out, + prim::Constant, + prim::Undefined, + // TODO (zach): we should consider skipping tensor factories in the cases + // where the constant tensor would be large but cheap to create. }; std::vector runNode(Node* n) { @@ -40,9 +44,14 @@ std::vector runNode(Node* n) { stack.push_back(*(toIValue(input))); } op(stack); - auto var_outputs = fmap(stack, [&](IValue v) { + auto var_outputs = fmap(stack, [&](IValue v) -> IValue { if (v.isTensor()) { - return IValue(autograd::as_variable_ref(v.toTensor()).data()); + auto t = std::move(v).toTensor(); + if(t.defined()) { + return IValue(autograd::as_variable_ref(t).data()); + } else { + return t; + } } else { return v; } @@ -119,11 +128,11 @@ bool removeExtraNodeOutputs(Node *n) { } // anonymous namespace void ConstantPropagation(Node* n, bool recurse) { - bool constant_inputs = (n->inputs().size() > 0) && - std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - bool supported_node = skip_list.count(n->kind()) == 0; + bool constant_inputs = + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = !n->kind().is_onnx() && skip_list.count(n->kind()) == 0; auto run_blocks = [&]() { if (recurse) { for (Block * block : n->blocks()) { @@ -150,7 +159,6 @@ void ConstantPropagation(Node* n, bool recurse) { } void ConstantPropagation(Block* block, bool recurse) { - ConstantPropagation(block->param_node(), recurse); for(auto it = block->nodes().begin(); it != block->nodes().end();) { Node *n = *it; it++; //advance iterator bc the current node may be destroyed diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index d16d4b00f07e91..d685584a4045be 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -443,14 +443,29 @@ void initPythonIRBindings(PyObject * module_) { switch(t->kind()) { case TypeKind::DynamicType: return "DynamicType"; + case TypeKind::TensorType: + return "TensorType"; + case TypeKind::NumberType: + return "NumberType"; + case TypeKind::NoneType: + return "NoneType"; case TypeKind::CompleteTensorType: return "CompleteTensorType"; case TypeKind::TupleType: return "TupleType"; - default: - AT_ERROR("unknown type kind"); - return ""; + case TypeKind::ListType: + return "ListType"; + case TypeKind::IntType: + 
return "IntType"; + case TypeKind::FloatType: + return "FloatType"; + case TypeKind::StringType: + return "StringType"; + case TypeKind::GeneratorType: + return "GeneratorType"; } + // not reachable, but some compilers complain + AT_ERROR("Unknown Type Kind"); }) .def("sizes",[](Type& t) { return t.expect()->sizes(); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 5bc7bd574cf766..fee8924277d11e 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -48,6 +48,16 @@ void addInputs(Node *n, const char * name, at::TensorList value) { n->addInput(list_node->output()); } +void addInputs(Node* n, const char * name, const at::TensorOptions& options) { + // [TensorOptions in script] - update this when you change how we schematize TensorOptions + detail::genericAddInput(n, static_cast(options.dtype())); + detail::genericAddInput(n, static_cast(options.layout())); + std::vector device = { + static_cast(options.device().type()), + static_cast(options.device().index())}; + detail::genericAddInput(n, std::move(device)); +} + void addInputs(Node *n, const char * name, at::IntList value) { using ArgumentStash = jit::tracer::ArgumentStash; std::vector info = ArgumentStash::hasIntList(name) ? diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 789b3fd2d4591c..b811534ce27401 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -229,16 +229,17 @@ inline void abandon() { // NB: those serve both as an intermediate steps in addInputs below, // as well as the overloads that terminate template recursion -void addInputs(Node *n, const char * name, int64_t value); -void addInputs(Node *n, const char * name, bool value); -void addInputs(Node *n, const char * name, double value); -void addInputs(Node *n, const char * name, const at::Scalar& value); -void addInputs(Node *n, const char * name, const at::Tensor& value); -void addInputs(Node *n, const char * name, at::IntList value); -void addInputs(Node *n, const char * name, at::TensorList value); -void addInputs(Node *n, const char * name, const ArrayRef& value); -void addInputs(Node *n, const char * name, const std::string& value); -void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, int64_t value); +TORCH_API void addInputs(Node *n, const char * name, bool value); +TORCH_API void addInputs(Node *n, const char * name, double value); +TORCH_API void addInputs(Node *n, const char * name, const at::Scalar& value); +TORCH_API void addInputs(Node *n, const char * name, const at::Tensor& value); +TORCH_API void addInputs(Node *n, const char * name, at::IntList value); +TORCH_API void addInputs(Node *n, const char * name, at::TensorList value); +TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& value); +TORCH_API void addInputs(Node *n, const char * name, const std::string& value); +TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index c7e33fae7e20ac..e5a3e64ac067d8 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -51,6 +51,8 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "None"; } else if(t.kind() == TypeKind::StringType) { out << "string"; + } else if(t.kind() == TypeKind::GeneratorType) { + out 
<< "Generator"; } else { AT_ERROR("unknown type kind"); } diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index e0314acea4a173..551a17565e1763 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -996,12 +996,12 @@ def register_all(mod): return _builtin_table -def _register_builtin(callable, op): - _get_builtin_table()[id(callable)] = op +def _register_builtin(fn, op): + _get_builtin_table()[id(fn)] = op -def _find_builtin(callable): - return _get_builtin_table().get(id(callable)) +def _find_builtin(fn): + return _get_builtin_table().get(id(fn)) if not torch._C._jit_init(): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 34c30aea654ed7..b65ea160b5c213 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -19,6 +19,7 @@ from torch.autograd import Function, function from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes +from torch._C import ListType @contextlib.contextmanager @@ -103,24 +104,32 @@ def export(model, args, f, export_params=True, verbose=False, training=False, operator_export_type=operator_export_type) -def _list_constant_prop(g, block): +# ONNX can't handle constants that are lists of tensors, which can +# get generated in constant prop. So we split them back into prim::ListConstructs +def _split_tensor_list_constants(g, block): for node in block.nodes(): for subblock in node.blocks(): - _list_constant_prop(g, subblock) - if node.kind() == "prim::ListConstruct": - input_nodes = [i.node() for i in node.inputs()] - if all(inode.kind() == "prim::Constant" and inode.kindOf("value") == "i" for inode in input_nodes): - input_values = [inode['value'] for inode in input_nodes] - const_node = g.create("prim::Constant") - const_node.insertBefore(node) - const_node.is_("value", input_values) - const_node.output().setType(torch._C.ListType.ofInts()) - node.output().replaceAllUsesWith(const_node.output()) + _split_tensor_list_constants(g, subblock) + if node.kind() == "prim::Constant": + output_type = node.output().type() + if output_type.isSubtypeOf(ListType.ofTensors()): + inputs = [g.create("prim::Constant").t_('value', t) + .insertBefore(node).output() + for t in node['value']] + lc = (g.create("prim::ListConstruct", inputs) + .insertBefore(node) + .output() + .setType(ListType.ofTensors())) + node.output().replaceAllUsesWith(lc) def _optimize_graph(graph, operator_export_type): - _list_constant_prop(graph, graph) - + # we record now record some ops like ones/zeros + # into a trace where we previously recorded constants + # use constant prop to maintain our current level of onnx support + # without implementing symbolics for all of them + torch._C._jit_pass_constant_propagation(graph) + _split_tensor_list_constants(graph, graph) # run dce to eliminate dead parts of the graph that might have been # left behind by things like symbolic_override torch._C._jit_pass_dce(graph) From 91ecbf8b1d3e21feb03b2546cd12e9e456291a1f Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 17:22:55 -0700 Subject: [PATCH 33/42] Remove TensorBase (#11036) Summary: Not subclassed except by Tensor. Also requried to align further with caffe2. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11036 Reviewed By: ezyang Differential Revision: D9565640 Pulled By: cpuhrsch fbshipit-source-id: ff7203a2c95d3f3956282b4f2d8dda6c2b93f4a6 --- aten/src/ATen/TensorBase.h | 53 -------------------------------- aten/src/ATen/templates/Tensor.h | 50 +++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 60 deletions(-) delete mode 100644 aten/src/ATen/TensorBase.h diff --git a/aten/src/ATen/TensorBase.h b/aten/src/ATen/TensorBase.h deleted file mode 100644 index 1bda3ddfa14915..00000000000000 --- a/aten/src/ATen/TensorBase.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include "ATen/TensorImpl.h" -#include "ATen/UndefinedTensor.h" -#include "ATen/core/Error.h" - -namespace at { namespace detail { - -// TensorBase is the base class for Tensor. -// TODO: Eliminate this, once we remove TensorBase from Scalar. At -// the moment it's only used to break an include cycle for Scalar -struct TensorBase { - TensorBase() {} - TensorBase(TensorImpl * tensor_impl, bool retain) : tensor_impl_(c10::intrusive_ptr::reclaim(tensor_impl)) { - if (tensor_impl == nullptr) { - throw std::runtime_error("TensorBaseImpl with nullptr not supported"); - } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } - } - TensorBase(c10::intrusive_ptr&& ptr) : tensor_impl_(std::move(ptr)) {} - TensorBase(const c10::intrusive_ptr& ptr) : tensor_impl_(ptr) {} - - int64_t dim() const { - return tensor_impl_->dim(); - } - - TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); - } - TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); - } - const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; - } - - bool defined() const { - return tensor_impl_; - } - - void reset() { - tensor_impl_.reset(); - } - - friend struct WeakTensor; - -protected: - c10::intrusive_ptr tensor_impl_; -}; - -}} // namespace at::detail diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index f426c6753adc36..4d8bf60522f7db 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -9,9 +9,10 @@ #include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/TensorAccessor.h" -#include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" #include "ATen/core/optional.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/core/Error.h" namespace at { struct Generator; @@ -38,16 +39,48 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. 
-struct AT_API Tensor : public detail::TensorBase { - using TensorBase = detail::TensorBase; - Tensor() : TensorBase() {} - Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {} - Tensor(const c10::intrusive_ptr& ptr) : TensorBase(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) : TensorBase(std::move(ptr)) {} +struct AT_API Tensor { + Tensor(){}; + Tensor(TensorImpl* tensor_impl, bool retain) + : tensor_impl_(c10::intrusive_ptr::reclaim( + tensor_impl)) { + if (tensor_impl == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + if (retain && tensor_impl != UndefinedTensor::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl); + } + } + Tensor(const c10::intrusive_ptr& ptr) + : tensor_impl_(std::move(ptr)) {} + Tensor(c10::intrusive_ptr&& ptr) + : tensor_impl_(ptr) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + // The following overloads are very intruiging. Consider the following // program: // @@ -242,6 +275,9 @@ struct AT_API Tensor : public detail::TensorBase { } friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { From e550eab3e20d58a68e24aaab1902c410f253914e Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Wed, 29 Aug 2018 17:53:35 -0700 Subject: [PATCH 34/42] Remove MetaNetDef test case in Predictor (#11052) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11052 Delete the test case for Predictor with constructing by MetaNetDef since the constructor actually has been deprecated. The broken PR is for construcing predictor from DB instance. 
Reviewed By: highker Differential Revision: D9566935 fbshipit-source-id: 5511883953a2d3f6eb0a4f1c5518a1bc4b3ffbdc --- caffe2/predictor/predictor_test.cc | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc index 40e4f720c61900..326265fc66d039 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -209,33 +209,4 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); } -class PredictorMetaNetDefTest : public testing::Test { - public: - void SetUp() override { - DeviceOption op; - op.set_random_seed(1701); - ctx_ = caffe2::make_unique(op); - p_ = caffe2::make_unique( - makePredictorConfig(parseMetaNetDef(metaSpec))); - } - - std::unique_ptr ctx_; - std::unique_ptr p_; -}; - -TEST_F(PredictorMetaNetDefTest, SimpleMetaNetDefInitializer) { - auto inputData = randomTensor({1, 4}, ctx_.get()); - Predictor::TensorMap input; - auto iter = input.emplace("data", Tensor(CPU)); - auto tensor = inputData->GetMutableTensor(CPU); - iter.first->second.ResizeLike(*tensor); - iter.first->second.ShareData(*tensor); - Predictor::TensorList output; - (*p_)(input, &output); - EXPECT_EQ(output.size(), 1); - EXPECT_EQ(output.front().dims().size(), 2); - EXPECT_EQ(output.front().dim(0), 1); - EXPECT_EQ(output.front().dim(1), 10); - EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); -} } // namespace caffe2 From 394bdcd49a603f4e391abcfcf11b5b34e2868922 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Wed, 29 Aug 2018 17:56:20 -0700 Subject: [PATCH 35/42] Fix the build of aten tests when FULL_CAFFE2=1 Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11019 Reviewed By: orionr Differential Revision: D9562691 Pulled By: houseroad fbshipit-source-id: 95a8dee580e5f4dc9af3a2e1f68ec6c62a0e4e04 --- tools/build_pytorch_libs.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 994a96ad822b41..d1cdb855c9099f 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -281,6 +281,12 @@ function build_caffe2() { # STOP!!! Are you trying to add a C or CXX flag? Add it # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. + + # This is needed by the aten tests built with caffe2 + if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! 
-f "lib/libnccl.so.1" ]; then + cp "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + fi + ${CMAKE_INSTALL} -j"$MAX_JOBS" # Install Python proto files From 16b8e0a787fac2988e8adeba452ab6d02e6dde79 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 20:01:38 -0700 Subject: [PATCH 36/42] at::StorageImpl: Rename size_ to numel_ and elementSize() to itemsize() Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11011 Reviewed By: ezyang Differential Revision: D9561898 Pulled By: cpuhrsch fbshipit-source-id: 0cf5cdc3e7acd397f7e2d66097856aaad0581147 --- aten/src/ATen/Storage.h | 4 +-- aten/src/ATen/StorageImpl.cpp | 10 +++---- aten/src/ATen/StorageImpl.h | 17 +++++------ aten/src/TH/THFile.cpp | 4 +-- aten/src/TH/THMemoryFile.cpp | 40 ++++++++++++------------- aten/src/TH/THStorageFunctions.cpp | 14 ++++----- aten/src/TH/THTensor.cpp | 2 +- aten/src/TH/generic/THStorage.cpp | 8 ++--- aten/src/TH/generic/THStorageCopy.cpp | 18 +++++------ aten/src/THC/THCStorage.cpp | 10 +++---- aten/src/THC/THCTensor.cpp | 2 +- aten/src/THC/generic/THCStorage.cpp | 4 +-- aten/src/THC/generic/THCStorage.cu | 2 +- aten/src/THC/generic/THCStorageCopy.cpp | 16 +++++----- aten/src/THC/generic/THCStorageCopy.cu | 8 ++--- torch/csrc/generic/Storage.cpp | 4 +-- torch/csrc/generic/StorageSharing.cpp | 10 +++---- 17 files changed, 86 insertions(+), 87 deletions(-) diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index 8db0b231bf53f9..d797618b285e6a 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -26,8 +26,8 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->elementSize(); } - ptrdiff_t size() const { return storage_impl_->size(); } + size_t elementSize() const { return storage_impl_->itemsize(); } + ptrdiff_t size() const { return storage_impl_->numel(); } bool resizable() const { return storage_impl_->resizable(); } // get() use here is to get const-correctness void* data() const { return storage_impl_.get()->data(); } diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index bc2d69a7aa8f5d..0ed836b9b3010a 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -4,26 +4,26 @@ namespace at { StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable) : data_type_(data_type), data_ptr_(std::move(data_ptr)), - size_(size), + numel_(numel), resizable_(resizable), allocator_(allocator) {} StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable) : StorageImpl( data_type, - size, + numel, allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * size), + at::elementSize(dataTypeToScalarType(data_type)) * numel), allocator, resizable) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index 68c5012777edd7..35639478df664e 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -23,13 +23,13 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { ~StorageImpl() {}; StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable); StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable); StorageImpl(StorageImpl&) = delete; @@ -65,18 +65,17 @@ struct 
AT_API StorageImpl : public c10::intrusive_ptr_target { void operator=(const StorageImpl&) = delete; - size_t elementSize() const { + size_t itemsize() const { return at::elementSize(dataTypeToScalarType(data_type_)); } Type& type(); - // TODO: Rename to size() and size to size_ - ptrdiff_t size() const { - return size_; + int64_t numel() const { + return numel_; }; - void set_size(ptrdiff_t size) { - size_ = size; + void set_numel(int64_t numel) { + numel_ = numel; }; bool resizable() const { return resizable_; @@ -127,7 +126,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { private: at::DataType data_type_; at::DataPtr data_ptr_; - ptrdiff_t size_; + int64_t numel_; bool resizable_; at::Allocator* allocator_; }; diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp index c8924b54f4bf70..4a2cb18b92e07e 100644 --- a/aten/src/TH/THFile.cpp +++ b/aten/src/TH/THFile.cpp @@ -140,12 +140,12 @@ IMPLEMENT_THFILE_SCALAR(Half, THHalf) #define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } \ \ size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } IMPLEMENT_THFILE_STORAGE(Byte, uint8_t) diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 011c1d1f54aaee..3f2187b68f74ea 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -56,7 +56,7 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) return; else { - if(size < self->storage->size()) /* note the "<" and not "<=" */ + if(size < self->storage->numel()) /* note the "<" and not "<=" */ { self->size = size; THCharStorage_data(self->storage)[self->size] = '\0'; @@ -64,10 +64,10 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) } } - missingSpace = size-self->storage->size()+1; /* +1 for the '\0' */ - THCharStorage_resize(self->storage, (self->storage->size()/2 > missingSpace ? - self->storage->size() + (self->storage->size()/2) - : self->storage->size() + missingSpace)); + missingSpace = size-self->storage->numel()+1; /* +1 for the '\0' */ + THCharStorage_resize(self->storage, (self->storage->numel()/2 > missingSpace ? 
+ self->storage->numel() + (self->storage->numel()/2) + : self->storage->numel() + missingSpace)); } static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) @@ -188,12 +188,12 @@ static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) while (1) \ { \ ASCII_WRITE_ELEM; \ - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) \ + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) \ { \ mfself->position += nByteWritten; \ break; \ } \ - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); \ + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); \ } \ if(mfself->file.isAutoSpacing) \ { \ @@ -297,7 +297,7 @@ static void THMemoryFile_free(THFile *self) /* READ_WRITE_METHODS(bool, Bool, */ /* int value = 0; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */ -/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", value), */ +/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", value), */ /* 1) */ READ_WRITE_METHODS(uint8_t, Byte, @@ -307,7 +307,7 @@ READ_WRITE_METHODS(uint8_t, Byte, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -322,7 +322,7 @@ READ_WRITE_METHODS(int8_t, Char, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? 
n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -330,29 +330,29 @@ READ_WRITE_METHODS(int8_t, Char, READ_WRITE_METHODS(int16_t, Short, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%hd", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%hd", data[i]), 1) READ_WRITE_METHODS(int32_t, Int, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", data[i]), 1) READ_WRITE_METHODS(float, Float, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) READ_WRITE_METHODS(THHalf, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", TH_half2float(data[i])), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), 1) READ_WRITE_METHODS(double, Double, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.17g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.17g", data[i]), 1) static ssize_t THMemoryFile_readLong(THFile *self, int64_t *data, ssize_t n) @@ -491,13 +491,13 @@ static ssize_t THMemoryFile_writeLong(THFile *self, int64_t *data, ssize_t n) ssize_t nByteWritten; while (1) { - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%" PRId64, data[i]); - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%" PRId64, data[i]); + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) { 
mfself->position += nByteWritten; break; } - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); } if(mfself->file.isAutoSpacing) { @@ -654,7 +654,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) if(storage) { - THArgCheck(THCharStorage_data(storage)[storage->size()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); + THArgCheck(THCharStorage_data(storage)[storage->numel()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); THCharStorage_retain(storage); } @@ -668,7 +668,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) mfself = static_cast(THAlloc(sizeof(THMemoryFile))); mfself->storage = storage; - mfself->size = (storage ? storage->size()-1 : 0); + mfself->size = (storage ? storage->numel()-1 : 0); mfself->position = 0; mfself->longSize = 0; diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index b0e4abe9329db7..a5319e67dabe61 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -34,7 +34,7 @@ void THStorage_free(THStorage* storage) { ptrdiff_t THStorage_size(const THStorage *self) { - return self->size(); + return self->numel(); } void THStorage_retain(THStorage *storage) @@ -49,21 +49,21 @@ void THStorage_resize(THStorage* storage, ptrdiff_t size) { /* case when the allocator does not have a realloc defined */ at::DataPtr new_data; if (size != 0) { - new_data = storage->allocator()->allocate(storage->elementSize() * size); + new_data = storage->allocator()->allocate(storage->itemsize() * size); } at::DataPtr old_data = storage->set_data_ptr(std::move(new_data)); - ptrdiff_t old_size = storage->size(); - storage->set_size(size); + ptrdiff_t old_size = storage->numel(); + storage->set_numel(size); if (old_data != nullptr) { ptrdiff_t copy_size = old_size; - if (storage->size() < copy_size) { - copy_size = storage->size(); + if (storage->numel() < copy_size) { + copy_size = storage->numel(); } if (copy_size > 0) { memcpy( storage->data(), old_data.get(), - storage->elementSize() * copy_size); + storage->itemsize() * copy_size); } } } else { diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 1b1f493ac4e289..0c731779b95685 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -125,7 +125,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(!THTensor_getStoragePtr(self)) { THTensor_stealAndSetStoragePtr(self, THStorage_new(self->scalar_type())); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 384ce9c632e22b..992cbd5bb7509f 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -59,7 +59,7 @@ THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int false).release(); if (size <= 0) { - storage->set_size(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / at::elementSize(scalar_type)); } return storage; @@ -132,19 +132,19 @@ void THStorage_(resize)(THStorage 
*storage, ptrdiff_t size) void THStorage_(fill)(THStorage *storage, real value) { ptrdiff_t i; - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) THStorage_(data)(storage)[i] = value; } void THStorage_(set)(THStorage *self, ptrdiff_t idx, real value) { - THArgCheck((idx >= 0) && (idx < self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); THStorage_(data)(self)[idx] = value; } real THStorage_(get)(const THStorage *self, ptrdiff_t idx) { - THArgCheck((idx >= 0) && (idx < self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); return THStorage_(data)(self)[idx]; } diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 0cde162d4c2843..442f7dbde2925d 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -6,13 +6,13 @@ void THStorage_(rawCopy)(THStorage *storage, real *src) { ptrdiff_t i; real *data = THStorage_(data)(storage); - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) data[i] = src[i]; } void THStorage_(copy)(THStorage *storage, THStorage *src) { - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); THStorage_(rawCopy)(storage, THStorage_(data)(src)); } @@ -25,40 +25,40 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = (real)TH_half2float(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = TH_float2half((float)(src_data[i])); \ } #define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index a9a1790c58c830..96e3938e20b0f9 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -20,17 +20,17 @@ void THCStorage_resize(THCState 
*state, THCStorage *self, ptrdiff_t size) if (!self->resizable()) THError("Trying to resize storage that is not resizable"); - size_t elementSize = self->elementSize(); + size_t itemsize = self->itemsize(); if(size == 0) { self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::DeviceType::CUDA, device))); - self->set_size(0); + self->set_numel(0); } else { at::DataPtr data = - self->allocator()->allocate(size * elementSize); + self->allocator()->allocate(size * itemsize); if (self->data_ptr()) { // Enable p2p access when the memcpy is across devices @@ -38,14 +38,14 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) THCudaCheck(cudaMemcpyAsync(data.get(), self->data(), - THMin(self->size(), size) * elementSize, + THMin(self->numel(), size) * itemsize, cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // Destructively overwrite data_ptr self->set_data_ptr(std::move(data)); - self->set_size(size); + self->set_numel(size); } } diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 3826ea57fc5da3..de787bd380b6e6 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -148,7 +148,7 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, const if(!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THCStorage_resize(state, THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index aef30d62517061..feb2e94959abf2 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -21,7 +21,7 @@ int THCStorage_(elementSize)(THCState *state) void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(real), cudaMemcpyHostToDevice, @@ -31,7 +31,7 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real v real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); real value; cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(real), diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index a6b3bf557e2f63..95f2bc7163d46f 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -10,7 +10,7 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) #if CUDA_VERSION >= 7000 thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), #endif - self_data, self_data+self->size(), value); + self_data, self_data+self->numel(), value); } void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) diff --git a/aten/src/THC/generic/THCStorageCopy.cpp b/aten/src/THC/generic/THCStorageCopy.cpp index 9194ab7d3c80d4..546777baaf98c7 100644 --- a/aten/src/THC/generic/THCStorageCopy.cpp +++ 
b/aten/src/THC/generic/THCStorageCopy.cpp @@ -4,11 +4,11 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), THStorage_(data)(src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyHostToDevice, stream)); THCudaCheck(cudaStreamSynchronize(stream)); @@ -18,9 +18,9 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *s void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ { \ THCTensor* selfTensor = \ - THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct TH##TYPEC##Tensor* srcTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->numel(), 1); \ THCTensor_(copy##TYPEC)(state, selfTensor, srcTensor); \ TH##TYPEC##Tensor_free(srcTensor); \ THCTensor_(free)(state, selfTensor); \ @@ -36,11 +36,11 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self), THCStorage_(data)(state, src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyDeviceToHost, stream)); THCudaCheck(cudaStreamSynchronize(stream)); @@ -50,9 +50,9 @@ void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *s void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ { \ TH##TYPEC##Tensor* selfTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->numel(), 1); \ struct THCTensor* srcTensor = \ - THCTensor_(newWithStorage1d)(state, src, 0, src->size(), 1); \ + THCTensor_(newWithStorage1d)(state, src, 0, src->numel(), 1); \ TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(state, selfTensor, srcTensor); \ THCTensor_(free)(state, srcTensor); \ TH##TYPEC##Tensor_free(selfTensor); \ diff --git a/aten/src/THC/generic/THCStorageCopy.cu b/aten/src/THC/generic/THCStorageCopy.cu index bea4fe699623fb..962167c73b82c8 100644 --- a/aten/src/THC/generic/THCStorageCopy.cu +++ b/aten/src/THC/generic/THCStorageCopy.cu @@ -4,17 +4,17 @@ void THCStorage_(rawCopy)(THCState *state, THCStorage *self, real *src) { - THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->size() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->numel() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // conversions are delegated to THCTensor implementation #define THC_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ { \ - THArgCheck(self->size() == src->size(), 2, "size does not match"); \ - THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THArgCheck(self->numel() 
== src->numel(), 2, "size does not match"); \ + THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct THCuda##TYPECUDA##Tensor* srcTensor = \ - THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->size(), 1); \ + THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->numel(), 1); \ THCTensor_(copyCuda##TYPEC)(state, selfTensor, srcTensor); \ THCuda##TYPECUDA##Tensor_free(state, srcTensor); \ THCTensor_(free)(state, selfTensor); \ diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 42f3f583b848e9..d8f33c533b2039 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -151,9 +151,9 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) nindex += THWStorage_(size)(LIBRARY_STATE self->cdata); - if (nindex < 0 || nindex >= self->cdata->size()) { + if (nindex < 0 || nindex >= self->cdata->numel()) { PyErr_Format(PyExc_IndexError, "index %" PRId64 " out of range for storage of " - "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->size()); + "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->numel()); return NULL; } real value = THWStorage_(get)(LIBRARY_STATE self->cdata, nindex); diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 4a7c01b2ca2e82..6b462160c6d0b2 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -79,7 +79,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) } else { // TODO: retry on collision // TODO: free GIL - but remember to reacquire it when an exception is thrown - THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THManagedMapAllocator::fromDataPtr(storage->data_ptr()); @@ -90,7 +90,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) if (!manager_handle) return NULL; THPObjectPtr storage_handle(PyBytes_FromString(ctx->filename())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(3)); @@ -158,7 +158,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) if ((ctx = THMapAllocator::fromDataPtr(storage->data_ptr()))) { // done } else { - THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THMapAllocator::fromDataPtr(storage->data_ptr()); @@ -167,7 +167,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) THPObjectPtr storage_handle(PyLong_FromLong(ctx->fd())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(2)); @@ -220,7 +220,7 @@ static PyObject * THPStorage_(shareCuda)(THPStorage *self) THPObjectPtr device(PyLong_FromLong(storage->device().index())); THPObjectPtr _handle(Py_None); Py_INCREF(Py_None); - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); THPObjectPtr _offset(PyLong_FromLong(0)); if 
(THWStorage_(data)(LIBRARY_STATE storage)) {
    size_t base_size;

From ad1670cf547940ebbaa63818585e61c19f795ce6 Mon Sep 17 00:00:00 2001
From: Shihao Xu
Date: Wed, 29 Aug 2018 20:09:02 -0700
Subject: [PATCH 37/42] Kill the dummy TaskOutput when task.get_step() (#11048)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11048
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10739
I wanted to assert that the blobs in the workspace of the new session after loading a checkpoint are exactly the same as the blobs in the workspace of the old session before saving to a checkpoint. But I found that when calling `task.get_step()`, a dummy task output blob, `task:output/ConstIntFill:0`, is added. A dummy net `task:output` is also added along with it. See https://fburl.com/937lf2yk
This makes it hard to assert "Equal", forcing me to assert "LessThan" or "GreaterThan".

Adding a dummy TaskOutput when the user specifies no TaskOutput is a hack. The reason for this is that a ZMQ socket can't send an empty blob list; as a result, if the Task on the Worker had no output, the master would never stop waiting and would hang forever. See https://fburl.com/rd7fhy6p and imagine `socket.recv(net, 0)`.

TaskOutput is at the user layer. The hack shouldn't be exposed to the user layer, polluting user workspaces. Instead, we should move the creation of the dummy blob to some deeper layer and remove the dummy blob from the workspace afterwards to avoid polluting user workspaces. After this change, the workaround becomes totally transparent and has no side effects for users.

Reviewed By: mraway

Differential Revision: D9566744

fbshipit-source-id: 18292dd64a6d48192c34034200a7c9811d2172af
---
 caffe2/python/checkpoint_test.py |  4 +--
 caffe2/python/core_test.py       |  4 +--
 caffe2/python/task.py            | 46 +++++++++++++------------------
 3 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py
index a91bbf9910e29a..afba3dddcd5aae 100644
--- a/caffe2/python/checkpoint_test.py
+++ b/caffe2/python/checkpoint_test.py
@@ -161,9 +161,9 @@ def test_ckpt_name_and_load_model_from_ckpts(self):
         num_epochs = job_runner.train(session)
         self.assertEquals(num_epochs, len(EXPECTED_TOTALS))
 
-        # There are 17 global blobs after finishing up the job runner.
+        # There are 15 global blobs after finishing up the job runner.
# (only blobs on init_group are checkpointed) - self.assertEquals(len(ws.blobs), 17) + self.assertEquals(len(ws.blobs), 15) ws = workspace.C.Workspace() session = LocalSession(ws) diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 7120843f33152d..d989471a16bab2 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -533,8 +533,8 @@ def test_create_plan_from_proto_correctly(self): self.assertEqual(len(plan.Steps()), 1) self.assertEqual(len(test_plan.Steps()), 1) - self.assertEqual(len(plan.Proto().network), 9) - self.assertEqual(len(test_plan.Proto().network), 9) + self.assertEqual(len(plan.Proto().network), 8) + self.assertEqual(len(test_plan.Proto().network), 8) self.assertEqual(len(plan.Proto().execution_step), 1) self.assertEqual(len(test_plan.Proto().execution_step), 1) self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name()) diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 311211dfdff3ee..9cfe7089332a18 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -150,7 +150,7 @@ def add_setup_steps(step, init_nets, exit_nets, name): if init_nets: steps.append(core.execution_step('%s:init' % name, init_nets)) steps.append(step) - if len(exit_nets) > 0: + if exit_nets: steps.append(core.execution_step('%s:exit' % name, exit_nets)) return core.execution_step(name, steps) @@ -215,10 +215,11 @@ def add(self, task): self._tasks.append(task) def tasks(self): - for task in self._tasks_to_add: - self.add(task) - self._tasks_to_add = [] - self._already_used = True + if not self._already_used: + for task in self._tasks_to_add: + self.add(task) + self._tasks_to_add = [] + self._already_used = True return self._tasks def num_registered_tasks(self): @@ -259,9 +260,8 @@ def tasks_by_node(self, node_remap=None): # tasks_by_node can't be called twice because the setup won't # work properly a second time. node_map = {} - for task in self.tasks(): - node_map[task.node] =\ - node_remap(task.node) if node_remap else task.node + for node in self.used_nodes(): + node_map[node] = node_remap(node) if node_remap else node if self._tasks_by_node is not None: tasks_by_node, prev_node_map = self._tasks_by_node assert prev_node_map == node_map, ( @@ -285,11 +285,7 @@ def tasks_by_node(self, node_remap=None): grouped_by_node = TaskGroup() for node, tasks in viewitems(tasks_by_node): report_steps = report_steps_by_node[node] - node_inits, node_exits = get_setup_nets( - TaskGroup.LOCAL_SETUP, - [t.get_step() for t in tasks] + report_steps, - self) - # shortcut for single task with no queue + steps = report_steps outputs = [] grouped_workspace_type = WorkspaceType.PRIVATE @@ -311,16 +307,15 @@ def tasks_by_node(self, node_remap=None): else: step = core.execution_step( '%s:body' % node, steps, concurrent_substeps=True) - if len(node_inits) > 0 or len(node_exits) > 0: - steps = [] - if len(node_inits) > 0: - steps.append( - core.execution_step('%s:init' % node, node_inits)) - steps.append(step) - if len(node_exits) > 0: - steps.append( - core.execution_step('%s:exit' % node, node_exits)) - step = core.execution_step(node, steps) + + # Prepend and append setup nets. 
+ node_inits, node_exits = get_setup_nets( + TaskGroup.LOCAL_SETUP, + [t.get_step() for t in tasks] + report_steps, + self, + ) + step = add_setup_steps(step, node_inits, node_exits, node) + Task( node=node, step=step, outputs=outputs, name='grouped_by_node', @@ -582,11 +577,6 @@ def get_step(self): Task.TASK_SETUP, [self._step] + report_steps, self) instance_init_nets, instance_exit_nets = get_setup_nets( Task.TASK_INSTANCE_SETUP, [self._step] + report_steps, self) - if len(self._outputs) == 0: - output_net = core.Net('%s:output' % self.name) - self.add_output(output_net.ConstantFill( - [], 1, dtype=core.DataType.INT32, value=0)) - task_exit_nets.append(output_net) # Add instance-level report steps body = self._step if not report_steps else core.execution_step( From 23af7deea7efa1172a54133cba5d13358cdf3cc0 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 29 Aug 2018 22:33:28 -0700 Subject: [PATCH 38/42] Add has_lapack flag (#11024) Summary: Currently our `skipIfLapack` has uses a try-catch block and regex match the error message. It is highly unreliable. This PR adds `hasLAPACK` and `hasMAGMA` on ATen context, and expose the flags to python. Also fixes refcounting bug with `PyModule_AddObject`. The method steals reference, but we didn't `Py_INCREF` in some places before calling it with `Py_True` or `Py_False`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11024 Differential Revision: D9564898 Pulled By: SsnL fbshipit-source-id: f46862ec3558d7e0058ef48991cd9c720cb317e2 --- aten/src/ATen/Context.cpp | 10 ++++++++++ aten/src/ATen/Context.h | 12 ++++++++++++ aten/src/ATen/cuda/detail/CUDAHooks.cpp | 10 +++++++++- aten/src/ATen/cuda/detail/CUDAHooks.h | 1 + aten/src/ATen/detail/CUDAHooksInterface.h | 4 ++++ test/common.py | 8 +++----- torch/csrc/Module.cpp | 22 +++++++++++++++------- torch/csrc/cuda/Module.cpp | 11 +++++------ 8 files changed, 59 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index a2c3fb40a7d415..5b420d87b34fc1 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -11,6 +11,8 @@ #include "ATen/CPUGenerator.h" #include "ATen/RegisterCPU.h" +#include "TH/TH.h" // for USE_LAPACK + #ifdef USE_SSE3 #include #endif @@ -80,6 +82,14 @@ bool Context::hasMKL() const { #endif } +bool Context::hasLAPACK() const { +#ifdef USE_LAPACK + return true; +#else + return false; +#endif +} + bool Context::setFlushDenormal(bool on) { #ifdef USE_SSE3 // Setting flush-to-zero (FTZ) flag diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 5584963fefe57f..bab1fa5dc5d069 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -50,6 +50,10 @@ class AT_API Context { return *generator; } bool hasMKL() const; + bool hasLAPACK() const; + bool hasMAGMA() const { + return detail::getCUDAHooks().hasMAGMA(); + } bool hasCUDA() const { return detail::getCUDAHooks().hasCUDA(); } @@ -158,6 +162,14 @@ static inline bool hasMKL() { return globalContext().hasMKL(); } +static inline bool hasLAPACK() { + return globalContext().hasLAPACK(); +} + +static inline bool hasMAGMA() { + return globalContext().hasMAGMA(); +} + static inline int64_t current_device() { return globalContext().current_device(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 7d73fafc994da5..570a375e3888a3 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -69,7 +69,7 @@ DynamicCUDAInterfaceSetter _; // let's not if we 
don't need to!) std::unique_ptr CUDAHooks::initCUDA() const { THCState* thc_state = THCState_alloc(); - + THCudaInit(thc_state); return std::unique_ptr( thc_state, [](THCState* p) { @@ -92,6 +92,14 @@ bool CUDAHooks::hasCUDA() const { return true; } +bool CUDAHooks::hasMAGMA() const { +#ifdef USE_MAGMA + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 766ab62b8ef79f..491adfc4d73f1a 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -13,6 +13,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { std::unique_ptr initCUDA() const override; std::unique_ptr initCUDAGenerator(Context*) const override; bool hasCUDA() const override; + bool hasMAGMA() const override; bool hasCuDNN() const override; int64_t current_device() const override; Allocator* getPinnedMemoryAllocator() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 6b2e87c4f762af..cccf6dc28453dc 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -65,6 +65,10 @@ struct AT_API CUDAHooksInterface { return false; } + virtual bool hasMAGMA() const { + return false; + } + virtual bool hasCuDNN() const { return false; } diff --git a/test/common.py b/test/common.py index 545ba4f1f0dd22..e7d6940ea56cc1 100644 --- a/test/common.py +++ b/test/common.py @@ -112,12 +112,10 @@ def wrapper(*args, **kwargs): def skipIfNoLapack(fn): @wraps(fn) def wrapper(*args, **kwargs): - try: + if not torch._C.has_lapack: + raise unittest.SkipTest('PyTorch compiled without Lapack') + else: fn(*args, **kwargs) - except Exception as e: - if 'Lapack library not found' in repr(e): - raise unittest.SkipTest('Compiled without Lapack') - raise return wrapper diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index af367c3e544905..e17997e6e9baba 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -584,13 +584,20 @@ static PyObject* initModule() { ASSERT_TRUE(THCPStream_init(module)); #endif + auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { + // PyModule_AddObject steals reference + if (incref) { + Py_INCREF(v); + } + return PyModule_AddObject(module, name, v) == 0; + }; + #ifdef USE_CUDNN PyObject *has_cudnn = Py_True; #else PyObject *has_cudnn = Py_False; #endif - Py_INCREF(has_cudnn); - ASSERT_TRUE(PyModule_AddObject(module, "has_cudnn", has_cudnn) == 0); + ASSERT_TRUE(set_module_attr("has_cudnn", has_cudnn)); #ifdef USE_DISTRIBUTED_MW // See comment on CUDA objects @@ -611,19 +618,20 @@ static PyObject* initModule() { // Set ATen warnings to issue Python warnings at::Warning::set_warning_handler(&warning_handler); - ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("has_mkl", at::hasMKL() ? Py_True : Py_False)); + ASSERT_TRUE(set_module_attr("has_lapack", at::hasLAPACK() ? Py_True : Py_False)); #ifdef _GLIBCXX_USE_CXX11_ABI - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", - _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", _GLIBCXX_USE_CXX11_ABI ? 
Py_True : Py_False)); #else - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", Py_False)); #endif auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); - ASSERT_TRUE(PyModule_AddObject(module, "default_generator", (PyObject*)THPDefaultGenerator) == 0); + // This reference is meant to be given away, so no need to incref here. + ASSERT_TRUE(set_module_attr("default_generator", (PyObject*)THPDefaultGenerator, /* incref= */ false)); #ifdef USE_NUMPY if (_import_array() < 0) return NULL; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index a4fcc6c45e874d..8fd95eda86f121 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -333,16 +333,15 @@ static PyObject * THCPModule_initExtension(PyObject *self) THCPCharStorage_postInit(m); THCPByteStorage_postInit(m); -#ifdef USE_MAGMA - THCMagma_init(state); - bool has_magma = true; -#else - bool has_magma = false; -#endif + bool has_magma = at::hasMAGMA(); + if (has_magma) { + THCMagma_init(state); + } bool has_half = true; auto set_module_attr = [&](const char* name, PyObject* v) { + // PyObject_SetAttrString doesn't steal reference. So no need to incref. if (PyObject_SetAttrString(m, name, v) < 0) { throw python_error(); } From dbc0004f995556b288b43077458b61683ac93855 Mon Sep 17 00:00:00 2001 From: Tullie Murrell Date: Wed, 29 Aug 2018 23:40:15 -0700 Subject: [PATCH 39/42] Remove use_count() == 1 in Tensor::Extend (#11046) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11046 As suggested by jerryzh168, temporary fix for a new constraint that was added D9350686 is to remove this assert. Long term jerryzh168 is going to work out a better way of handling this. 
Reviewed By: jerryzh168 Differential Revision: D9566323 fbshipit-source-id: e4630c7cbe0cc68a084974ea7048654811fae01f --- caffe2/core/tensor.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 21dc126c7f2c0b..f1934f5ddbc28d 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -275,9 +275,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); - CAFFE_ENFORCE( - storage_.use_count() == 1, - "Can't call Extend on shared storage, please call Resize instead"); auto newDims = dims_; newDims[0] += num; if (!storage_->data()) { From a8af7fe46ab1e79066f418b4cc79a5fd56316df5 Mon Sep 17 00:00:00 2001 From: Xingdong Zuo Date: Thu, 30 Aug 2018 08:09:23 -0700 Subject: [PATCH 40/42] Support import of `nn.RNNCellBase` in `__all__` Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10992 Differential Revision: D9572005 Pulled By: soumith fbshipit-source-id: 26b546830b6a25a4f7ba6f825cd888d678233a97 --- torch/nn/modules/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 01ce9bf2ac7cab..f2683756ce59f8 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -43,10 +43,10 @@ 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', 'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', - 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', - 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance', - 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', - 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', + 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', + 'LSTMCell', 'GRUCell', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', + 'PairwiseDistance', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', + 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', 'AdaptiveLogSoftmaxWithLoss', ] From e7195431e0dffa7a065b65286ce414ac68f6bd7d Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Thu, 30 Aug 2018 09:47:16 -0700 Subject: [PATCH 41/42] Add benchmarking functionality to the benchmark app (#10976) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10976 The app can run in XCode with the benchmark metrics collected. 
It can also run when building with buck Reviewed By: llyfacebook Differential Revision: D9546755 fbshipit-source-id: 60ad0112946f8cf57138417f6838a58ed6d2c90f --- binaries/benchmark_helper.cc | 88 +++++++++++++++++++++++++++++++++++- binaries/benchmark_helper.h | 18 ++++++++ binaries/caffe2_benchmark.cc | 55 ++++++---------------- 3 files changed, 118 insertions(+), 43 deletions(-) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 7a441dc1c5c2ee..daf3ccac90eecf 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -14,8 +14,9 @@ * limitations under the License. */ -#include #include +#include +#include #include #include "binaries/benchmark_helper.h" @@ -309,3 +310,88 @@ void writeOutput( } } } + +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache) { + caffe2::GlobalInit(&argc, &argv); + // Check arguments to be correct + { + // Need to check whether file exists, as the file reader does not assert if + // file does not exist + std::ifstream net_file(FLAGS_net); + CAFFE_ENFORCE(net_file.good()); + + std::ifstream init_net_file(FLAGS_init_net); + CAFFE_ENFORCE(init_net_file.good()); + + if (FLAGS_input_file.size() > 0) { + vector input_files = caffe2::split(',', FLAGS_input_file); + for (auto input_file : input_files) { + std::ifstream ifile(input_file); + CAFFE_ENFORCE(ifile.good()); + } + } + } + + observerConfig(); + caffe2::ShowLogInfoToStderr(); + + auto workspace = std::make_shared(new caffe2::Workspace()); + bool run_on_gpu = backendCudaSet(FLAGS_backend); + // Run initialization network. + caffe2::NetDef init_net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); + setOperatorEngine(&init_net_def, FLAGS_backend); + CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); + + // Run main network. 
+ caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); + setOperatorEngine(&net_def, FLAGS_backend); + + map tensor_protos_map; + + loadInput( + workspace, + run_on_gpu, + tensor_protos_map, + FLAGS_input, + FLAGS_input_file, + FLAGS_input_dims, + FLAGS_input_type); + + runNetwork( + workspace, + net_def, + tensor_protos_map, + FLAGS_wipe_cache, + FLAGS_run_individual, + FLAGS_warmup, + FLAGS_iter, + FLAGS_sleep_before_run); + + writeOutput( + workspace, + run_on_gpu, + FLAGS_output, + FLAGS_output_folder, + FLAGS_text_output); + + return 0; +} diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index df23ed8651118a..5bf79182dab7e1 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -98,3 +98,21 @@ void runNetwork( const int, const int, const int); +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc index c5a93ae7cbae33..38badccfa1e4bb 100644 --- a/binaries/caffe2_benchmark.cc +++ b/binaries/caffe2_benchmark.cc @@ -77,51 +77,22 @@ CAFFE2_DEFINE_bool( "Whether to evict the cache before running network."); int main(int argc, char** argv) { - caffe2::GlobalInit(&argc, &argv); - - observerConfig(); - caffe2::ShowLogInfoToStderr(); - - auto workspace = make_shared(new caffe2::Workspace()); - bool run_on_gpu = backendCudaSet(caffe2::FLAGS_backend); - // Run initialization network. - caffe2::NetDef init_net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def)); - setOperatorEngine(&init_net_def, caffe2::FLAGS_backend); - CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); - - // Run main network. 
- caffe2::NetDef net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); - setOperatorEngine(&net_def, caffe2::FLAGS_backend); - - map tensor_protos_map; - - loadInput( - workspace, - run_on_gpu, - tensor_protos_map, + benchmark( + argc, + argv, + caffe2::FLAGS_backend, + caffe2::FLAGS_init_net, caffe2::FLAGS_input, - caffe2::FLAGS_input_file, caffe2::FLAGS_input_dims, - caffe2::FLAGS_input_type); - - runNetwork( - workspace, - net_def, - tensor_protos_map, - caffe2::FLAGS_wipe_cache, - caffe2::FLAGS_run_individual, - caffe2::FLAGS_warmup, + caffe2::FLAGS_input_file, + caffe2::FLAGS_input_type, caffe2::FLAGS_iter, - caffe2::FLAGS_sleep_before_run); - - writeOutput( - workspace, - run_on_gpu, + caffe2::FLAGS_net, caffe2::FLAGS_output, caffe2::FLAGS_output_folder, - caffe2::FLAGS_text_output); - - return 0; + caffe2::FLAGS_run_individual, + caffe2::FLAGS_sleep_before_run, + caffe2::FLAGS_text_output, + caffe2::FLAGS_warmup, + caffe2::FLAGS_wipe_cache); } From 535633bddc9e52bb70b5be0dce80002e55d07cbd Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Thu, 30 Aug 2018 10:31:09 -0700 Subject: [PATCH 42/42] Export MPI functions (#11037) Summary: Potential fix for https://github.com/caffe2/caffe2/issues/2551#issuecomment-417124872 cc Yangqing mingzhe09088 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11037 Reviewed By: mingzhe09088 Differential Revision: D9580937 Pulled By: orionr fbshipit-source-id: 5e1fbf718728271a5b5af526d8e67cc5b48f0575 --- caffe2/mpi/mpi_common.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/caffe2/mpi/mpi_common.h b/caffe2/mpi/mpi_common.h index 3e1e7a5625bd2e..b283a0aea382c3 100644 --- a/caffe2/mpi/mpi_common.h +++ b/caffe2/mpi/mpi_common.h @@ -4,6 +4,7 @@ #include #include +#include "caffe2/core/common.h" #include "caffe2/core/logging.h" namespace caffe2 { @@ -29,7 +30,7 @@ MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE) #undef MPI_DATATYPE_WRAPPER // For all Caffe MPI calls, we will wrap it inside an MPI mutex lock guard. -std::mutex& MPIMutex(); +CAFFE2_API std::mutex& MPIMutex(); #define MPI_CHECK(condition) \ do { \ @@ -49,23 +50,23 @@ std::mutex& MPIMutex(); * @brief Gets the global MPI communicator used by Caffe2. In default, this * is MPI_COMM_WORLD unless you call SetGlobalMPIComm(). */ -MPI_Comm GlobalMPIComm(); +CAFFE2_API MPI_Comm GlobalMPIComm(); /** * @brief Sets the global MPI communicator. Caffe2 takes over the ownership * of the passed in communicator. */ -void SetGlobalMPIComm(MPI_Comm new_comm); +CAFFE2_API void SetGlobalMPIComm(MPI_Comm new_comm); /** * @brief A helper function to return the size of the given communicator. */ -int MPICommSize(MPI_Comm comm); +CAFFE2_API int MPICommSize(MPI_Comm comm); /** * @brief A helper function to return the rank of the given communicator. */ -int MPICommRank(MPI_Comm comm); +CAFFE2_API int MPICommRank(MPI_Comm comm); /** * @brief A simple wrapper over an MPI common world.