From 7169906249fb57e0a9be00902abcb26457dcc14c Mon Sep 17 00:00:00 2001 From: zou3519 Date: Wed, 29 Aug 2018 09:32:08 -0700 Subject: [PATCH 01/42] torch.digamma (#10967) Summary: Fixes #10307 cc SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/10967 Differential Revision: D9546748 Pulled By: zou3519 fbshipit-source-id: 764e27b1cc8dd487270b3ffa653b806c86f717dd --- docs/source/torch.rst | 1 + torch/_torch_docs.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/docs/source/torch.rst b/docs/source/torch.rst index fa2f92092758a4..d385ff07d323d5 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -169,6 +169,7 @@ Pointwise Ops .. autofunction:: cos .. autofunction:: cosh .. autofunction:: div +.. autofunction:: digamma .. autofunction:: erf .. autofunction:: erfc .. autofunction:: erfinv diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 6561c7a7c23889..27f111a471fa16 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1168,6 +1168,26 @@ def parse_kwargs(desc): [ 1.0500, 0.7336, -0.3836, -1.1015]]]) """) +add_docstr(torch.digamma, + r""" +digamma(input) -> Tensor + +Computes the logarithmic derivative of the gamma function on `input`. + +.. math:: + \psi(x) = \frac{d}{dx} \ln\left(\Gamma\left(x\right)\right) = \frac{\Gamma'(x)}{\Gamma(x)} + +Args: + input (Tensor): the tensor to compute the digamma function on + +Example:: + + >>> a = torch.tensor([1, 0.5]) + >>> torch.digamma(a) + tensor([-0.5772, -1.9635]) +""") + + add_docstr(torch.dist, r""" dist(input, other, p=2) -> Tensor From b41988c71ed7d40af7a314b2049a4b0d5909fed2 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Wed, 29 Aug 2018 10:02:12 -0700 Subject: [PATCH 02/42] Cleanup BUILD_DOCS cmake section (#11000) Summary: Breaking out of https://github.com/pytorch/pytorch/pull/8338 cc mingzhe09088 Yangqing Pull Request resolved: https://github.com/pytorch/pytorch/pull/11000 Differential Revision: D9557474 Pulled By: orionr fbshipit-source-id: 7d84914b67ff37bdb7738f9b7846dfeb5b975c00 --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1009e5a4ec30f7..75b4bf7b4512d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,7 +306,7 @@ if(BUILD_DOCS) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs) - endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) + endif() file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs) configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY) @@ -323,10 +323,10 @@ if(BUILD_DOCS) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Generating Python API documentation with Doxygen" VERBATIM) - else (DOXYGEN_FOUND) + else() message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation") - endif (DOXYGEN_FOUND) -endif (BUILD_DOCS) + endif() +endif() # ---[ CMake related files # Uninistall option. From a9469c9c8ab046a7961c1c357d84f60063507c4b Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 29 Aug 2018 10:48:04 -0700 Subject: [PATCH 03/42] Fill eigenvector with zeros if not required (#10645) Summary: Fix #10345, which only happens in CUDA case. * Instead of returning some random buffer, we fill it with zeros. * update torch.symeig doc. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10645 Reviewed By: soumith Differential Revision: D9395762 Pulled By: ailzhang fbshipit-source-id: 0f3ed9bb6a919a9c1a4b8eb45188f65a68bfa9ba --- aten/src/THC/generic/THCTensorMathMagma.cu | 8 +++++- test/test_cuda.py | 12 +-------- test/test_torch.py | 30 +++++++++++++++------- torch/_torch_docs.py | 9 +++++++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index aee04a8e22a4e4..3b63c3ae1c7b2f 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -235,7 +235,13 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T else if (info < 0) THError("MAGMA syev : Argument %d : illegal value", -info); } - THCTensor_(freeCopyTo)(state, input, rv_); + if (jobzs[0] == 'N') { + // If eigenvector is not needed, fill the result with zeros. + THCTensor_(zero)(state, rv_); + THCTensor_(free)(state, input); + } else { + THCTensor_(freeCopyTo)(state, input, rv_); + } #else THError(NoMagma(syev)); #endif diff --git a/test/test_cuda.py b/test/test_cuda.py index 73ba3880697b1f..088919ad595a9d 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1734,17 +1734,7 @@ def test(use_double=False): @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") def test_symeig(self): - # Small case - tensor = torch.randn(3, 3).cuda() - tensor = torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) - - # Large case - tensor = torch.randn(257, 257).cuda() - tensor = torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) + TestTorch._test_symeig(self, lambda t: t.cuda()) def test_arange(self): for t in ['IntTensor', 'LongTensor', 'FloatTensor', 'DoubleTensor']: diff --git a/test/test_torch.py b/test/test_torch.py index 3bab1927a5de9c..34b8256763c658 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -4279,13 +4279,12 @@ def test_eig(self): Xhat = torch.mm(torch.mm(v, torch.diag(e.select(1, 0))), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') - @skipIfNoLapack - @skipIfRocm - def test_symeig(self): - xval = torch.rand(100, 3) + @staticmethod + def _test_symeig(self, conv_fn): + xval = conv_fn(torch.rand(100, 3)) cov = torch.mm(xval.t(), xval) - rese = torch.zeros(3) - resv = torch.zeros(3, 3) + rese = conv_fn(torch.zeros(3)) + resv = conv_fn(torch.zeros(3, 3)) # First call to symeig self.assertTrue(resv.is_contiguous(), 'resv is not contiguous') @@ -4299,17 +4298,30 @@ def test_symeig(self): ahat = torch.mm(torch.mm(resv, torch.diag(rese)), resv.t()) self.assertEqual(cov, ahat, 1e-8, 'VeV\' wrong') + # test eigenvectors=False + rese2 = conv_fn(torch.zeros(3)) + resv2 = conv_fn(torch.randn(3, 3)) + expected_resv2 = conv_fn(torch.zeros(3, 3)) + torch.symeig(cov.clone(), False, out=(rese2, resv2)) + self.assertEqual(rese, rese2) + self.assertEqual(resv2, expected_resv2) + # test non-contiguous - X = torch.rand(5, 5) + X = conv_fn(torch.rand(5, 5)) X = X.t() * X - e = torch.zeros(4, 2).select(1, 1) - v = torch.zeros(4, 2, 4)[:, 1] + e = conv_fn(torch.zeros(4, 2)).select(1, 1) + v = conv_fn(torch.zeros(4, 2, 4))[:, 1] self.assertFalse(v.is_contiguous(), 'V is contiguous') self.assertFalse(e.is_contiguous(), 'E is contiguous') 
torch.symeig(X, True, out=(e, v)) Xhat = torch.mm(torch.mm(v, torch.diag(e)), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') + @skipIfNoLapack + @skipIfRocm + def test_symeig(self): + self._test_symeig(self, lambda x: x) + @skipIfNoLapack def test_svd(self): a = torch.Tensor(((8.79, 6.11, -9.15, 9.57, -3.49, 9.84), diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 27f111a471fa16..fd70fd20f5f450 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4459,6 +4459,15 @@ def parse_kwargs(desc): upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region out (tuple, optional): the output tuple of (Tensor, Tensor) +Returns: + (Tensor, Tensor): A tuple containing + + - **e** (*Tensor*): Shape :math:`(m)`. Each element is an eigenvalue of ``input``, + The eigenvalues are in ascending order. + - **V** (*Tensor*): Shape :math:`(m \times m)`. + If ``eigenvectors=False``, it's a tensor filled with zeros. + Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. + Examples:: From 1b0d5e60abe8eae3ebaaa3c16eb387314b455d5c Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 29 Aug 2018 10:59:05 -0700 Subject: [PATCH 04/42] Get rid of some unnecessary includes of Context. (#10951) Summary: This is part of splitting Context from what needs to go in ATen/core. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10951 Differential Revision: D9540369 Pulled By: gchanan fbshipit-source-id: 73b0e8c4493785fbab368a989f46137c51f6ea0b --- aten/src/ATen/Formatting.cpp | 1 - aten/src/ATen/StorageImpl.cpp | 1 - aten/src/ATen/UndefinedTensor.cpp | 1 - aten/src/ATen/UndefinedType.h | 1 - 4 files changed, 4 deletions(-) diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp index ef04cc4bdfd975..dcdf7653f2308b 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/Formatting.cpp @@ -1,6 +1,5 @@ #include "ATen/Formatting.h" #include "ATen/Tensor.h" -#include "ATen/Context.h" #include "ATen/TensorMethods.h" #include diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index 233540bfa06f28..af488472f24b5b 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -1,4 +1,3 @@ -#include #include namespace at { diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index 79f58479e90b52..f50a4e71da9cae 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -1,5 +1,4 @@ #include "ATen/UndefinedTensor.h" -#include "ATen/Context.h" #include "ATen/core/Error.h" namespace at { diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 9ca00cfb516ff7..2cb14a3a652c4f 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -1,7 +1,6 @@ #pragma once #include "ATen/Type.h" -#include "ATen/Context.h" #include "ATen/CheckGenerator.h" #ifdef _MSC_VER From 562fc7631ff8b25487c9a3886f57b74bd7008c97 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Wed, 29 Aug 2018 10:59:29 -0700 Subject: [PATCH 05/42] Add test cases for ONNX unsqueeze (#10924) Summary: PyTorch exporting test and end to end cases. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10924 Reviewed By: Ac2zoom Differential Revision: D9548210 Pulled By: houseroad fbshipit-source-id: 2381d1ad92a4e07f97060eb65c9fd09f60ad3de6 --- .../TestOperators.test_unsqueeze.expect | 54 +++++++++++++++++++ test/onnx/test_operators.py | 4 ++ test/onnx/test_pytorch_onnx_caffe2.py | 12 +++++ 3 files changed, 70 insertions(+) create mode 100644 test/onnx/expect/TestOperators.test_unsqueeze.expect diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect new file mode 100644 index 00000000000000..3a8e01092f8d0b --- /dev/null +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -0,0 +1,54 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + input: "0" + output: "1" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 2 + type: INTS + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 7 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index f476cde7afd935..d8e0b6be0d94a9 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -428,6 +428,10 @@ def test_upsample(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., mode='bilinear'), x) + def test_unsqueeze(self): + x = Variable(torch.randn(3, 4), requires_grad=True) + self.assertONNX(lambda x: x.unsqueeze(len(x.shape)), x) + def test_symbolic_override(self): """Lifted from fast-neural-style: custom implementation of instance norm to be mapped to ONNX operator""" diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 9b31d02d6e385d..349e7fc1eec375 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -798,6 +798,18 @@ def test_convtranspose(self): model = nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, padding=1, output_padding=2) self.run_model_test(model, train=False, batch_size=BATCH_SIZE, atol=1e-7) + def test_unsqueeze(self): + shape = (3, 4, 5) + for dim in range(len(shape) + 1): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.unsqueeze(dim) + x = Variable(torch.randn(*shape)) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + # NB: InstanceNorm model includes unused weights, so skip this in TestCaffe2BackendEmbed # TODO: We should have another pass to eliminate the unused initializers in ONNX models. @skipIfEmbed From 206d52d0e3ad4ef547b5bb566cdf7ca2e7c824ae Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Wed, 29 Aug 2018 11:07:29 -0700 Subject: [PATCH 06/42] Disable smart_tensor_printer_test without glog (#10999) Summary: Breaking out of https://github.com/pytorch/pytorch/pull/8338 This test fails once we start building with `-DUSE_GLOG=OFF` since the non-glog logging case doesn't support flushing or streaming to the right location. For now, we just disable this test in that case. 
cc Yangqing mingzhe09088 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10999 Reviewed By: mingzhe09088 Differential Revision: D9557488 Pulled By: orionr fbshipit-source-id: 8b306f210411dfc8ccc404bdccf77ddcd36a4830 --- caffe2/utils/smart_tensor_printer_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc index 82a59ad60aa950..e207f7c7b05284 100644 --- a/caffe2/utils/smart_tensor_printer_test.cc +++ b/caffe2/utils/smart_tensor_printer_test.cc @@ -39,6 +39,9 @@ void printTensorAndCheck(const std::vector& values) { expect_stderr_contains(values); } +// We need real glog for this test to pass +#ifdef CAFFE2_USE_GOOGLE_GLOG + #if !(__APPLE__) // TODO(janusz): thread_local does not work under mac. TEST(SmartTensorPrinterTest, SimpleTest) { @@ -48,4 +51,6 @@ TEST(SmartTensorPrinterTest, SimpleTest) { #endif // !(__APPLE__) +#endif // CAFFE2_USE_GOOGLE_GLOG + } // namespace caffe2 From e0dbb91060f1d9dbc45ae8f37b8613e487e2e4b0 Mon Sep 17 00:00:00 2001 From: Mingzhe Li Date: Wed, 29 Aug 2018 11:26:56 -0700 Subject: [PATCH 07/42] Windows raw string fix (#10998) Summary: Breaking this out of https://github.com/pytorch/pytorch/pull/8338 mingzhe09088's fix of the docstrings for Windows builds. Unfortunately some versions of Windows seem to try and parse the `#` inside the string as a pre-processor declaration. We might need to change this to something else later, but want to get this landed first. cc mingzhe09088 Yangqing Pull Request resolved: https://github.com/pytorch/pytorch/pull/10998 Reviewed By: mingzhe09088 Differential Revision: D9557480 Pulled By: orionr fbshipit-source-id: c6a6237c27b7cf35c81133fd9faefead675a9f59 --- caffe2/operators/concat_split_op.cc | 4 ++-- caffe2/operators/conv_op.cc | 10 +++++----- caffe2/operators/conv_transpose_op.cc | 10 +++++----- caffe2/operators/counter_ops.cc | 10 +++++----- caffe2/operators/cross_entropy_op.cc | 20 +++++++++---------- caffe2/operators/distance_op.cc | 20 +++++++++---------- caffe2/operators/elementwise_linear_op.cc | 12 +++++------ caffe2/operators/elementwise_logical_ops.cc | 4 ++-- caffe2/operators/elementwise_sum_op.cc | 2 +- caffe2/operators/filler_op.cc | 8 ++++---- caffe2/operators/fully_connected_op.cc | 16 +++++++-------- caffe2/operators/gather_op.cc | 2 +- .../local_response_normalization_op.cc | 2 +- caffe2/operators/lp_pool_op.cc | 2 +- caffe2/operators/lpnorm_op.cc | 2 +- caffe2/operators/pool_op.cc | 4 ++-- caffe2/operators/reduction_ops.cc | 16 +++++++-------- caffe2/operators/relu_op.cc | 2 +- caffe2/operators/sparse_to_dense_mask_op.cc | 4 ++-- caffe2/operators/sparse_to_dense_op.cc | 2 +- caffe2/operators/stats_ops.cc | 4 ++-- caffe2/operators/utility_ops.cc | 12 +++++------ 22 files changed, 84 insertions(+), 84 deletions(-) diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index a8f4c91e7e5404..31256026028dfa 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -311,8 +311,8 @@ op = core.CreateOperator( axis=3 ) -workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW -workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW +workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW +workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW print("X1:", workspace.FetchBlob("X1")) print("X2:", workspace.FetchBlob("X2")) workspace.RunOperatorOnce(op) diff --git 
a/caffe2/operators/conv_op.cc b/caffe2/operators/conv_op.cc index 082c94fb6c18fb..30fb79d3846942 100644 --- a/caffe2/operators/conv_op.cc +++ b/caffe2/operators/conv_op.cc @@ -42,24 +42,24 @@ op = core.CreateOperator( stride=2 ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(1,1,8,8).astype(np.float32) print("Data shape: ",data.shape) -# Create W: (M,C,Kh,Kw) +// Create W: (M,C,Kh,Kw) filters = np.random.randn(3,1,5,5).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.,1.,1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/conv_transpose_op.cc b/caffe2/operators/conv_transpose_op.cc index 57ec02b63ea0dd..7de16afaed9158 100644 --- a/caffe2/operators/conv_transpose_op.cc +++ b/caffe2/operators/conv_transpose_op.cc @@ -44,24 +44,24 @@ op = core.CreateOperator( strides=[2,2] ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(2,3,5,5).astype(np.float32) print("Data shape: ",data.shape) -# Create filter: (M,C,Kh,Kw) +// Create filter: (M,C,Kh,Kw) filters = np.random.randn(3,1,2,2).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/counter_ops.cc b/caffe2/operators/counter_ops.cc index 15cdab5849cc1f..50e4b9448af310 100644 --- a/caffe2/operators/counter_ops.cc +++ b/caffe2/operators/counter_ops.cc @@ -58,22 +58,22 @@ resetcounter_op = core.CreateOperator( ) -# Create counter +// Create counter workspace.RunOperatorOnce(createcounter_op) print("'counter' pointer:", workspace.FetchBlob("counter")) -# Retrieve initial counter value +// Retrieve initial counter value workspace.RunOperatorOnce(retrievecount_op) print("Initial 'count':", workspace.FetchBlob("count")) -# Check if counter is done +// Check if counter is done workspace.RunOperatorOnce(checkcounterdone_op) print("Initial 'done' value:", workspace.FetchBlob("done")) -# Test CountUp operator +// Test CountUp operator print("\nTesting CountUp operator...") for i in range(5): workspace.RunOperatorOnce(countup_op) @@ -83,7 +83,7 @@ workspace.RunOperatorOnce(retrievecount_op) print("'count' value after CountUp test:", workspace.FetchBlob("count")) -# Test CountDown operator +// Test CountDown operator print("\nTesting CountDown operator...") for i in range(11): workspace.RunOperatorOnce(countdown_op) diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc index 584b7abd5a183f..0473e7d4e435b3 100644 --- a/caffe2/operators/cross_entropy_op.cc +++ b/caffe2/operators/cross_entropy_op.cc @@ -401,22 +401,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors 
+// Create label: Sample 1-hot ground truth label vectors label = np.array([4,2]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -635,22 +635,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors +// Create label: Sample 1-hot ground truth label vectors label = np.array([[0.,0.,0.,0.,1.],[0.,0.,1.,0.,0.]]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index d9abfa0e254336..9a38a4a77a0043 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -437,22 +437,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = 5*np.ones((1, 4)) print("X:\n",X) -# Create Y +// Create Y Y = np.ones((1, 4)) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` @@ -645,22 +645,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = np.random.randn(3, 3) print("X:\n",X) -# Create Y +// Create Y Y = np.random.randn(3, 3) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` diff --git a/caffe2/operators/elementwise_linear_op.cc b/caffe2/operators/elementwise_linear_op.cc index d68bfbc5a0eb93..371aae78a25201 100644 --- a/caffe2/operators/elementwise_linear_op.cc +++ b/caffe2/operators/elementwise_linear_op.cc @@ -112,28 +112,28 @@ op = core.CreateOperator( ["Y"] ) -# Create X +// Create X X = np.array([[1,2,3,4,5],[6,8,9,16,10]]) print("X:\n",X) -# Create w +// Create w w = np.array([1,1/2.,1/3.,1/4.,1/5.]) print("w:\n",w) -# Create b +// Create b b = np.array([1.,1.,1.,1.,1.]) print("b:\n",b) -# Feed X & w & b into workspace +// Feed X & w & b into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("w", w.astype(np.float32)) workspace.FeedBlob("b", b.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/elementwise_logical_ops.cc b/caffe2/operators/elementwise_logical_ops.cc index 5ddd4570356e9d..0e2da569dcb11f 100644 --- a/caffe2/operators/elementwise_logical_ops.cc +++ b/caffe2/operators/elementwise_logical_ops.cc @@ -63,7 +63,7 @@ op = core.CreateOperator( value=[0,2,4,6,8], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("X", 
np.array([0,1,2,3,4,5,6,7,8]).astype(np.int32)) print("X:\n", workspace.FetchBlob("X")) @@ -75,7 +75,7 @@ print("Y: \n", workspace.FetchBlob("Y")) **Result** ``` -# value=[0,2,4,6,8] +// value=[0,2,4,6,8] X: [0 1 2 3 4 5 6 7 8] diff --git a/caffe2/operators/elementwise_sum_op.cc b/caffe2/operators/elementwise_sum_op.cc index 861f4f115c0a41..dee3671f5bdc4a 100644 --- a/caffe2/operators/elementwise_sum_op.cc +++ b/caffe2/operators/elementwise_sum_op.cc @@ -86,7 +86,7 @@ workspace.ResetWorkspace() op = core.CreateOperator( "Sum", ["A", "B"], - ["A"], # inplace + ["A"], // inplace ) workspace.FeedBlob("A", np.array([[1,2,5],[8,3,4]]).astype(np.float32)) diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc index ff3eac217390a4..c5a121e3a222d6 100644 --- a/caffe2/operators/filler_op.cc +++ b/caffe2/operators/filler_op.cc @@ -298,11 +298,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13.8, dtype=np.float32)) @@ -389,11 +389,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13, dtype=np.int32)) diff --git a/caffe2/operators/fully_connected_op.cc b/caffe2/operators/fully_connected_op.cc index 6fe95eefbac476..e14fec6f8464b8 100644 --- a/caffe2/operators/fully_connected_op.cc +++ b/caffe2/operators/fully_connected_op.cc @@ -182,9 +182,9 @@ Github Links: ``` -# In this example, our batch size is 1 (M=1), the input observation will have -# 6 features (K=6), and the layer will have one hidden node (N=1). The -# expected output is Y=7. +// In this example, our batch size is 1 (M=1), the input observation will have +// 6 features (K=6), and the layer will have one hidden node (N=1). The +// expected output is Y=7. 
workspace.ResetWorkspace() op = core.CreateOperator( @@ -193,23 +193,23 @@ op = core.CreateOperator( ["Y"] ) -# Create X: MxK +// Create X: MxK data = np.array([1,2,3,4,5,6]).astype(np.float32) data = data[np.newaxis,:] -# Create W: NxK +// Create W: NxK weights = np.array(np.array([1,1/2.,1/3.,1/4.,1/5.,1/6.])).astype(np.float32) weights = weights[np.newaxis,:] -# Create b: N +// Create b: N bias = np.array([1.]).astype(np.float32) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("W", weights) workspace.FeedBlob("b", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/gather_op.cc b/caffe2/operators/gather_op.cc index cee268ddafdcbd..34c42bfc983f84 100644 --- a/caffe2/operators/gather_op.cc +++ b/caffe2/operators/gather_op.cc @@ -37,7 +37,7 @@ print("DATA:\n",data) inds = np.array([[0, 1],[1, 2]]) print("INDICES:\n",inds) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("DATA", data.astype(np.float32)) workspace.FeedBlob("INDICES", inds.astype(np.int32)) diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc index 1cba60e86d9787..81499b4a5d6abf 100644 --- a/caffe2/operators/local_response_normalization_op.cc +++ b/caffe2/operators/local_response_normalization_op.cc @@ -342,7 +342,7 @@ op = core.CreateOperator("LRN", order="NHWC" ) -workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc index f877786648350b..f39aaaa6397a3e 100644 --- a/caffe2/operators/lp_pool_op.cc +++ b/caffe2/operators/lp_pool_op.cc @@ -258,7 +258,7 @@ op = core.CreateOperator( p=2.0 ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc index 6af404d1153588..79c35cd83a2148 100644 --- a/caffe2/operators/lpnorm_op.cc +++ b/caffe2/operators/lpnorm_op.cc @@ -100,7 +100,7 @@ op = core.CreateOperator( X = np.array([5., 2.]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/pool_op.cc b/caffe2/operators/pool_op.cc index eca7978e024aac..87d67b17e2b6ce 100644 --- a/caffe2/operators/pool_op.cc +++ b/caffe2/operators/pool_op.cc @@ -764,7 +764,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) @@ -832,7 +832,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", 
workspace.FetchBlob("Y")) diff --git a/caffe2/operators/reduction_ops.cc b/caffe2/operators/reduction_ops.cc index 0d01d50ca000e3..95f15b56a720e9 100644 --- a/caffe2/operators/reduction_ops.cc +++ b/caffe2/operators/reduction_ops.cc @@ -139,17 +139,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -226,17 +226,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/relu_op.cc b/caffe2/operators/relu_op.cc index 03205241efc3e1..0f1abd82396156 100644 --- a/caffe2/operators/relu_op.cc +++ b/caffe2/operators/relu_op.cc @@ -105,7 +105,7 @@ op = core.CreateOperator( ["Y"] ) -workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc index bea0b43d751ccf..d968112c9ecc2d 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.cc +++ b/caffe2/operators/sparse_to_dense_mask_op.cc @@ -48,8 +48,8 @@ vector and `values` tensor into a compacted tensor where the first dimension corresponds to each id provided in mask argument. Missing values are filled with the value of `default_value`. After running this op: - output[j, :] = values[i] # where mask[j] == indices[i] - output[j, ...] = default_value # when mask[j] doesn't appear in indices + output[j, :] = values[i] // where mask[j] == indices[i] + output[j, ...] = default_value // when mask[j] doesn't appear in indices If `lengths` is provided and not empty, and extra "batch" dimension is prepended to the output. diff --git a/caffe2/operators/sparse_to_dense_op.cc b/caffe2/operators/sparse_to_dense_op.cc index 4f6a49796df826..0c9519e6576122 100644 --- a/caffe2/operators/sparse_to_dense_op.cc +++ b/caffe2/operators/sparse_to_dense_op.cc @@ -23,7 +23,7 @@ representation. After running this op: - output[indices[i], :] += values[i] # sum over all indices[i] equal to the index + output[indices[i], :] += values[i] // sum over all indices[i] equal to the index output[j, ...] 
= 0 if j not in indices )DOC") .Input(0, "indices", "1-D int32/int64 tensor of concatenated ids of data") diff --git a/caffe2/operators/stats_ops.cc b/caffe2/operators/stats_ops.cc index 508dd1ae82060a..d07f9cace13636 100644 --- a/caffe2/operators/stats_ops.cc +++ b/caffe2/operators/stats_ops.cc @@ -290,7 +290,7 @@ timergetandend_op = core.CreateOperator( ["nanos"] ) -# Test TimerBegin/TimerGet/TimerEnd +// Test TimerBegin/TimerGet/TimerEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timerget_op) @@ -298,7 +298,7 @@ print("nanos:", workspace.FetchBlob("nanos")) workspace.RunOperatorOnce(timerend_op) -# Test TimerBegin/TimerGetAndEnd +// Test TimerBegin/TimerGetAndEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timergetandend_op) diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index cc7c037a6d332d..eb771974fbf397 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -103,17 +103,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([2,2,2,2,2,2,2,2,2,2]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -508,14 +508,14 @@ op = core.CreateOperator( ["has_elements"], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("tensor", np.random.randn(2, 2).astype(np.float32)) print("tensor:\n", workspace.FetchBlob("tensor")) workspace.RunOperatorOnce(op) print("has_elements: ", workspace.FetchBlob("has_elements"),"\n") -# Use an empty tensor +// Use an empty tensor workspace.FeedBlob("tensor", np.empty(0)) print("tensor:\n", workspace.FetchBlob("tensor")) From 525548fb64308271bd5248598e398eb4035e25f3 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 29 Aug 2018 11:48:49 -0700 Subject: [PATCH 08/42] Move SparseTensorRef to core, change some includes to core. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10964 Differential Revision: D9545021 Pulled By: gchanan fbshipit-source-id: 8ba7e5e3a7bdf24e5aeb4bbc91957c1a6f14d7f0 --- aten/src/ATen/DeviceGuard.h | 4 ++-- aten/src/ATen/TensorOptions.h | 4 ++-- aten/src/ATen/{ => core}/SparseTensorRef.h | 0 aten/src/ATen/native/LegacyBridge.cpp | 2 +- aten/src/ATen/templates/Tensor.h | 6 +++--- aten/src/ATen/templates/TensorMethods.h | 2 +- aten/src/ATen/templates/Type.cpp | 2 +- aten/src/ATen/templates/Type.h | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) rename aten/src/ATen/{ => core}/SparseTensorRef.h (100%) diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index 7adddfca27c9eb..b51d80d22d350f 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include #include #include #include diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index c8717689833408..a598290485196d 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -2,10 +2,10 @@ #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/core/SparseTensorRef.h similarity index 100% rename from aten/src/ATen/SparseTensorRef.h rename to aten/src/ATen/core/SparseTensorRef.h diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 5b73a09ad9b004..07d7e46ff79a56 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 28e8e5381f2933..f426c6753adc36 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,11 +2,11 @@ // ${generated_comment} -#include "ATen/Device.h" +#include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" -#include "ATen/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/TensorAccessor.h" #include "ATen/TensorBase.h" diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 214a5d18316588..e52c597b99eeb7 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -4,7 +4,7 @@ #include "ATen/Tensor.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Type.h" namespace at { diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 40621a9be6e08b..5e5995b9523ba9 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -5,7 +5,7 @@ #include "ATen/ExpandUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorOptions.h" diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index d4972d87a6dfd9..b000029e789ca6 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -3,13 +3,13 @@ // ${generated_comment} #include "ATen/core/ATenGeneral.h" -#include "ATen/Allocator.h" +#include "ATen/core/Allocator.h" #include 
"ATen/core/Deprecated.h" #include "ATen/core/Generator.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" #include "ATen/core/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" From 396dec0e3740fad00461bc0ebcdfae09708693c6 Mon Sep 17 00:00:00 2001 From: zou3519 Date: Wed, 29 Aug 2018 12:02:34 -0700 Subject: [PATCH 09/42] s/spaerse/sparse (#10968) Summary: cc SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/10968 Differential Revision: D9546746 Pulled By: zou3519 fbshipit-source-id: a6a4bb8bb04eccf89c3d90a90259070beb484500 --- torch/_torch_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index fd70fd20f5f450..a9db54d3117842 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4137,7 +4137,7 @@ def parse_kwargs(desc): Constructs a sparse tensors in COO(rdinate) format with non-zero elements at the given :attr:`indices` with the given :attr:`values`. A sparse tensor can be `uncoalesced`, in that case, there are duplicate coordinates in the indices, and the value at that index is the sum of all duplicate value entries: -`torch.spaerse`_. +`torch.sparse`_. Args: indices (array_like): Initial data for the tensor. Can be a list, tuple, From 4e446b85fb5e0b5db0951cc068e423d9caf5beef Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 29 Aug 2018 12:14:46 -0700 Subject: [PATCH 10/42] Make profiler.build_table() O(n) rather than O(n^2) (#10969) Summary: Fixes #10851 Speeds up profiling results dramatically. For the following script: ``` import torch import time ITER = 2000 x = torch.randn(1, 1, requires_grad=True) with torch.autograd.profiler.profile() as prof: y = x for i in range(ITER): y = 3 * y - 2 * y y.backward() start = time.time() print("Done running. Preparing prof") x = str(prof) print("Done preparing prof results") end = time.time() print("Elapsed: {}".format(end - start)) ``` I get 7s before / 0.13s after these changes. cc apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/10969 Differential Revision: D9556129 Pulled By: zou3519 fbshipit-source-id: 26b421686f8a42cdaace6382567d403e6385dc12 --- torch/autograd/profiler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 75e309ac0faf06..c1be47ad494397 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -554,11 +554,11 @@ def build_table(events, sort_by=None, header=None): header_sep = '-' * max_name_length + (' ' + '-' * col_width) * 5 # Have to use a list because nonlocal is Py3 only... - result = [''] + result = [] def append(s): - result[0] += s - result[0] += '\n' + result.append(s) + result.append('\n') # Yes, newline after the end as well # Actual printing if header is not None: @@ -572,4 +572,4 @@ def append(s): append(row_format.format(evt.key, evt.cpu_time_str, evt.cuda_time_str, evt.count, evt.cpu_time_total_str, evt.cuda_time_total_str)) - return result[0] + return ''.join(result) From bed9d41abd27ec991dd66d1b24da22f1a1323033 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 29 Aug 2018 12:35:51 -0700 Subject: [PATCH 11/42] Generate Type::registerCPU as we do register_cuda_types. (#10947) Summary: The goal here is to separate out the base Type into core; as it was done previously we need all derived Types to be defined when we compile the base Type. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10947 Reviewed By: gchanan Differential Revision: D9540025 Pulled By: ezyang fbshipit-source-id: 49f0b5acb3c378348ef3a55780abb73e4ae27edd --- aten/src/ATen/Context.cpp | 3 ++- aten/src/ATen/Context.h | 1 + aten/src/ATen/gen.py | 9 ++++++++- aten/src/ATen/templates/RegisterCPU.cpp | 20 ++++++++++++++++++++ aten/src/ATen/templates/RegisterCPU.h | 10 ++++++++++ aten/src/ATen/templates/Type.cpp | 12 ------------ aten/src/ATen/templates/Type.h | 1 - 7 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 aten/src/ATen/templates/RegisterCPU.cpp create mode 100644 aten/src/ATen/templates/RegisterCPU.h diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index f85996f74c4b76..a2c3fb40a7d415 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -9,6 +9,7 @@ #include #include "ATen/CPUGenerator.h" +#include "ATen/RegisterCPU.h" #ifdef USE_SSE3 #include @@ -34,7 +35,7 @@ Context::Context() generator_registry[static_cast(DeviceType::CPU)] .reset(new CPUGenerator(this)); - Type::registerCPU(this); + register_cpu_types(this); } // TODO: This could be bad juju if someone calls globalContext() in the diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index f2b3a452cfed57..5584963fefe57f 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -114,6 +114,7 @@ class AT_API Context { std::atomic next_id; std::unique_ptr thc_state; friend struct Type; + friend void register_cpu_types(Context * context); friend void register_cuda_types(Context * context); }; diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f859edd3ede3a..53879e56ffb342 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -109,6 +109,9 @@ def check_all_files_written(self): TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") TYPE_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.cpp") +REGISTER_CPU_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.h") +REGISTER_CPU_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.cpp") + REGISTER_CUDA_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.h") REGISTER_CUDA_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.cpp") @@ -340,7 +343,8 @@ def iterate_types(): def declare_outputs(): files = ['Declarations.yaml', 'Type.h', 'Type.cpp', 'Tensor.h', 'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h'] + 'CPUCopy.cpp', 'NativeFunctions.h', + 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: file_manager.will_write(f) cuda_files = ['CUDACopy.cpp', 'RegisterCUDA.cpp', 'RegisterCUDA.h'] @@ -409,6 +413,9 @@ def generate_outputs(): file_manager.write('Type.h', TYPE_H, top_env) file_manager.write('Type.cpp', TYPE_CPP, top_env) + file_manager.write('RegisterCPU.h', REGISTER_CPU_H, top_env) + file_manager.write('RegisterCPU.cpp', REGISTER_CPU_CPP, top_env) + cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) diff --git a/aten/src/ATen/templates/RegisterCPU.cpp b/aten/src/ATen/templates/RegisterCPU.cpp new file mode 100644 index 00000000000000..184af2c8c014da --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.cpp @@ -0,0 +1,20 @@ +#include + +// ${generated_comment} + +#include +#include +#include +#include + +${cpu_type_headers} + +namespace at { + +void register_cpu_types(Context * context) { + ${cpu_type_registrations} + context->type_registry[static_cast(Backend::Undefined)] + 
[static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); +} + +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCPU.h b/aten/src/ATen/templates/RegisterCPU.h new file mode 100644 index 00000000000000..b923c180aac805 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.h @@ -0,0 +1,10 @@ +#pragma once + +// ${generated_comment} + +namespace at { + +class Context; +void register_cpu_types(Context * context); + +} // namespace at diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 5e5995b9523ba9..90dbbb810ee30d 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -9,22 +9,10 @@ #include "ATen/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorOptions.h" -#include "ATen/UndefinedType.h" #include "ATen/DeviceGuard.h" -#include - -#include -${cpu_type_headers} - namespace at { -void Type::registerCPU(Context * context) { - ${cpu_type_registrations} - context->type_registry[static_cast(Backend::Undefined)] - [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); -} - Tensor & Type::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { Tensor b_src; std::tie(b_src) = expand_inplace(self, src, "copy"); diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index b000029e789ca6..884bd3a3bdff76 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -56,7 +56,6 @@ struct AT_API Type { virtual bool is_distributed() const = 0; bool is_variable() const noexcept { return is_variable_; } bool is_undefined() const noexcept { return is_undefined_; } - static void registerCPU(Context * context); virtual Storage storage(bool resizable = false) const = 0; virtual Storage storage(size_t size, bool resizable = false) const = 0; virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; From dbce1c840f36621fa12bb7917123e475c1345341 Mon Sep 17 00:00:00 2001 From: Yanghan Wang Date: Wed, 29 Aug 2018 12:40:04 -0700 Subject: [PATCH 12/42] exposing net_transformer_fun before add grad (#11003) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11003 Need a interface to re-write the graph after the net is built and after adding gradient ops. Reviewed By: aazzolini, harouwu Differential Revision: D9557827 fbshipit-source-id: 2e082f0321c0776e488a29e18047d950948e7c37 --- caffe2/python/data_parallel_model.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 60e5c39bed1318..ae169eef2e6480 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -44,6 +44,7 @@ def Parallelize( param_update_builder_fun=None, optimizer_builder_fun=None, post_sync_builder_fun=None, + pre_grad_net_transformer_fun=None, net_transformer_fun=None, devices=None, rendezvous=None, @@ -91,6 +92,11 @@ def Parallelize( Signature: net_transformer_fun( model, num_devices, device_prefix, device_type) + pre_grad_net_transformer_fun: + Optional function to transform the network similar to + net_transformer_fun, but happens before gradient ops + been add. 
+ Signature: pre_grad_net_transformer_fun(model) post_sync_builder_fun: Function applied after initial parameter sync has been completed, such as keeping multi-precision parameters @@ -234,6 +240,9 @@ def Parallelize( model_helper_obj._computed_param_names =\ list(viewkeys(computed_params_grouped)) + if pre_grad_net_transformer_fun: + pre_grad_net_transformer_fun(model_helper_obj) + if has_parameter_updates: log.info("Adding gradient operators") _AddGradientOperators(devices, model_helper_obj, losses_by_gpu) From ec519e8a4abf8c327fdceb395e2d718955a44e8f Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 12:50:02 -0700 Subject: [PATCH 13/42] Reduce number of elements within test_abs Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10997 Differential Revision: D9556861 Pulled By: cpuhrsch fbshipit-source-id: 986ef275e94fcffcc04a5c1103b8b7bfb4ae3ba5 --- test/test_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_torch.py b/test/test_torch.py index 34b8256763c658..5167ac618bba75 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6107,7 +6107,7 @@ def _test_abs(tensors_dict): _test_abs(self._make_tensors((3, 5, 7), val_range=(0, max_val))) _test_abs(self._make_tensors((2, 2, 5, 8, 2, 3), val_range=(0, max_val))) _test_abs(self._make_tensors((1000, ), val_range=(0, max_val))) - _test_abs(self._make_tensors((30, 30, 30), val_range=(0, max_val))) + _test_abs(self._make_tensors((10, 10, 10), val_range=(0, max_val))) # Checking that the right abs function is called for LongTensor bignumber = 2 ^ 31 + 1 From fa7c81c6403632153412320754ad51ad3b1f58b0 Mon Sep 17 00:00:00 2001 From: Duc Ngo Date: Wed, 29 Aug 2018 12:54:01 -0700 Subject: [PATCH 14/42] nomnigraph - nit - code style update (#10987) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10987 some code style update to make it consistent with fb cpp style Reviewed By: yinghai Differential Revision: D9550130 fbshipit-source-id: 6aef9878676c08e7d384383c95e7ba8c5c9a1bce --- .../nomnigraph/Representations/Compiler.h | 22 ++++----- .../nomnigraph/Representations/ControlFlow.h | 40 ++++++++-------- .../nomnigraph/Representations/NeuralNet.h | 48 ++++++++++--------- .../include/nomnigraph/Support/Common.h | 20 ++++---- 4 files changed, 66 insertions(+), 64 deletions(-) diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h index 8560ff82374d9a..8c24a2e2cb1076 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h @@ -11,15 +11,15 @@ namespace repr { class CAFFE2_API Value { public: enum class ValueKind { Value, Instruction, Data }; - Value(ValueKind K) : Kind(K) {} - Value() : Kind(ValueKind::Value) {} + Value(ValueKind K) : kind_(K) {} + Value() : kind_(ValueKind::Value) {} ValueKind getKind() const { - return Kind; + return kind_; } virtual ~Value() = default; private: - const ValueKind Kind; + const ValueKind kind_; }; class CAFFE2_API Data : public Value { @@ -30,15 +30,15 @@ class CAFFE2_API Data : public Value { } virtual ~Data() = default; size_t getVersion() const { - return Version; + return version_; } void setVersion(size_t version) { - Version = version; + version_ = version; } private: - size_t Version = 0; + size_t version_ = 0; }; class CAFFE2_API Instruction : public Value { @@ -52,18 +52,18 @@ class CAFFE2_API Instruction : public 
Value { TerminatorEnd, Phi }; - Instruction() : Value(ValueKind::Instruction), Op(Opcode::Generic) {} - Instruction(Opcode op) : Value(ValueKind::Instruction), Op(op) {} + Instruction() : Value(ValueKind::Instruction), op_(Opcode::Generic) {} + Instruction(Opcode op) : Value(ValueKind::Instruction), op_(op) {} CAFFE2_API static bool classof(const Value* V) { return V->getKind() == ValueKind::Instruction; } virtual ~Instruction() = default; Opcode getOpcode() const { - return Op; + return op_; } private: - Opcode Op; + Opcode op_; }; class CAFFE2_API Terminator : public Instruction { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h index 835f187febf15d..1934b1f1b7bad4 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h @@ -19,45 +19,45 @@ class CAFFE2_API BasicBlock { using NodeRef = typename Subgraph::NodeRef; BasicBlock() {} ~BasicBlock() { - for (auto pair : callbacks) { + for (auto pair : callbacks_) { pair.first->deleteDestructorCallback(pair.second); } } void trackNode(NodeRef node) { - callbacks[node] = node->registerDestructorCallback([&](NodeRef n) { + callbacks_[node] = node->registerDestructorCallback([&](NodeRef n) { assert( hasInstruction(n) && "Destructor callback invoked on untracked node in BasicBlock."); deleteInstruction(n); }); - Nodes.addNode(node); + nodes_.addNode(node); } void untrackNode(NodeRef node) { - callbacks.erase(node); - Nodes.removeNode(node); + callbacks_.erase(node); + nodes_.removeNode(node); } void pushInstructionNode(NodeRef node) { assert( isa(node->data()) && "Cannot push non-instruction node to basic block."); - Instructions.emplace_back(node); + instructions_.emplace_back(node); trackNode(node); } const std::vector& getInstructions() { - return Instructions; + return instructions_; } bool hasInstruction(NodeRef instr) const { - return Nodes.hasNode(instr); + return nodes_.hasNode(instr); } void insertInstructionBefore(NodeRef newInstr, NodeRef instr) { auto it = - std::find(std::begin(Instructions), std::end(Instructions), instr); - Instructions.insert(it, newInstr); + std::find(std::begin(instructions_), std::end(instructions_), instr); + instructions_.insert(it, newInstr); trackNode(newInstr); } @@ -65,28 +65,28 @@ class CAFFE2_API BasicBlock { assert(hasInstruction(instr1) && "Instruction not in basic block."); assert(hasInstruction(instr2) && "Instruction not in basic block."); auto it1 = - std::find(std::begin(Instructions), std::end(Instructions), instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr1); auto it2 = - std::find(std::begin(Instructions), std::end(Instructions), instr2); - Instructions.erase(it1); - Instructions.insert(it2, instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr2); + instructions_.erase(it1); + instructions_.insert(it2, instr1); } void deleteInstruction(NodeRef instr) { assert(hasInstruction(instr) && "Instruction not in basic block."); - Instructions.erase( - std::remove(Instructions.begin(), Instructions.end(), instr), - Instructions.end()); + instructions_.erase( + std::remove(instructions_.begin(), instructions_.end(), instr), + instructions_.end()); untrackNode(instr); } private: - Subgraph Nodes; - std::vector Instructions; + Subgraph nodes_; + std::vector instructions_; // Because we reference a dataflow graph, we need to register callbacks 
// for when the dataflow graph is modified. std::unordered_map>::Callback*> - callbacks; + callbacks_; }; using Program = Graph; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 1f7e2c27906c99..b1e9283bc9ccee 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -45,19 +45,19 @@ class CAFFE2_API Annotation { public: enum class AnnotationKind { Generic, Caffe2 }; - Annotation(AnnotationKind K) : Kind(K) {} - Annotation() : Kind(AnnotationKind::Generic) {} + Annotation(AnnotationKind kind) : kind_(kind) {} + Annotation() : kind_(AnnotationKind::Generic) {} virtual ~Annotation() {} AnnotationKind getKind() const { - return Kind; + return kind_; } Annotation(const Annotation&) = delete; Annotation& operator=(Annotation&) = delete; private: - const AnnotationKind Kind; + const AnnotationKind kind_; }; class CAFFE2_API NeuralNetOperator : public Instruction { @@ -75,36 +75,38 @@ class CAFFE2_API NeuralNetOperator : public Instruction { enum class NNLayout { Undefined, NCHW, NHWC }; NeuralNetOperator(NNKind K, Opcode I, NNLayout L) - : Instruction(I), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K, Opcode I) - : Instruction(I), Kind(K), Layout(NNLayout::Undefined) {} - NeuralNetOperator(NNKind K, NNLayout L) : Instruction(), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(NNLayout::Undefined) {} + NeuralNetOperator(NNKind K, NNLayout L) + : Instruction(), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K) - : Instruction(), Kind(K), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(K), layout_(NNLayout::Undefined) {} NeuralNetOperator() - : Instruction(), Kind(NNKind::Undefined), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(NNKind::Undefined), layout_(NNLayout::Undefined) {} NNKind getKind() const { - return Kind; + return kind_; } void setLayout(NNLayout L) { - Layout = L; + layout_ = L; } NNLayout getLayout() const { - return Layout; + return layout_; } void setAnnotation(std::unique_ptr extraAnnotation) { - ExtraAnnotation = std::move(extraAnnotation); + extraAnnotation_ = std::move(extraAnnotation); } const Annotation* getAnnotation() const { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } + Annotation* getMutableAnnotation() { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } const std::string getName() const; @@ -128,9 +130,9 @@ class CAFFE2_API NeuralNetOperator : public Instruction { NeuralNetOperator& operator=(NeuralNetOperator&) = delete; private: - const NNKind Kind; - NNLayout Layout; // Mutable attribute, much like a type cast - std::unique_ptr ExtraAnnotation; + const NNKind kind_; + NNLayout layout_; // Mutable attribute, much like a type cast + std::unique_ptr extraAnnotation_; }; class CAFFE2_API NeuralNetData : public Data { @@ -138,12 +140,12 @@ class CAFFE2_API NeuralNetData : public Data { /// Discriminator for LLVM-style RTTI (isa<>) enum class NNDataKind { Generic, Tensor }; - NeuralNetData(NNDataKind kind) : Kind(kind) {} + NeuralNetData(NNDataKind kind) : kind_(kind) {} - NeuralNetData() : Kind(NNDataKind::Generic) {} + NeuralNetData() : kind_(NNDataKind::Generic) {} NNDataKind getKind() const { - return Kind; + return kind_; } virtual NeuralNetData* clone() = 0; @@ -153,8 +155,8 @@ class CAFFE2_API NeuralNetData : public Data { virtual 
~NeuralNetData() = 0; private: - NNDataKind Kind; - size_t Version = 0; + NNDataKind kind_; + size_t version_ = 0; }; class CAFFE2_API Tensor : public NeuralNetData { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h index cef1bdec522a56..91e4c2f6e01e87 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h @@ -71,13 +71,13 @@ class Notifier { Notifier() {} Callback* registerDestructorCallback(Callback fn) { - DtorCallbacks.emplace_back(fn); - return &DtorCallbacks.back(); + dtorCallbacks_.emplace_back(fn); + return &dtorCallbacks_.back(); } Callback* registerNotificationCallback(Callback fn) { - NotifCallbacks.emplace_back(fn); - return &NotifCallbacks.back(); + notifCallbacks_.emplace_back(fn); + return ¬ifCallbacks_.back(); } void deleteCallback(std::list& callbackList, Callback* toDelete) { @@ -90,11 +90,11 @@ class Notifier { } void deleteDestructorCallback(Callback* c) { - deleteCallback(DtorCallbacks, c); + deleteCallback(dtorCallbacks_, c); } void deleteNotificationCallback(Callback* c) { - deleteCallback(NotifCallbacks, c); + deleteCallback(notifCallbacks_, c); } /// \brief Notifies all listeners (`registerNotificationCallback` @@ -102,20 +102,20 @@ class Notifier { /// is encoded in the state of the derived class, thus only passing /// a pointer of type T* to the callback. void notify() { - for (auto callback : NotifCallbacks) { + for (auto callback : notifCallbacks_) { callback(reinterpret_cast(this)); } } virtual ~Notifier() { - for (auto callback : DtorCallbacks) { + for (auto callback : dtorCallbacks_) { callback(reinterpret_cast(this)); } } private: - std::list DtorCallbacks; - std::list NotifCallbacks; + std::list dtorCallbacks_; + std::list notifCallbacks_; }; #endif /* NOM_SUPPORT_COMMON_H */ From 56539f5fe1618ea93f733e196710e8c424f549db Mon Sep 17 00:00:00 2001 From: Teng Li Date: Wed, 29 Aug 2018 12:54:55 -0700 Subject: [PATCH 15/42] PT1 Distributed Release MileStone No.1 - Completed Distributed Package and CI tests (#10871) Summary: The PR includes: (1) torch.distributed.c10d, which now includes the complete backward compatible frontend API for `torch.distributed` (2) `env://` init method functionality (3) Minor change to `test_distributed.py`, which is now a test for `torch.distributed.c10d`. (4) The old `test_distributed.py' is now moved to `test_distributed_thd` (5) Miscellaneous bug fixes. (6) DDP CPU test is removed since c10d doesn't have this support yet, but this is a very easy test after moving DDP CPU's dependency to torch.distributed.c10d. (7) CI config to test MPI, NCCL, and Gloo backend of c10d **Now all the distributed test including c10d DDP can pass with the c10d frontend API** TODO: (in a separate PR) MPI subgroup support, once this is added, CI group test will be enabled. 
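As a quick illustration of item (2), here is a minimal sketch of how a worker process comes up through the `env://` init method, assuming the harness has already exported the usual MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE variables (this mirrors the call made in test/test_distributed.py below; the backend string is a placeholder for whichever backend was built in):

import os
import torch.distributed.c10d as dist

# Assumes MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE were exported by the
# launcher before spawning this process, as the CI harness in this patch does.
dist.init_process_group(
    backend="gloo",                               # or "nccl" / "mpi" when compiled in
    init_method="env://",
    world_size=int(os.environ["WORLD_SIZE"]),
    rank=int(os.environ["RANK"]),
)

Collectives are then reached through the same module (the tests below call, e.g., dist.all_reduce(tensor, op, group_id)), which is what makes the frontend backward compatible with the old torch.distributed surface.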
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10871 Differential Revision: D9554514 Pulled By: teng-li fbshipit-source-id: fb686ad42258526c8b4372148e82969fac4f42dd --- .jenkins/pytorch/build.sh | 25 +- test/common.py | 10 + test/run_test.py | 33 +- test/test_c10d.py | 36 +- test/test_distributed.py | 108 +- test/test_thd_distributed.py | 1148 ++++++++++++++++++++ torch/csrc/distributed/c10d/init.cpp | 4 +- torch/distributed/c10d/__init__.py | 16 +- torch/distributed/c10d/distributed_c10d.py | 1054 ++++++++++++++++++ torch/distributed/c10d/rendezvous.py | 51 +- torch/lib/c10d/ProcessGroupMPI.cpp | 17 +- torch/nn/parallel/distributed_c10d.py | 20 +- 12 files changed, 2404 insertions(+), 118 deletions(-) create mode 100644 test/test_thd_distributed.py create mode 100644 torch/distributed/c10d/distributed_c10d.py diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index f1eda3103a24af..0f26005f74cb22 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -1,15 +1,28 @@ #!/bin/bash +# For distributed, four environmental configs: +# (1) build with only NCCL +# (2) build with NCCL and MPI +# (3) build with only MPI +# (4) build with neither +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 +fi + +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install openmpi-bin libopenmpi-dev + sudo apt-get install -y --no-install-recommends openssh-client openssh-server + sudo mkdir -p /var/run/sshd +fi + if [[ "$BUILD_ENVIRONMENT" == "pytorch-linux-xenial-py3-clang5-asan" ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" $* fi -# TODO: move this to Docker -# TODO: add both NCCL and MPI in CI test by fixing these test first -sudo apt-get update -sudo apt-get install libnccl-dev libnccl2 -# sudo apt-get install openmpi-bin libopenmpi-dev - # Required environment variable: $BUILD_ENVIRONMENT # (This is set by default in the Docker images we build, so you don't # need to set it yourself. 
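The build matrix above only controls which backends get compiled in; at run time the new package exposes availability probes, which test/run_test.py below uses to decide which backend configurations to launch. A small sketch of that probing, using only the functions already shown in this patch:

import torch.distributed.c10d as c10d

# Mirrors the four build configurations: NCCL only, NCCL + MPI, MPI only, neither.
if c10d.is_available():
    backends = ["gloo"]                 # gloo is listed unconditionally, as run_test.py assumes
    if c10d.is_mpi_available():
        backends.append("mpi")
    if c10d.is_nccl_available():
        backends.append("nccl")
    print("distributed backends available:", backends)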
diff --git a/test/common.py b/test/common.py index 1c86bcd7fe24b8..545ba4f1f0dd22 100644 --- a/test/common.py +++ b/test/common.py @@ -17,6 +17,7 @@ import warnings import random import contextlib +import socket from functools import wraps from itertools import product from copy import deepcopy @@ -550,3 +551,12 @@ def download_file(url, binary=True): msg = "could not download test file '{}'".format(url) warnings.warn(msg, RuntimeWarning) raise unittest.SkipTest(msg) + + +def find_free_port(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(('localhost', 0)) + sockname = sock.getsockname() + sock.close() + return sockname[1] diff --git a/test/run_test.py b/test/run_test.py index 4d33d34d407476..8fd32b7e75c07f 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -14,6 +14,7 @@ import torch from torch.utils import cpp_extension from common import TEST_WITH_ROCM +import torch.distributed.c10d as c10d TESTS = [ 'autograd', @@ -31,12 +32,14 @@ 'nn', 'optim', 'sparse', + 'thd_distributed', 'torch', 'utils', ] WINDOWS_BLACKLIST = [ 'distributed', + 'thd_distributed', ] ROCM_BLACKLIST = [ @@ -50,10 +53,29 @@ 'multiprocessing', 'nccl', 'nn', + 'thd_distributed', 'utils', ] DISTRIBUTED_TESTS_CONFIG = { + 'gloo': { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + }, +} + + +if c10d.is_available(): + if c10d.is_mpi_available(): + DISTRIBUTED_TESTS_CONFIG['mpi'] = { + 'WORLD_SIZE': '3' + } + if c10d.is_nccl_available(): + DISTRIBUTED_TESTS_CONFIG['nccl'] = { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + } + + +THD_DISTRIBUTED_TESTS_CONFIG = { 'tcp': { 'WORLD_SIZE': '3' }, @@ -126,7 +148,10 @@ def test_distributed(python, test_module, test_directory, options): if options.verbose and not mpi_available: print_to_stderr( 'MPI not available -- MPI backend tests will be skipped') - for backend, env_vars in DISTRIBUTED_TESTS_CONFIG.items(): + config = DISTRIBUTED_TESTS_CONFIG + if test_module == "test_thd_distributed": + config = THD_DISTRIBUTED_TESTS_CONFIG + for backend, env_vars in config.items(): if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: @@ -141,7 +166,10 @@ def test_distributed(python, test_module, test_directory, options): os.environ['INIT_METHOD'] = 'env://' os.environ.update(env_vars) if with_init_file: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + if test_module == "test_distributed": + init_method = 'file://{}/'.format(tmp_dir) + else: + init_method = 'file://{}/shared_init_file'.format(tmp_dir) os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) @@ -170,6 +198,7 @@ def test_distributed(python, test_module, test_directory, options): CUSTOM_HANDLERS = { 'cpp_extensions': test_cpp_extensions, 'distributed': test_distributed, + 'thd_distributed': test_distributed, } diff --git a/test/test_c10d.py b/test/test_c10d.py index c448eba1349972..13f7b779d04736 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -1,7 +1,6 @@ import copy import math import multiprocessing -import socket import sys import tempfile import unittest @@ -10,6 +9,7 @@ from collections import namedtuple import torch +import common from torch import nn import torch.nn.functional as F from torch.distributed import c10d @@ -60,15 +60,6 @@ def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) -def find_free_port(): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - 
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(('localhost', 0)) - sockname = sock.getsockname() - sock.close() - return sockname[1] - - def gpus_for_rank(world_size): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. @@ -126,14 +117,14 @@ def _create_store(self): class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() return c10d.TCPStore(addr, port, True) class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() self.tcpstore = c10d.TCPStore(addr, port, True) self.prefix = "test_prefix" @@ -150,10 +141,10 @@ def test_unknown_handler(self): class RendezvousFileTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'path missing'): - gen = c10d.rendezvous('file://?rank=0&size=1') + gen = c10d.rendezvous('file://?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('file:///tmp/foo?size=1') + gen = c10d.rendezvous('file:///tmp/foo?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('file:///tmp/foo?rank=0') @@ -161,7 +152,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile() as file: - url = 'file://%s?size=%d' % (file.name, 2) + url = 'file://%s?world_size=%d' % (file.name, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -183,10 +174,10 @@ def test_nominal(self): class RendezvousTCPTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'port number missing'): - gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&size=1') + gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('tcp://127.0.0.1:23456?size=1') + gen = c10d.rendezvous('tcp://127.0.0.1:23456?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('tcp://127.0.0.1:23456?rank=0') @@ -194,8 +185,8 @@ def test_common_errors(self): def test_nominal(self): addr = 'localhost' - port = find_free_port() - url = 'tcp://%s:%d?size=%d' % (addr, port, 2) + port = common.find_free_port() + url = 'tcp://%s:%d?world_size=%d' % (addr, port, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -245,7 +236,7 @@ def setUpClass(cls): def setUp(self): self.rank = self.MAIN_PROCESS_RANK self.file = tempfile.NamedTemporaryFile() - self.port = find_free_port() + self.port = common.find_free_port() self.processes = [self._spawn_process(rank) for rank in range(int(self.world_size))] def tearDown(self): @@ -529,8 +520,9 @@ def _test_ddp_with_process_group(self, process_group): model = Net() ddp_model = distributed_c10d._DistributedDataParallelC10d( copy.deepcopy(model).cuda(gpus[0]), - process_group, - device_ids=gpus) + device_ids=gpus, + process_group=process_group) + model.cuda(gpus[0]) local_batch_size = len(gpus) diff --git a/test/test_distributed.py b/test/test_distributed.py index 47dbe9d056f154..38a32d69ef7c64 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -5,29 +5,32 @@ import os import sys import time +import tempfile import unittest 
from contextlib import contextmanager from functools import reduce, wraps import torch import torch.cuda -import torch.distributed as dist +import torch.distributed.c10d as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from common import TestCase from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR from torch.autograd import Variable - +import common BACKEND = os.environ["BACKEND"] TEMP_DIR = os.environ["TEMP_DIR"] INIT_METHOD = os.getenv("INIT_METHOD", "env://") -MASTER_PORT = "29500" DEFAULT_TIMEOUT = 300 CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} +if INIT_METHOD.startswith("file://"): + FOLDER = INIT_METHOD[7:] + def get_timeout(test_id): test_name = test_id.split(".")[-1] @@ -361,8 +364,9 @@ def test_broadcast_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_broadcast_group(self): group, group_id, rank = self._init_group_test() self._test_broadcast_helper(group, group_id, rank) @@ -454,7 +458,8 @@ def test_reduce_max(self): self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_sum(self): group, group_id, rank = self._init_group_test() @@ -469,7 +474,8 @@ def test_reduce_group_sum(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_product(self): group, group_id, rank = self._init_group_test() @@ -484,14 +490,16 @@ def test_reduce_group_product(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_max(self): group, group_id, rank = self._init_group_test() @@ -540,8 +548,8 @@ def test_all_reduce_sum(self): ) @unittest.skipIf( - BACKEND != "gloo" and BACKEND != "nccl", - "Only Gloo & Nccl backend support CUDA allReduce", + BACKEND != "gloo", + "Only Gloo backend will have CUDA allReduce tested", ) @skip_if_no_cuda_distributed @skip_if_no_gpu @@ -587,8 +595,9 @@ def test_all_reduce_max(self): group, group_id, rank, dist.reduce_op.MAX, 
-1, 10, 10 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_sum(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -601,8 +610,9 @@ def test_all_reduce_group_sum(self): 2 + (10 * (len(group) - 1)), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_product(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -615,16 +625,18 @@ def test_all_reduce_group_product(self): reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_max(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -652,6 +664,7 @@ def test_scatter(self): @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_scatter_group(self): group, group_id, rank = self._init_group_test() @@ -679,7 +692,8 @@ def test_gather(self): self._test_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_gather_group(self): group, group_id, rank = self._init_group_test() @@ -703,12 +717,13 @@ def _test_all_gather_helper( self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports CPU all gather") def test_all_gather(self): group, group_id, rank = self._init_global_test() self._test_all_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @unittest.skipIf(BACKEND == "nccl", "CUDA all gather skipped for NCCL") @skip_if_no_cuda_distributed @skip_if_no_gpu def test_all_gather_cuda(self): @@ -716,8 +731,10 @@ def test_all_gather_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not 
support group") def test_all_gather_group(self): group, group_id, rank = self._init_group_test() self._test_all_gather_helper(group, group_id, rank) @@ -740,13 +757,14 @@ def _test_barrier_helper(self, group, group_id, rank): self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") def test_barrier(self): group, group_id, rank = self._init_global_test() self._test_barrier_helper(group, group_id, rank) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_barrier_group(self): group, group_id, rank = self._init_group_test() self._test_barrier_helper(group, group_id, rank) @@ -765,7 +783,8 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): self.assertEqual(tensor, expected_tensor) self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() @@ -802,7 +821,8 @@ def _test_all_reduce_multigpu_helper( self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL") @skip_if_no_gpu def test_all_reduce_multigpu(self): group, group_id, rank = self._init_global_test() @@ -985,7 +1005,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel.DistributedDataParallel( + model_DDP = nn.parallel._DistributedDataParallelC10d( model_DDP, device_ids=gpu_subset ) @@ -1006,33 +1026,8 @@ def test_DistributedDataParallel(self): ) self._barrier() - @unittest.skipIf( - BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" - ) - def test_DistributedDataParallelCPU(self): - # Run a simple end to end DDP-CPU model, use result of single node - # model as baseline - group, group_id, rank = self._init_global_test() - - # cpu training setup - model_base = self._create_Net() - - # DDP-CPU training setup - model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) - - # dummy data initialization - local_bs = 2 - global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) - # check two model parameters over 2 iterations - self._test_DDP_2iter( - model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs - ) - self._barrier() - - -if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": +if BACKEND == "gloo" or BACKEND == "nccl": WORLD_SIZE = os.environ["WORLD_SIZE"] class TestDistBackend(TestCase, _DistTestBase): @@ -1052,7 +1047,6 @@ def wrapper(self): @classmethod def setUpClass(cls): os.environ["MASTER_ADDR"] = MASTER_ADDR - os.environ["MASTER_PORT"] = MASTER_PORT os.environ["WORLD_SIZE"] = WORLD_SIZE for attr in dir(cls): if attr.startswith("test"): @@ -1060,6 +1054,17 @@ def setUpClass(cls): setattr(cls, attr, cls.manager_join(fn)) def setUp(self): + # Adding this hack until we fix the FileStore to delete its + # content at the end + 
global INIT_METHOD + if INIT_METHOD.startswith("file://"): + _, filename = tempfile.mkstemp(prefix=FOLDER) + INIT_METHOD = "file://{}".format(filename) + + if INIT_METHOD.startswith("env://"): + port = common.find_free_port() + os.environ["MASTER_PORT"] = str(port) + self.processes = [] self.rank = self.MANAGER_PROCESS_RANK Barrier.init() @@ -1081,7 +1086,10 @@ def _run(self, rank): self.rank = rank try: dist.init_process_group( - init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + init_method=INIT_METHOD, + backend=BACKEND, + world_size=int(WORLD_SIZE), + rank=self.rank ) except RuntimeError as e: if "recompile" in e.args[0]: diff --git a/test/test_thd_distributed.py b/test/test_thd_distributed.py new file mode 100644 index 00000000000000..47dbe9d056f154 --- /dev/null +++ b/test/test_thd_distributed.py @@ -0,0 +1,1148 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import fcntl +import multiprocessing +import os +import sys +import time +import unittest +from contextlib import contextmanager +from functools import reduce, wraps + +import torch +import torch.cuda +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from common import TestCase +from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR +from torch.autograd import Variable + + +BACKEND = os.environ["BACKEND"] +TEMP_DIR = os.environ["TEMP_DIR"] +INIT_METHOD = os.getenv("INIT_METHOD", "env://") +MASTER_PORT = "29500" + +DEFAULT_TIMEOUT = 300 +CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} + + +def get_timeout(test_id): + test_name = test_id.split(".")[-1] + if test_name in CUSTOMIZED_TIMEOUT: + return CUSTOMIZED_TIMEOUT[test_name] + else: + return DEFAULT_TIMEOUT + + +if not dist.is_available(): + print("Distributed not available, skipping tests") + sys.exit(0) + +SKIP_IF_NO_CUDA_EXIT_CODE = 75 +SKIP_IF_NO_GPU_EXIT_CODE = 76 +SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77 +SKIP_IF_BACKEND_UNAVAILABLE = 78 + + +def skip_if_no_cuda_distributed(func): + func.skip_if_no_cuda_distributed = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_no_gpu(func): + """ Nccl multigpu tests requires at least 2 GPUS. Skip if this is not met""" + func.skip_if_no_gpu = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + if torch.cuda.device_count() < int(os.environ["WORLD_SIZE"]): + sys.exit(SKIP_IF_NO_GPU_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_small_worldsize(func): + func.skip_if_small_worldsize = True + + @wraps(func) + def wrapper(*args, **kwargs): + if (os.environ["BACKEND"] != "mpi") and int(os.environ["WORLD_SIZE"]) <= 2: + sys.exit(SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def apply_hack_for_nccl(): + # This is a hack for a known NCCL issue using multiprocess + # in conjunction with multiple threads to manage different GPUs which + # may cause ncclCommInitRank to fail. + # http://docs.nvidia.com/deeplearning/sdk/nccl-release-notes/rel_2.1.4.html#rel_2.1.4 + # It slows down the performance of collective operations. + # Without this setting NCCL might throw unhandled error. 
+ os.environ["NCCL_MAX_NRINGS"] = "1" + + +@contextmanager +def _lock(): + lockfile = os.path.join(TEMP_DIR, "lockfile") + with open(lockfile, "w") as lf: + try: + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield + finally: + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + lf.close() + + +def _build_tensor(size, value=None): + if value is None: + value = size + return torch.FloatTensor(size, size, size).fill_(value) + + +class Barrier(object): + barrier_id = 0 + + @classmethod + def init(cls): + cls.barrier_id = 0 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + for f_name in os.listdir(barrier_dir): + os.unlink(os.path.join(barrier_dir, f_name)) + + @classmethod + def sync(cls, timeout=5): + cls.barrier_id += 1 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + pid = str(os.getpid()) + barrier_file = os.path.join(barrier_dir, pid) + with _lock(): + with open(barrier_file, "w") as f: + f.write(str(cls.barrier_id)) + + start_time = time.time() + while True: + arrived = 0 + with _lock(): + for f_name in os.listdir(barrier_dir): + with open(os.path.join(barrier_dir, f_name), "r") as f: + data = f.read() + if int(data) >= cls.barrier_id: + arrived += 1 + if arrived == dist.get_world_size(): + break + + if time.time() - start_time > timeout: + raise RuntimeError("barrier timeout") + time.sleep(0.1) + + +class _DistTestBase(object): + def _barrier(self, *args, **kwargs): + Barrier.sync(*args, **kwargs) + + def _init_group_test(self): + group = [1, 2] + group_id = dist.new_group(group) + rank = dist.get_rank() + if rank not in group: + return ([], None, rank) + + return (group, group_id, rank) + + def _init_global_test(self): + group = [i for i in range(0, dist.get_world_size())] + group_id = dist.group.WORLD + rank = dist.get_rank() + return (group, group_id, rank) + + # HELPER FOR MULTIGPU TESTS + def _init_multigpu_helper(self): + """Multigpu tests are designed to simulate the multi nodes with multi + GPUs on each node. Nccl backend requires equal #GPUs in each process. + On a single node, all visible GPUs are evenly + divided to subsets, each process only uses a subset. 
+ """ + nGPUs = torch.cuda.device_count() + world_size = dist.get_world_size() + visible_devices = range(nGPUs) + + if BACKEND == "nccl": + apply_hack_for_nccl() + + nGPUs_per_process = nGPUs // world_size + rank_to_GPU = { + i: list( + visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process] + ) + for i in range(world_size) + } + return rank_to_GPU + + # GET RANK + def test_get_rank(self): + test_dir = os.path.join(TEMP_DIR, "test_dir") + pid = str(os.getpid()) + num_processes = dist.get_world_size() + with open(os.path.join(test_dir, pid), "w") as f: + f.write(str(dist.get_rank())) + + self._barrier() + + all_ranks = set() + for f_name in os.listdir(test_dir): + with open(os.path.join(test_dir, f_name), "r") as f: + all_ranks.add(int(f.read())) + self.assertEqual(len(all_ranks), num_processes) + + self._barrier() + + if dist.get_rank() == 0: + for f_name in os.listdir(test_dir): + os.unlink(os.path.join(test_dir, f_name)) + + self._barrier() + + # SEND RECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support send/recv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") + def test_send_recv(self): + rank = dist.get_rank() + tensor = _build_tensor(rank + 1) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(src + 1, value=-1) + expected_tensor = _build_tensor(src + 1) + dist.recv(tensor, src) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + # SEND RECV ANY SOURCE + @unittest.skipIf( + BACKEND == "gloo", "Gloo does not support send/recv from any source" + ) + @unittest.skipIf( + BACKEND == "nccl", "Nccl does not support send/recv from any source" + ) + def test_send_recv_any_source(self): + rank = dist.get_rank() + tensor = _build_tensor(10, rank) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + recv_ranks = set() + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(10, value=-1) + sender = dist.recv(tensor) + self.assertTrue(tensor.eq(sender).all()) + recv_ranks.add(sender) + + self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) + self._barrier() + + # ISEND + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support isend") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support isend") + def test_isend(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + requests = [ + dist.isend(_build_tensor(dest, 10), dest) + for dest in range(1, world_size) + ] + for request in requests: + request.wait() + self.assertTrue(request.is_completed()) + else: + tensor = _build_tensor(rank, -1) + dist.recv(tensor, 0) + self.assertEqual(tensor, _build_tensor(rank, 10)) + + self._barrier() + + # IRECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support irecv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support irecv") + def test_irecv(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)] + requests = [ + dist.irecv(expected_tensors[src - 1], src) + for src in range(1, world_size) + ] + + for src in range(1, world_size): + requests[src - 1].wait() + self.assertTrue(requests[src - 1].is_completed()) + self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10)) + else: + tensor = _build_tensor(rank, 10) + 
dist.send(tensor, 0) + + self._barrier() + + # BROADCAST + def _test_broadcast_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for ttype, value, requires_cuda in [ + ("torch.FloatTensor", -1e-10, False), + ("torch.DoubleTensor", -1e-100, False), + ("torch.HalfTensor", -0.1, True), + ("torch.CharTensor", -2, False), + ("torch.ByteTensor", 129, False), + ("torch.IntTensor", -1e5, False), + ("torch.LongTensor", -1e15, False), + ]: + if requires_cuda and not cuda: + continue + for src in group: + expected_tensor = _build_tensor(src + 1, value).type(ttype) + if cuda: + expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0]) + if rank == src: + dist.broadcast(expected_tensor, src, group_id) + else: + tensor = _build_tensor(src + 1, -1).type(ttype) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.broadcast(tensor, src, group_id) + self.assertEqual(tensor.size(), expected_tensor.size()) + self.assertEqual(tensor.ne(expected_tensor).max(), 0) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_broadcast(self): + group, group_id, rank = self._init_global_test() + self._test_broadcast_helper(group, group_id, rank) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo and Nccl backend supports CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_broadcast_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_broadcast_group(self): + group, group_id, rank = self._init_group_test() + self._test_broadcast_helper(group, group_id, rank) + + # REDUCE + def _test_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA reduce") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + 10 * (len(group) - 1), + True, + rank_to_GPU, + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] 
* (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + # ALL REDUCE + def _test_all_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo & Nccl backend support CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + True, + rank_to_GPU, + ) + + 
@unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + # SCATTER + def _test_scatter_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, -1) + expected_tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, i) for i in group] if rank == dest else [] + ) + dist.scatter(tensor, src=dest, scatter_list=tensors, group=group_id) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + def test_scatter(self): + group, group_id, rank = self._init_global_test() + self._test_scatter_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @skip_if_small_worldsize + def test_scatter_group(self): + group, group_id, rank = self._init_group_test() + self._test_scatter_helper(group, group_id, rank) + + # GATHER + def _test_gather_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, -1) for i in group] if rank == dest else [] + ) + dist.gather(tensor, dst=dest, gather_list=tensors, group=group_id) + if rank == dest: + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == 
"gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_gather(self): + group, group_id, rank = self._init_global_test() + self._test_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_gather_helper(group, group_id, rank) + + # ALL GATHER + def _test_all_gather_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = [_build_tensor(dest + 1, -1) for i in group] + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] + dist.all_gather(tensors, tensor, group_id) + + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_gather(self): + group, group_id, rank = self._init_global_test() + self._test_all_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_gather_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_all_gather_helper(group, group_id, rank) + + # BARRIER + def _test_barrier_helper(self, group, group_id, rank): + WAIT_TIME = 0.3 # seconds + + for dest in group: + expected_time = torch.DoubleTensor(1).fill_(0.0) + if dest == rank: + expected_time.fill_(time.time() + WAIT_TIME) + dist.broadcast(expected_time, dest, group_id) + time.sleep(WAIT_TIME + 0.1) # sleep a little bit longer + dist.barrier(group_id) + else: + dist.broadcast(expected_time, dest, group_id) + dist.barrier(group_id) + self.assertGreaterEqual(time.time(), expected_time[0]) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_barrier(self): + group, group_id, rank = self._init_global_test() + self._test_barrier_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_barrier_group(self): + group, group_id, rank = self._init_group_test() + self._test_barrier_helper(group, group_id, rank) + + def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for src in group: + expected_tensor = _build_tensor(src + 1) + tensors = [ + _build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank] + ] + if rank == src: + tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0]) + + dist.broadcast_multigpu(tensors, src, group_id) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @skip_if_no_gpu + def test_broadcast_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + 
self._test_broadcast_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _test_all_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + + dist.all_reduce_multigpu(tensors, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @skip_if_no_gpu + def test_all_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + self.assertEqual(tensors[0], expected_tensor) + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports reduce multigpu") + @skip_if_no_gpu + def test_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_all_gather_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for dest in group: + tensors = [ + _build_tensor(dest + 1).cuda(device=i) for i in rank_to_GPU[rank] + ] + + # construct expected output along with + # a place holder to receive all gather results + output_tensors = [] + expected_output = [] + output_per_gpu = ( + [_build_tensor(dest + 1, -1)] * len(rank_to_GPU[0]) * len(group) + ) + expected_per_gpu = ( + [_build_tensor(dest + 1)] * len(rank_to_GPU[0]) * len(group) + ) + for gpu in rank_to_GPU[rank]: + output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu]) + expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu]) + + dist.all_gather_multigpu(output_tensors, tensors, group_id) + self.assertEqual(output_tensors, expected_output) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allgather multigpu") + @skip_if_no_gpu + def test_all_gather_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _create_Net(self): + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 50, bias=False) + self.fc3 = nn.Linear(50, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = 
self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return F.softmax(x, dim=1) + + return Net() + + def _model_step(self, model): + for param in model.parameters(): + param.data += param.grad + param.grad = None + + def _prepare_dummy_data(self, local_bs): + # global_bs for DDP should be divisible by WORLD_SIZE + global_bs = int(WORLD_SIZE) * local_bs + input_cpu = torch.randn(global_bs, 2) + target = torch.randn(global_bs, 4) + loss = nn.MSELoss() + return global_bs, input_cpu, target, loss + + # END TO END TEST FOR DISTRIBUTEDDATAPARALLEL + def _test_DDP_helper(self, model, input_var, target, loss): + model.train() + output = model(input_var) + l = loss(output, target) + l.backward() + + def _assert_equal_param(self, param_gpu, param_DDP): + self.assertEqual(len(param_gpu), len(param_DDP)) + for p_gpu, p_DDP in zip(param_gpu, param_DDP): + self.assertEqual(p_gpu, p_DDP) + + def _test_DDP_2iter( + self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size + ): + for _ in range(2): + # single cpu/gpu training + self._test_DDP_helper(model_base, input, target, loss) + + # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs + self._test_DDP_helper( + model_DDP, + input[rank * local_bs: (rank + 1) * local_bs], + target[rank * local_bs: (rank + 1) * local_bs], + loss, + ) + + # Update weights and run a second iteration to shake out errors + self._model_step(model_base) + self._model_step(model_DDP) + self._assert_equal_param( + list(model_base.parameters()), list(model_DDP.module.parameters()) + ) + + # Shuffle the input so that DDP input is different + input = input[torch.randperm(batch_size)] + + @unittest.skipIf( + BACKEND != "nccl" and BACKEND != "gloo", + "Only Nccl & Gloo backend support DistributedDataParallel", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_DistributedDataParallel(self): + # Run a simple end to end DDP model, use result of single node model + # as baseline + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + + # cpu training setup + model = self._create_Net() + + # single gpu training setup + model_gpu = copy.deepcopy(model) + gpu_subset = list(rank_to_GPU[rank]) + model_gpu.cuda(gpu_subset[0]) + + # DDP training setup + model_DDP = copy.deepcopy(model) + model_DDP.cuda(gpu_subset[0]) + model_DDP = nn.parallel.DistributedDataParallel( + model_DDP, device_ids=gpu_subset + ) + + # dummy data initialization + local_bs = len(gpu_subset) + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_gpu, + model_DDP, + input_cpu.cuda(gpu_subset[0]), + target.cuda(gpu_subset[0]), + loss, + local_bs, + rank, + global_bs, + ) + self._barrier() + + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" + ) + def test_DistributedDataParallelCPU(self): + # Run a simple end to end DDP-CPU model, use result of single node + # model as baseline + group, group_id, rank = self._init_global_test() + + # cpu training setup + model_base = self._create_Net() + + # DDP-CPU training setup + model_DDP = copy.deepcopy(model_base) + model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + + # dummy data initialization + local_bs = 2 + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_base, model_DDP, input_cpu, target, loss, local_bs, rank, 
global_bs + ) + self._barrier() + + +if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": + WORLD_SIZE = os.environ["WORLD_SIZE"] + + class TestDistBackend(TestCase, _DistTestBase): + MANAGER_PROCESS_RANK = -1 + + @staticmethod + def manager_join(fn): + @wraps(fn) + def wrapper(self): + if self.rank == self.MANAGER_PROCESS_RANK: + self._join_and_reduce(fn) + else: + fn(self) + + return wrapper + + @classmethod + def setUpClass(cls): + os.environ["MASTER_ADDR"] = MASTER_ADDR + os.environ["MASTER_PORT"] = MASTER_PORT + os.environ["WORLD_SIZE"] = WORLD_SIZE + for attr in dir(cls): + if attr.startswith("test"): + fn = getattr(cls, attr) + setattr(cls, attr, cls.manager_join(fn)) + + def setUp(self): + self.processes = [] + self.rank = self.MANAGER_PROCESS_RANK + Barrier.init() + for rank in range(int(WORLD_SIZE)): + self.processes.append(self._spawn_process(rank)) + + def tearDown(self): + for p in self.processes: + p.terminate() + + def _spawn_process(self, rank): + os.environ["RANK"] = str(rank) + name = "process " + str(rank) + process = multiprocessing.Process(target=self._run, name=name, args=(rank,)) + process.start() + return process + + def _run(self, rank): + self.rank = rank + try: + dist.init_process_group( + init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + ) + except RuntimeError as e: + if "recompile" in e.args[0]: + sys.exit(SKIP_IF_BACKEND_UNAVAILABLE) + # sys.exit(0) + raise + # self.id() == e.g. '__main__.TestDistributed.test_get_rank' + # We're retreiving a corresponding test and executing it. + getattr(self, self.id().split(".")[2])() + sys.exit(0) + + def _join_and_reduce(self, fn): + skip_ok = ( + getattr(fn, "skip_if_no_cuda_distributed", False) or + getattr(fn, "skip_if_no_gpu", False) or + getattr(fn, "skip_if_small_worldsize", False) + ) + self.JOIN_TIMEOUT = get_timeout(self.id()) + for p in self.processes: + p.join(self.JOIN_TIMEOUT) + + first_process = self.processes[0] + for p in self.processes: + self.assertEqual(p.exitcode, first_process.exitcode) + + if first_process.exitcode == SKIP_IF_BACKEND_UNAVAILABLE: + raise unittest.SkipTest("Compiled without the " + BACKEND + " backend") + + if skip_ok: + # do this first so we don't give an error message about + # mismatched exit codes if the first isn't valid + assert ( + first_process.exitcode == 0 or + first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE or + first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE or + first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE + ) + + if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE: + raise unittest.SkipTest("cuda is not available") + if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE: + raise unittest.SkipTest( + "One unique gpu per process is not available" + ) + if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE: + raise unittest.SkipTest("worldsize is too small to run group tests") + + self.assertEqual(first_process.exitcode, 0) + + +elif BACKEND == "mpi": + WORLD_SIZE = os.environ["WORLD_SIZE"] + dist.init_process_group(init_method=INIT_METHOD, backend="mpi") + + class TestMPI(TestCase, _DistTestBase): + pass + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + unittest.main() diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index fdf88bc0704a47..a67d009e024360 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -346,8 +346,8 @@ 
PyObject* c10d_init(PyObject* _unused) { #endif shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") - .def("isCompleted", &::c10d::ProcessGroup::Work::isCompleted) - .def("isSuccess", &::c10d::ProcessGroup::Work::isSuccess) + .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted) + .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) .def("exception", &::c10d::ProcessGroup::Work::exception) .def("synchronize", &::c10d::ProcessGroup::Work::synchronize) .def( diff --git a/torch/distributed/c10d/__init__.py b/torch/distributed/c10d/__init__.py index 3b98424e891479..5356097743aa3c 100644 --- a/torch/distributed/c10d/__init__.py +++ b/torch/distributed/c10d/__init__.py @@ -6,20 +6,8 @@ def is_available(): if is_available() and not torch._C._c10d_init(): - raise RuntimeError("c10d initialization failed") + raise RuntimeError("Failed to initialize PyTorch distributed support") if is_available(): - from .rendezvous import rendezvous, register_rendezvous_handler - from . import BroadcastOptions, AllreduceOptions - - DEFAULT_REDUCE_OPTIONS = AllreduceOptions() - - def broadcast(tensor, src, process_group): - opts = BroadcastOptions() - opts.rootRank = src - opts.rootTensor = 0 - return process_group.broadcast([tensor], opts) - - def all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): - return process_group.allreduce([tensor], opts) + from .distributed_c10d import * diff --git a/torch/distributed/c10d/distributed_c10d.py b/torch/distributed/c10d/distributed_c10d.py new file mode 100644 index 00000000000000..dc341f99427552 --- /dev/null +++ b/torch/distributed/c10d/distributed_c10d.py @@ -0,0 +1,1054 @@ +import torch + +from .rendezvous import rendezvous, register_rendezvous_handler +from . import BroadcastOptions, AllreduceOptions, ReduceOptions, \ + ScatterOptions, GatherOptions +from . import ReduceOp as reduce_op +from . import PrefixStore +from . import ProcessGroupGloo + + +_MPI_AVAILBLE = True +_NCCL_AVAILBLE = True + + +try: + from. import ProcessGroupMPI +except ImportError: + _MPI_AVAILBLE = False + +try: + from. 
import ProcessGroupNCCL +except ImportError: + _NCCL_AVAILBLE = False + + +class DistBackend: + UNDEFINED = -1 + GLOO = 0 + NCCL = 2 + MPI = 3 + + +class group(object): + WORLD = object() + + +class GroupMember(object): + # Alias to group.WORLD for backward compatibility + WORLD = group.WORLD + NON_GROUP_MEMBER = object() + + +# Cached process groups, map from ProcessGroup to (DistBackend, Store) +_pg_map = {} +# Process group's names, map from ProcessGroup to str +_pg_names = {} +# Process group's global rank to local rank mapping +_pg_group_ranks = {} + +# Default process group state +_default_pg = None +_default_pg_init_method = None + +# Process group count for default naming +_group_count = 0 + + +def _rank_not_in_group(group): + """ + Helper that checks if the current process's rank is not in a given group + + """ + return group == GroupMember.NON_GROUP_MEMBER + + +def _get_group_rank(group, rank): + """ + Helper that gets a given group's local rank in the group from a given global + rank + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank = _pg_group_ranks[group][rank] + if group_rank is None: + raise RuntimeError("The global rank is not part of the group") + return group_rank + + +def _get_global_rank(group, group_rank): + """ + Helper that gets a given group's global rank from a given local rank in the + group + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank_map = _pg_group_ranks[group] + for rank, grp_rank in group_rank_map.items(): + if grp_rank == group_rank: + return rank + raise RuntimeError("The group rank is not part of the group") + + +def _check_default_pg(): + """ + Helper that checks if the default ProcessGroup has been initializd, with + assertion + + """ + assert _default_pg is not None, \ + "Default process group is not initialized" + + +def is_mpi_available(): + """ + Checks if MPI is available + + """ + return _MPI_AVAILBLE + + +def is_nccl_available(): + """ + Checks if NCCL is available + + """ + return _NCCL_AVAILBLE + + +def is_initialized(): + """ + Checking if the default process group has been initialized + + """ + return _default_pg is not None + + +def get_default_group(): + """ + Getting the default process group created by init_process_group + + """ + if not is_initialized(): + raise RuntimeError("Default process group has not been initialized, " + "please make sure to call init_process_group.") + return _default_pg + + +def init_process_group(backend, + init_method="env://", + **kwargs): + """ + Initializes the default distributed process group, and this will also + initialize the distributed package + + Arguments: + backend (str): Name of the backend to use. Depending on build-time + configuration valid values include: + ``mpi`` and ``gloo``. + init_method (str, optional): URL specifying how to initialize the + process group. + world_size (int, optional): Number of processes participating in + the job. + rank (int, optional): Rank of the current process. + group_name (str, optional, deprecated): Group name. + + To enable ``backend == mpi``, PyTorch needs to built from source on + a system that supports MPI. The same applies to NCCL as well. 
+ + """ + global _pg_map + global _pg_names + global _default_pg + global _default_pg_init_method + + if _default_pg is not None: + raise RuntimeError("trying to initialize the default process group " + "twice!") + + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, \ + "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if backend == "mpi": + if not is_mpi_available(): + raise RuntimeError("Distributed package doesn't have MPI built in") + + _default_pg = ProcessGroupMPI() + _pg_map[_default_pg] = (DistBackend.MPI, None) + else: + # backward compatible API + if init_method != "env://" and world_size != -1 and rank != -1: + url = "{}?rank={}&world_size={}".format(init_method, + rank, + world_size) + store, _, _ = next(rendezvous(url)) + else: + store, rank, world_size = next(rendezvous(init_method)) + + if backend == "gloo": + _default_pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.GLOO, store) + _pg_names[_default_pg] = group_name + elif backend == "nccl": + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + _default_pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.NCCL, store) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Invalid distributed backend name: " + backend) + + _default_pg_init_method = init_method + + +def _new_process_group_helper(world_size, rank, group_name=""): + """ + Create a new distributed process group. And the new process group can be + used to perform collective operations. + + """ + global _pg_map + global _group_count + global _pg_names + + if not group_name: + group_name = str(_group_count) + _group_count += 1 + + if group_name in _pg_names.values(): + raise RuntimeError("The specified group name has already been " + "created, please use a different group name") + + default_backend, default_store = _pg_map[_default_pg] + + # Create the prefix store + store = PrefixStore(group_name, default_store) + + if default_backend == DistBackend.GLOO: + pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[pg] = (DistBackend.GLOO, store, group_name) + _pg_names[_default_pg] = group_name + elif default_backend == DistBackend.NCCL: + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[pg] = (DistBackend.NCCL, store, group_name) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Unsupported distributed backend by group") + return pg + + +def destroy_process_group(group=group.WORLD): + """ + Destroy a given process group, and deinitialize the distributed package + + Arguments: + group (ProcessGroup, optional): The process group to be destroyed, if + group.WORLD is given, all process + groups including the default one will + be destroyed. 
+ """ + if _rank_not_in_group(group): + return + + global _pg_map + global _pg_names + global _pg_group_ranks + global _default_pg + global _default_pg_init_method + + if group == GroupMember.WORLD: + pg = _default_pg + + if _pg_map.get(pg, None) is None: + raise RuntimeError("Invalid process group specified") + + if group == GroupMember.WORLD: + _default_pg = None + _default_pg_init_method = None + _pg_map.clear() + _pg_names.clear() + _pg_group_ranks.clear() + else: + del _pg_map[pg] + del _pg_names[pg] + del _pg_group_ranks[pg] + + +def get_rank(group=group.WORLD): + """ + Returns the rank of currrent process group + + Rank is a unique identifier assigned to each process within a distributed + process group. They are always consecutive integers ranging from 0 to + ``world_size``. + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The rank of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.rank() + + return group.rank() + + +def get_world_size(group=group.WORLD): + """ + Returns the number of processes in the current process group + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The world size of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.size() + + return group.size() + + +def isend(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.send([tensor], dst) + else: + group_dst_rank = _get_group_rank(group, dst) + return group.send([tensor], group_dst_rank) + + +def irecv(tensor, + src, + group=group.WORLD): + """ + Receives a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.recv([tensor], src) + else: + group_src_rank = _get_group_rank(group, src) + return group.recv([tensor], group_src_rank) + + +def send(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + _default_pg.send([tensor], dst).wait() + else: + group_dst_rank = _get_group_rank(group, dst) + group.send([tensor], group_dst_rank).wait() + + +def recv(tensor, + src=None, + group=group.WORLD): + """ + Receives a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int, optional): Source rank. Will receive from any + process if unspecified. 
+ group (ProcessGroup, optional): The process group to work on + + Returns: + Sender rank + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + pg = _default_pg + else: + pg = group + + if src is None: + rank_tensor = torch.IntTensor([-1]) + pg.recv_anysource([tensor], rank_tensor).wait() + src_rank = rank_tensor[0].item() + if group == GroupMember.WORLD: + return src_rank + else: + return _get_global_rank(pg, src_rank) + else: + if group == GroupMember.WORLD: + pg.recv([tensor], src).wait() + else: + group_src_rank = _get_group_rank(pg, src) + pg.recv([tensor], group_src_rank).wait() + return src + + +def broadcast_multigpu(tensor_list, + src, + group=group.WORLD, + async_op=False, + src_tensor=0): + """ + Broadcasts the tensor to the whole group with multiple GPU tensors + per node. + + ``tensor`` must have the same number of elements in all the GPUs from + all processes participating in the collective. each tensor in the list must + be on a different GPU + + Only nccl and gloo backend are currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Tensors that participate in the collective + operation. if ``src`` is the rank, then ``src_tensor``th element of + ``tensor_list`` (``tensor_list[src_tensor]``) will be broadcasted + to all other tensors (on different GPUs) in the src process and + all tensors in ``tensor_list`` of other non-src processes. + You also need to make sure that ``len(tensor_list)`` is the same + for all the distributed processes calling this function. + + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + src_tensor (int, optional): Source tensor rank within ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = src_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast(tensor_list, opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast(tensor_list, opts) + if async_op: + return work + else: + work.wait() + + +def broadcast(tensor, + src, + group=group.WORLD, + async_op=False): + """ + Broadcasts the tensor to the whole group. + + ``tensor`` must have the same number of elements in all processes + participating in the collective. + + Arguments: + tensor (Tensor): Data to be sent if ``src`` is the rank of current + process, and tensor to be used to save received data otherwise. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. 
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast([tensor], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast([tensor], opts) + if async_op: + return work + else: + work.wait() + + +def all_reduce_multigpu(tensor_list, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. This function reduces a number of tensors on every node, + while each tensor resides on different GPUs. + Therefore, the input tensor in the tensor list needs to be GPU tensors. + Also, each tensor in the tensor list needs to reside on a different GPU. + + After the call, all ``tensor`` in ``tensor_list`` is going to be bitwise + identical in all processes. + + Only nccl and gloo backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor list (List[Tensor]): List of input and output tensors of + the collective. The function operates in-place and requires that + each tensor to be a GPU tensor on different GPUs. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce(tensor_list, opts) + else: + work = group.allreduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def all_reduce(tensor, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. + + After the call ``tensor`` is going to be bitwise identical in all processes. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce([tensor], opts) + else: + work = group.allreduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def reduce_multigpu(tensor_list, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False, + dst_tensor=0): + """ + Reduces the tensor data on multiple GPUs across all machines. 
Each tensor + in ``tensor_list`` should reside on a separate GPU + + Only the GPU of ``tensor_list[dst_tensor]`` on the process with rank ``dst`` + is going to receive the final result. + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Input and output GPU tensors of the + collective. The function operates in-place. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + dst_tensor (int, optional): Destination tensor rank within + ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, otherwise + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + opts.rootTensor = dst_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce(tensor_list, opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def reduce(tensor, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines. + + Only the process with rank ``dst`` is going to receive the final result. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce([tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def all_gather_multigpu(output_tensor_lists, + input_tensor_list, + group=group.WORLD, + async_op=False): + """ + Gathers tensors from the whole group in a list. + Each tensor in ``tensor_list`` should reside on a separate GPU + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + output_tensor_lists (List[List[Tensor]]): Output lists. It should + contain correctly-sized tensors on each GPU to be used for output of + the collective. + e.g. ``output_tensor_lists[i]`` contains the all_gather + result that resides on the GPU of ``input_tensor_list[i]``. + Note that each element of ``output_tensor_lists[i]`` has the size of + ``world_size * len(input_tensor_list)``, since the function all + gathers the result from every single GPU in the group. 
To interpret + each element of ``output_tensor_list[i]``, note that + ``input_tensor_list[j]`` of rank k will be appear in + ``output_tensor_list[i][rank * world_size + j]`` + Also note that ``len(output_tensor_lists)``, and the size of each + element in ``output_tensor_lists`` (each element is a list, + therefore ``len(output_tensor_lists[i])``) need to be the same + for all the distributed processes calling this function. + + input_tensor_list (List[Tensor]): List of tensors(on different GPUs) to + be broadcast from current process. + Note that ``len(input_tensor_list)`` needs to be the same for + all the distributed processes calling this function. + + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allgather(output_tensor_lists, input_tensor_list) + else: + work = group.allgather(output_tensor_lists, input_tensor_list) + + if async_op: + return work + else: + work.wait() + + +def all_gather(tensor_list, + tensor, + group=group.WORLD, + async_op=False): + """ + Gathers tensors from the whole group in a list. + + Arguments: + tensor_list (list[Tensor]): Output list. It should contain + correctly-sized tensors to be used for output of the collective. + tensor (Tensor): Tensor to be broadcast from current process. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allgather([tensor_list], [tensor]) + else: + work = group.allgather([tensor_list], [tensor]) + + if async_op: + return work + else: + work.wait() + + +def gather(tensor, + gather_list, + dst, + group=group.WORLD, + async_op=False): + """ + Gathers a list of tensors in a single process. + + Arguments: + tensor (Tensor): Input tensor. + gather_list (list[Tensor]): List of appropriately-sized tensors to + use for received data. Required only in the receiving process. + dst (int): Destination rank. Required in all processes except the one + that is receiveing the data. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. 
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if dst == my_rank: + if gather_list is None: + raise RuntimeError("gather_list is a required argument in gather " + "destination") + else: + if gather_list: + raise RuntimeError("non-empty gather_list can be given only " + "to gather destination") + + opts = GatherOptions() + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.gather([gather_list], [tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.gather([gather_list], [tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def scatter(tensor, + scatter_list, + src, + group=group.WORLD, + async_op=False): + """ + Scatters a list of tensors to all processes in a group. + + Each process will receive exactly one tensor and store its data in the + ``tensor`` argument. + + Arguments: + tensor (Tensor): Output tensor. + scatter_list (list[Tensor]): List of tensors to scatter. Required only + in the process that is sending the data. + src (int): Source rank. Required in all processes except the one that + is sending the data. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if src == my_rank: + if scatter_list is None: + raise RuntimeError("scatter_list is a required argument in " + "scatter source") + else: + if scatter_list: + raise RuntimeError("non-empty can be given only to scatter " + "source") + + opts = ScatterOptions() + opts.rootRank = src + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.scatter([tensor], [scatter_list], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.scatter([tensor], [scatter_list], opts) + + if async_op: + return work + else: + work.wait() + + +def barrier(group=group.WORLD, + async_op=False): + """ + Synchronizes all processes. + + This collective blocks processes until the whole group enters this function, + if async_op is False, or if async work handle is called on wait(). + + Arguments: + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.barrier() + else: + work = group.barrier() + + if async_op: + return work + else: + work.wait() + + +def new_group(ranks=None): + """ + Creates a new distributed group. + + This function requires that all processes in the main group (i.e. all + processes that are part of the distributed job) enter this function, even + if they are not going to be members of the group. Additionally, groups + should be created in the same order in all processes. + + Arguments: + ranks (list[int]): List of ranks of group members. + + Returns: + A handle of distributed group that can be given to collective calls. 
+ """ + + _check_default_pg() + + global _pg_group_ranks + + default_backend, _ = _pg_map[_default_pg] + if default_backend == DistBackend.MPI: + raise RuntimeError("Only NCCL and Gloo backend currently support " + "new_group function") + + global_rank = _default_pg.rank() + global_world_size = _default_pg.size() + + # checks the input ranks + if ranks is not None: + group_world_size = len(ranks) + if group_world_size > global_world_size: + raise RuntimeError("the new group's world size should be less or " + "equal to the world size set by " + "init_process_group") + # check ranks' sanity + for rank in ranks: + if rank < 0 or rank >= global_world_size: + raise RuntimeError("The new group's rank should be within the " + "the world_size set by init_process_group") + + if global_rank in ranks: + group_rank = ranks.index(global_rank) + else: + group_rank = None + else: + group_world_size = global_world_size + group_rank = global_rank + + # Release ranks not in the group + if global_rank not in ranks: + return GroupMember.NON_GROUP_MEMBER + + pg = _new_process_group_helper(group_world_size, group_rank) + + # Create the global rank to group rank mapping + _pg_group_ranks[pg] = {} + for rank in range(global_world_size): + if rank in ranks: + _pg_group_ranks[pg][rank] = ranks.index(rank) + else: + _pg_group_ranks[pg][rank] = None + + return pg + + +# TODO: delete these functions and replace DDP with public functions +DEFAULT_REDUCE_OPTIONS = AllreduceOptions() + + +def _broadcast(tensor, src, process_group): + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + return process_group.broadcast([tensor], opts) + + +def _all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): + return process_group.allreduce([tensor], opts) diff --git a/torch/distributed/c10d/rendezvous.py b/torch/distributed/c10d/rendezvous.py index 062443f87abfec..30c9f2dfe7dd3b 100644 --- a/torch/distributed/c10d/rendezvous.py +++ b/torch/distributed/c10d/rendezvous.py @@ -3,6 +3,7 @@ except ImportError: from urlparse import urlparse +import os from . 
import FileStore, TCPStore @@ -59,13 +60,13 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) store = FileStore(path) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using file:// method") @@ -81,18 +82,52 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) start_daemon = rank == 0 store = TCPStore(result.hostname, result.port, start_daemon) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using tcp:// method") +def _env_rendezvous_handler(url): + def _error(msg): + return ValueError("env:// rendezvous: " + msg) + + if url != "env://": + raise _error("Only `env://` is expected for the env init method") + world_size = os.environ["WORLD_SIZE"] + if world_size is None: + raise _error("world size is missing") + rank = os.environ["RANK"] + if rank is None: + raise _error("rank is missing") + master_addr = os.environ["MASTER_ADDR"] + if master_addr is None: + raise _error("master addr is missing") + master_port = os.environ["MASTER_PORT"] + if master_port is None: + raise _error("master port is missing") + + # Converting before creating the store + rank = int(rank) + world_size = int(world_size) + master_port = int(master_port) + + # Now start the TCP store daemon on the rank 0 + start_daemon = rank == 0 + store = TCPStore(master_addr, master_port, start_daemon) + yield (store, rank, world_size) + + # If this configuration is invalidated, there is nothing we can do about it + raise RuntimeError("Unable to perform rerendezvous using env:// method") + + register_rendezvous_handler("file", _file_rendezvous_handler) register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 3d2bad9191a1fb..3afa33c7536bac 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -386,16 +386,17 @@ std::shared_ptr ProcessGroupMPI::gather( const GatherOptions& opts) { checkSingleTensor(inputTensors); + if (outputTensors.size() != 1) { + throw std::runtime_error("Gather: multi-GPU collective is not supported"); + } + if (rank_ != opts.rootRank) { - if (outputTensors.size() > 0) { + if (outputTensors[0].size() > 0) { throw std::runtime_error( "Gather: number of output tensors should be 0 " "for non-root"); } } else { - if (outputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != outputTensors[0].size()) { throw std::runtime_error( "Gather: number of output tensors should equal " @@ -449,17 
+450,17 @@ std::shared_ptr ProcessGroupMPI::scatter( std::vector>& inputTensors, const ScatterOptions& opts) { checkSingleTensor(outputTensors); + if (inputTensors.size() != 1) { + throw std::runtime_error("Scatter: multi-GPU collective is not supported"); + } if (rank_ != opts.rootRank) { - if (inputTensors.size() > 0) { + if (inputTensors[0].size() > 0) { throw std::runtime_error( "Scatter: number of input tensors should be 0 " "for non-root"); } } else { - if (inputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != inputTensors[0].size()) { throw std::runtime_error( "Scatter: number of input tensors should equal " diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 1310d2d748c89c..daa03f9f585114 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -91,13 +91,14 @@ class _DistributedDataParallelC10d(Module): Args: module: module to be parallelized - process_group: the c10d process group to be used for distributed data - all-reduction device_ids: CUDA devices (default: all devices) output_device: device location of output (default: device_ids[0]) broadcast_buffers: flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function. (default: True) + process_group: the c10d process group to be used for distributed data + all-reduction. If None, the default process group will + be used bucket_cap_mb: DistributedDataParallelC10d will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. @@ -112,9 +113,9 @@ class _DistributedDataParallelC10d(Module): >>> pg = torch.distributed.c10d.ProcessGroupGloo(store, rank, world_size) >>> net = torch.nn._DistributedDataParallelC10d(model, pg) """ - def __init__(self, module, process_group, device_ids=None, + def __init__(self, module, device_ids=None, output_device=None, dim=0, broadcast_buffers=True, - bucket_cap_mb=25): + process_group=None, bucket_cap_mb=25): super(_DistributedDataParallelC10d, self).__init__() @@ -125,13 +126,19 @@ def __init__(self, module, process_group, device_ids=None, if output_device is None: output_device = device_ids[0] + if process_group is None: + self.process_group = c10d.get_default_group() + else: + self.process_group = process_group + self.dim = dim self.module = module - self.process_group = process_group self.device_ids = device_ids self.output_device = output_device self.broadcast_buffers = broadcast_buffers + self.allreduce_opts = c10d.AllreduceOptions() + MB = 1024 * 1024 # used for intra-node param sync and inter-node sync as well @@ -341,7 +348,8 @@ def _queue_reduction(self, bucket_idx): nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) # now work on the first gpu - reduction_work = c10d.all_reduce(grads_batch_coalesced[0], self.process_group) + reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], + self.allreduce_opts) self.reduction_works[bucket_idx] = reduction_work self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] From c99a143eea5224272e52b58a7714eec805671cab Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Wed, 29 Aug 2018 13:27:03 -0700 Subject: [PATCH 16/42] Update blackbox predictor with new constructor (#10920) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10920 Update the black box predictor and the related code to use the constructor with 
PredictorConfig. Reviewed By: highker Differential Revision: D9516972 fbshipit-source-id: fbd7ece934d527e17dc6bcc740b4e67e778afa1d --- caffe2/predictor/predictor_config.cc | 58 ++++++++++++++++++++++------ caffe2/predictor/predictor_config.h | 3 +- caffe2/predictor/predictor_utils.cc | 51 ++++++++++++++++++++++++ caffe2/predictor/predictor_utils.h | 10 +++++ 4 files changed, 109 insertions(+), 13 deletions(-) diff --git a/caffe2/predictor/predictor_config.cc b/caffe2/predictor/predictor_config.cc index aabff0daffcd73..0ca120d0121da5 100644 --- a/caffe2/predictor/predictor_config.cc +++ b/caffe2/predictor/predictor_config.cc @@ -10,7 +10,7 @@ namespace { // We don't use the getNet() from predictor_utils.cc here because that file // has additional dependencies that we want to avoid bringing in, to keep the // binary size as small as possible. -const NetDef& getNet(const MetaNetDef& def, const std::string& name) { +static const NetDef& getNet(const MetaNetDef& def, const std::string& name) { for (const auto& n : def.nets()) { if (n.key() == name) { return n.value(); @@ -19,7 +19,7 @@ const NetDef& getNet(const MetaNetDef& def, const std::string& name) { CAFFE_THROW("Net not found: ", name); } -const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( +static const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( const MetaNetDef& def, const std::string& name) { for (const auto& b : def.blobs()) { @@ -30,26 +30,60 @@ const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( CAFFE_THROW("Blob not found: ", name); } +static std::string combine(const std::string& str, const std::string& name) { + if (name.empty()) { + return std::string(str); + } + return str + "_" + name; +} + +static std::string getNamedPredictNet(const string& name) { + return combine(PredictorConsts::default_instance().predict_net_type(), name); +} + +static std::string getNamedInitNet(const string& name) { + return combine( + PredictorConsts::default_instance().predict_init_net_type(), name); +} + +static std::string getNamedInputs(const string& name) { + return combine(PredictorConsts::default_instance().inputs_blob_type(), name); +} + +static std::string getNamedOutputs(const string& name) { + return combine(PredictorConsts::default_instance().outputs_blob_type(), name); +} + +static std::string getNamedParams(const string& name) { + return combine( + PredictorConsts::default_instance().parameters_blob_type(), name); +} + } // namespace -PredictorConfig -makePredictorConfig(const MetaNetDef& def, Workspace* parent, bool run_init) { - const auto& init_net = - getNet(def, PredictorConsts::default_instance().global_init_net_type()); - const auto& run_net = - getNet(def, PredictorConsts::default_instance().predict_net_type()); +PredictorConfig makePredictorConfig( + const MetaNetDef& def, + Workspace* parent, + bool run_init, + const std::string& net_name) { + const auto& init_net = getNet(def, getNamedInitNet(net_name)); + const auto& run_net = getNet(def, getNamedPredictNet(net_name)); auto config = makePredictorConfig(init_net, run_net, parent, run_init); - const auto& inputs = - getBlobs(def, PredictorConsts::default_instance().inputs_blob_type()); + const auto& inputs = getBlobs(def, getNamedInputs(net_name)); for (const auto& input : inputs) { config.input_names.emplace_back(input); } - const auto& outputs = - getBlobs(def, PredictorConsts::default_instance().outputs_blob_type()); + const auto& outputs = getBlobs(def, getNamedOutputs(net_name)); for (const auto& output : outputs) { 
config.output_names.emplace_back(output); } + + const auto& params = getBlobs(def, getNamedParams(net_name)); + for (const auto& param : params) { + config.parameter_names.emplace_back(param); + } + return config; } diff --git a/caffe2/predictor/predictor_config.h b/caffe2/predictor/predictor_config.h index eda1c9d03ca2ba..b1555addfa6f08 100644 --- a/caffe2/predictor/predictor_config.h +++ b/caffe2/predictor/predictor_config.h @@ -45,7 +45,8 @@ CAFFE2_API Workspace makeWorkspace(std::shared_ptr paramete CAFFE2_API PredictorConfig makePredictorConfig( const MetaNetDef& net, Workspace* parent = nullptr, - bool run_init = true); + bool run_init = true, + const std::string& net_name = ""); CAFFE2_API PredictorConfig makePredictorConfig( const NetDef& init_net, diff --git a/caffe2/predictor/predictor_utils.cc b/caffe2/predictor/predictor_utils.cc index 4af83d0bea8c25..f5acd4f936010b 100644 --- a/caffe2/predictor/predictor_utils.cc +++ b/caffe2/predictor/predictor_utils.cc @@ -1,4 +1,5 @@ #include "caffe2/predictor/predictor_utils.h" +#include "caffe2/predictor/predictor_config.h" #include "caffe2/core/blob.h" #include "caffe2/core/logging.h" @@ -6,6 +7,13 @@ #include "caffe2/proto/predictor_consts.pb.h" #include "caffe2/utils/proto_utils.h" +CAFFE2_DEFINE_bool( + caffe2_predictor_claim_tensor_memory, + true, + "If false, then predictor will not claim tensor memory" + "otherwise when tensor is shrinked to a size smaller than current size " + "by FLAGS_caffe2_max_keep_on_shrink_memory, the memory will be claimed."); + namespace caffe2 { namespace predictor_utils { @@ -79,4 +87,47 @@ std::unique_ptr runGlobalInitialization( } } // namespace predictor_utils + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws) { + for (const auto& blob : input_blobs) { + ws->RemoveBlob(blob); + } + for (const auto& blob : output_blobs) { + ws->RemoveBlob(blob); + } +} + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path) { + // TODO: Remove this flags once Predictor accept PredictorConfig as + // constructors. These comes are copied temporarly from the Predictor. + if (FLAGS_caffe2_predictor_claim_tensor_memory) { + if (FLAGS_caffe2_max_keep_on_shrink_memory == LLONG_MAX) { + FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 1024 * 1024; + } + } + auto dbReader = + make_unique(db::CreateDB(db_type, db_path, db::READ)); + auto ws = std::make_shared(); + auto net_def = + predictor_utils::runGlobalInitialization(std::move(dbReader), ws.get()); + auto config = makePredictorConfig(*net_def, ws.get()); + config.ws = ws; + const auto& init_net = predictor_utils::getNet( + *net_def, PredictorConsts::default_instance().predict_init_net_type()); + CAFFE_ENFORCE(config.ws->RunNetOnce(init_net)); + config.ws->RemoveBlob( + PredictorConsts::default_instance().predictor_dbreader()); + // Input and output blobs should never be allocated in the master workspace + // since we'll end up with race-conditions due to these being shared among + // predictor threads / TL workspaces. Safely handle against globalInitNet + // creating them in the master. 
+ removeExternalBlobs(config.input_names, config.output_names, config.ws.get()); + return config; +} + } // namespace caffe2 diff --git a/caffe2/predictor/predictor_utils.h b/caffe2/predictor/predictor_utils.h index 8c9cb4a5792d48..af7799b039c8b7 100644 --- a/caffe2/predictor/predictor_utils.h +++ b/caffe2/predictor/predictor_utils.h @@ -24,4 +24,14 @@ CAFFE2_API std::unique_ptr runGlobalInitialization( Workspace* master); } // namespace predictor_utils + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path); + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws); + } // namespace caffe2 From cd9416317d4c66c9a15caed3e47bbaa0469f40e0 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 13:29:34 -0700 Subject: [PATCH 17/42] Minor copy-edit on setup.py Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10933 Reviewed By: cpuhrsch Differential Revision: D9526650 fbshipit-source-id: 8ad1c989bee7009b3f95a2641189f55cf6c1979f --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index a2dbff45a29025..e2446a1494d9f8 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,8 @@ # TORCH_CUDA_ARCH_LIST # specify which CUDA architectures to build for. # ie `TORCH_CUDA_ARCH_LIST="6.0;7.0"` +# These are not CUDA versions, instead, they specify what +# classes of NVIDIA hardware we should generate PTX for. # # ONNX_NAMESPACE # specify a namespace for ONNX built here rather than the hard-coded From b644d5e74a7a70dfbcb6be83e06d0288c8769c3d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 13:50:54 -0700 Subject: [PATCH 18/42] Delete context and get_context from Type. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11001 Reviewed By: cpuhrsch Differential Revision: D9557315 fbshipit-source-id: b9862b8dda49194298bb1a4fbc214d466f3c8350 --- aten/src/ATen/UndefinedType.cpp | 4 ++-- aten/src/ATen/UndefinedType.h | 2 +- aten/src/ATen/gen.py | 2 +- aten/src/ATen/native/cuda/Gesv.cu | 2 +- aten/src/ATen/templates/RegisterCPU.cpp | 2 +- aten/src/ATen/templates/SparseTypeDerived.cpp | 6 +++--- aten/src/ATen/templates/Type.cpp | 4 ++-- aten/src/ATen/templates/Type.h | 7 ++----- aten/src/ATen/templates/TypeDerived.cpp | 6 +++--- aten/src/ATen/templates/TypeDerived.h | 2 +- tools/autograd/templates/VariableType.cpp | 2 +- 11 files changed, 18 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 60d9c884b8aef2..2bc3965c6d33ae 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -3,8 +3,8 @@ namespace at { -UndefinedType::UndefinedType(Context* context) - : Type(context, UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} +UndefinedType::UndefinedType() + : Type(UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 2cb14a3a652c4f..d216e3131dd693 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -12,7 +12,7 @@ namespace at { struct UndefinedType final : public Type { - explicit UndefinedType(Context* context); + explicit UndefinedType(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py 
index 53879e56ffb342..bb6d71f54c2d1a 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -125,7 +125,7 @@ def check_all_files_written(self): TYPE_REGISTER = CodeTemplate("""\ context->type_registry[static_cast(Backend::${backend})] [static_cast(ScalarType::${scalar_type})] - .reset(new ${type_name}(context)); + .reset(new ${type_name}()); detail::getVariableHooks().registerVariableTypeFor(context, Backend::${backend}, ScalarType::${scalar_type}); """) diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 0692dd0fea2901..bc37e83990e192 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -48,7 +48,7 @@ void magmaGesvBatched( } static magma_queue_t createMagmaQueue(const Tensor& tensor) { - auto& context = tensor.type().get_context(); + auto& context = at::globalContext(); magma_queue_t magma_queue; magma_queue_create_from_cuda( tensor.get_device(), diff --git a/aten/src/ATen/templates/RegisterCPU.cpp b/aten/src/ATen/templates/RegisterCPU.cpp index 184af2c8c014da..0c1eeb4818fbbc 100644 --- a/aten/src/ATen/templates/RegisterCPU.cpp +++ b/aten/src/ATen/templates/RegisterCPU.cpp @@ -14,7 +14,7 @@ namespace at { void register_cpu_types(Context * context) { ${cpu_type_registrations} context->type_registry[static_cast(Backend::Undefined)] - [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); + [static_cast(ScalarType::Undefined)].reset(new UndefinedType()); } } // namespace at diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 4a17004bb5ff8c..2ef9dbf398fa2f 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -27,8 +27,8 @@ namespace at { -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -58,7 +58,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { AT_ERROR("unsafeTensorFromTH not supported on sparse"); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 90dbbb810ee30d..ff154971e7bffb 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -38,10 +38,10 @@ Tensor Type::copy(const Tensor & src, bool non_blocking) const { } Type & Type::toBackend(Backend b) const { - return context->getType(b,scalarType()); + return at::globalContext().getType(b,scalarType()); } Type & Type::toScalarType(ScalarType s) const { - return context->getType(backend(),s); + return at::globalContext().getType(backend(),s); } static std::vector defaultStrides(IntList sizes) { std::vector strides(sizes.size()); diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 884bd3a3bdff76..10c52ac14b6975 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -45,8 +45,8 @@ enum class TypeID { }; struct AT_API Type { - explicit Type(Context* context, TensorTypeId type_id, bool is_variable, bool is_undefined) - : context(context), type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} 
+ explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} virtual ~Type() {} virtual ScalarType scalarType() const = 0; virtual Backend backend() const = 0; @@ -79,8 +79,6 @@ struct AT_API Type { Type & cuda() const { return this->toBackend(at::backendToCUDA(this->backend())); } - Context& get_context() const { return *context; } - // contiguous IDs for all types in the system // for external dispatch virtual TypeID ID() const = 0; @@ -111,7 +109,6 @@ struct AT_API Type { // virtual Tensor * add(Tensor & a, Tensor & b) = 0; ${type_method_declarations} protected: - Context* context; TensorTypeId type_id_; bool is_variable_; bool is_undefined_; diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index fbafed82b57e02..4335a8f2209a20 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -38,8 +38,8 @@ static int getPointerDevice(void* ptr) { } #endif -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -99,7 +99,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { return Storage((${THStorage}*) th_pointer); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index e8613b62a333be..ec08e1a336daf6 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -16,7 +16,7 @@ namespace at { struct ${Type} final : public Type { - explicit ${Type}(Context* context); + explicit ${Type}(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 89101a24714b72..244606ca7938d7 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -43,7 +43,7 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { VariableType::VariableType(Context* context, Type* baseType) - : Type(context, baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) + : Type(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) , baseType(baseType) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; From f687ff5a59f18120617b6ce0c45d4335f0ff65ab Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 13:58:53 -0700 Subject: [PATCH 19/42] Delete unnecessary includes from TensorImpl.h (#11005) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11005 Reviewed By: smessmer Differential Revision: D9558300 Pulled By: ezyang fbshipit-source-id: ebebb3c6d3a1a2f7cc3da9fe9d3c56310ead46e1 --- aten/src/ATen/TensorImpl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 30b34cabec769f..8976acb6a40904 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -3,8 +3,6 @@ #include #include -#include "ATen/Retainable.h" -#include "ATen/StorageImpl.h" #include "ATen/Storage.h" #include "ATen/core/optional.h" #include "ATen/core/TensorTypeId.h" From e9eed8edb438ec5ed6b950225a6d315e30da4b70 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 29 Aug 2018 14:09:24 -0700 Subject: [PATCH 20/42] Add doc for Tensor.digamma_? (#11008) Summary: follow up for #10967 zou3519 vishwakftw Pull Request resolved: https://github.com/pytorch/pytorch/pull/11008 Differential Revision: D9559889 Pulled By: SsnL fbshipit-source-id: a05d8fbad92a54bcdb93de6e62a7f94180da1d99 --- test/test_torch.py | 2 -- torch/_tensor_docs.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index 5167ac618bba75..167a400ec91473 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -179,8 +179,6 @@ def test_namespace(ns, *skips): 'as_strided_', re.compile('^clamp_(min|max)_?$'), 'coalesce', - 'digamma', - 'digamma_', 'index_put', 'is_coalesced', 'is_distributed', diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 39d14668958c94..0a76a89a20d55a 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -650,6 +650,20 @@ def add_docstr_all(method, docstr): See :func:`torch.diagonal` """) +add_docstr_all('digamma', + r""" +digamma() -> Tensor + +See :func:`torch.digamma` +""") + +add_docstr_all('digamma_', + r""" +digamma_() -> Tensor + +In-place version of :meth:`~Tensor.digamma` +""") + add_docstr_all('dim', r""" dim() -> int From 0b1de747329250fbb290411d819bf10edbc1b858 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 29 Aug 2018 14:22:23 -0700 Subject: [PATCH 21/42] Documentation improvement in caffe2/core/tensor.h (#11006) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11006 Reviewed By: smessmer Differential Revision: D9558383 Pulled By: ezyang fbshipit-source-id: 7d36fb69a6e8a7d064da2c8796dc263a9fd4e094 --- caffe2/core/tensor.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 23740cfc5772e5..21dc126c7f2c0b 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -25,7 +25,7 @@ inline vector ToVectorTIndex(const std::vector& src) { } /** - * Return product of all dimensions starting from K + * Return product of all dimensions starting from k */ inline TIndex size_from_dim_(int k, const vector& dims) { TIndex r = 1; @@ -35,7 +35,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { return r; } -// Product of all dims up to +// Product of all dims up to k (not including dims[k]) inline TIndex size_to_dim_(int k, const vector& dims) { CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; @@ -61,6 +61,7 @@ inline TIndex size_between_dim_(int k, int l, const vector& dims) { return r; } +// Wrap around axis_index if it is negative, s.t., -1 is the last dim inline int canonical_axis_index_(int axis_index, int ndims) { CAFFE_ENFORCE_GE(axis_index, -ndims); CAFFE_ENFORCE_LT(axis_index, ndims); From 6a8bc3804ac72f7c946038289b088e50a2672891 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Wed, 29 Aug 2018 14:25:49 -0700 Subject: [PATCH 22/42] Add flush to logging messages higher than INFO. (#10983) Summary: This probably fixes the logging test error that orionr is encountering - haven't tested locally but wanted to send out a PR to kick off CI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10983 Reviewed By: ezyang Differential Revision: D9552607 Pulled By: Yangqing fbshipit-source-id: 9ac019031ffd9c03972144df04a836e5dcdafe02 --- caffe2/core/logging.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index ec7518630c9525..a394f91c729b87 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -236,6 +236,12 @@ MessageLogger::~MessageLogger() { if (severity_ >= FLAGS_caffe2_log_level) { // If not building on Android, log all output to std::cerr. std::cerr << stream_.str(); + // Simulating the glog default behavior: if the severity is above INFO, + // we flush the stream so that the output appears immediately on std::cerr. + // This is expected in some of our tests. 
+ if (severity_ > INFO) { + std::cerr << std::flush; + } } #endif // ANDROID if (severity_ == FATAL) { From 22e3b2c9c369c5fb44476eb538fa0a308df94eff Mon Sep 17 00:00:00 2001 From: Zhanibek Datbayev Date: Wed, 29 Aug 2018 14:37:39 -0700 Subject: [PATCH 23/42] Revert D9413150: [New Checkpoint] Kill the dummy TaskOutput when task.get_step() Differential Revision: D9413150 Original commit changeset: 51aaf3201e26 fbshipit-source-id: ac7c4c0960db03f344fe3eb2ad7f0e034db2371a --- caffe2/python/checkpoint_test.py | 4 +-- caffe2/python/core_test.py | 4 +-- caffe2/python/task.py | 48 +++++++++++++++++++------------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py index afba3dddcd5aae..a91bbf9910e29a 100644 --- a/caffe2/python/checkpoint_test.py +++ b/caffe2/python/checkpoint_test.py @@ -161,9 +161,9 @@ def test_ckpt_name_and_load_model_from_ckpts(self): num_epochs = job_runner.train(session) self.assertEquals(num_epochs, len(EXPECTED_TOTALS)) - # There are 15 global blobs after finishing up the job runner. + # There are 17 global blobs after finishing up the job runner. # (only blobs on init_group are checkpointed) - self.assertEquals(len(ws.blobs), 15) + self.assertEquals(len(ws.blobs), 17) ws = workspace.C.Workspace() session = LocalSession(ws) diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index d989471a16bab2..7120843f33152d 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -533,8 +533,8 @@ def test_create_plan_from_proto_correctly(self): self.assertEqual(len(plan.Steps()), 1) self.assertEqual(len(test_plan.Steps()), 1) - self.assertEqual(len(plan.Proto().network), 8) - self.assertEqual(len(test_plan.Proto().network), 8) + self.assertEqual(len(plan.Proto().network), 9) + self.assertEqual(len(test_plan.Proto().network), 9) self.assertEqual(len(plan.Proto().execution_step), 1) self.assertEqual(len(test_plan.Proto().execution_step), 1) self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name()) diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 5aafdf63c3b28a..311211dfdff3ee 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -150,7 +150,7 @@ def add_setup_steps(step, init_nets, exit_nets, name): if init_nets: steps.append(core.execution_step('%s:init' % name, init_nets)) steps.append(step) - if exit_nets: + if len(exit_nets) > 0: steps.append(core.execution_step('%s:exit' % name, exit_nets)) return core.execution_step(name, steps) @@ -215,11 +215,10 @@ def add(self, task): self._tasks.append(task) def tasks(self): - if not self._already_used: - for task in self._tasks_to_add: - self.add(task) - self._tasks_to_add = [] - self._already_used = True + for task in self._tasks_to_add: + self.add(task) + self._tasks_to_add = [] + self._already_used = True return self._tasks def num_registered_tasks(self): @@ -228,7 +227,7 @@ def num_registered_tasks(self): def used_nodes(self): # use list to keep order used = [] - for task in self.tasks(): + for task in self._tasks + self._tasks_to_add: if task.node not in used: used.append(task.node) return used @@ -260,8 +259,9 @@ def tasks_by_node(self, node_remap=None): # tasks_by_node can't be called twice because the setup won't # work properly a second time. 
node_map = {} - for node in self.used_nodes(): - node_map[node] = node_remap(node) if node_remap else node + for task in self.tasks(): + node_map[task.node] =\ + node_remap(task.node) if node_remap else task.node if self._tasks_by_node is not None: tasks_by_node, prev_node_map = self._tasks_by_node assert prev_node_map == node_map, ( @@ -285,7 +285,11 @@ def tasks_by_node(self, node_remap=None): grouped_by_node = TaskGroup() for node, tasks in viewitems(tasks_by_node): report_steps = report_steps_by_node[node] - + node_inits, node_exits = get_setup_nets( + TaskGroup.LOCAL_SETUP, + [t.get_step() for t in tasks] + report_steps, + self) + # shortcut for single task with no queue steps = report_steps outputs = [] grouped_workspace_type = WorkspaceType.PRIVATE @@ -307,15 +311,16 @@ def tasks_by_node(self, node_remap=None): else: step = core.execution_step( '%s:body' % node, steps, concurrent_substeps=True) - - # Prepend and append setup nets. - node_inits, node_exits = get_setup_nets( - TaskGroup.LOCAL_SETUP, - [t.get_step() for t in tasks] + report_steps, - self, - ) - step = add_setup_steps(step, node_inits, node_exits, node) - + if len(node_inits) > 0 or len(node_exits) > 0: + steps = [] + if len(node_inits) > 0: + steps.append( + core.execution_step('%s:init' % node, node_inits)) + steps.append(step) + if len(node_exits) > 0: + steps.append( + core.execution_step('%s:exit' % node, node_exits)) + step = core.execution_step(node, steps) Task( node=node, step=step, outputs=outputs, name='grouped_by_node', @@ -577,6 +582,11 @@ def get_step(self): Task.TASK_SETUP, [self._step] + report_steps, self) instance_init_nets, instance_exit_nets = get_setup_nets( Task.TASK_INSTANCE_SETUP, [self._step] + report_steps, self) + if len(self._outputs) == 0: + output_net = core.Net('%s:output' % self.name) + self.add_output(output_net.ConstantFill( + [], 1, dtype=core.DataType.INT32, value=0)) + task_exit_nets.append(output_net) # Add instance-level report steps body = self._step if not report_steps else core.execution_step( From 89834dfe647d246f5bd3549a884e31ae602a25bd Mon Sep 17 00:00:00 2001 From: Tommy Yu Date: Wed, 29 Aug 2018 14:47:36 -0700 Subject: [PATCH 24/42] Add GPU version of HardSigmoid Op to Caffe2 (#10955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10955 Add GPU version of HardSigmoid Op to Caffe2. Updated test file to include GPU tests. 
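
For reference, a rough Python sketch of exercising the new CUDA kernel through the usual Caffe2 workflow (this assumes a CUDA build; the blob names and the alpha/beta values are made up for illustration, not taken from the op's defaults):

```python
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

# HardSigmoid computes Y = max(0, min(1, alpha * X + beta)); run it on GPU 0.
device = core.DeviceOption(caffe2_pb2.CUDA, 0)
op = core.CreateOperator(
    "HardSigmoid", ["X"], ["Y"],
    alpha=0.2, beta=0.5,  # illustrative values
    device_option=device)

workspace.FeedBlob("X", np.random.randn(4, 8).astype(np.float32), device)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y"))
```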
Reviewed By: enosair Differential Revision: D9499353 fbshipit-source-id: fcb51902063d0c3e4b10354533a8a42cf827c545 --- caffe2/operators/hard_sigmoid_op.cu | 91 +++++++++++++++++++ .../operator_test/elementwise_ops_test.py | 2 +- 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 caffe2/operators/hard_sigmoid_op.cu diff --git a/caffe2/operators/hard_sigmoid_op.cu b/caffe2/operators/hard_sigmoid_op.cu new file mode 100644 index 00000000000000..ed3a4ec8286888 --- /dev/null +++ b/caffe2/operators/hard_sigmoid_op.cu @@ -0,0 +1,91 @@ +#include "caffe2/operators/hard_sigmoid_op.h" + +#include +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +__global__ void HardSigmoidCUDAKernel( + const int N, + const T alpha, + const T beta, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + Y[i] = max(T(0), min(T(1), alpha * __ldg(X + i) + beta)); +#else + Y[i] = max(T(0), min(T(1), alpha * X[i] + beta)); +#endif + } +} + +template +__global__ void HardSigmoidGradientCUDAKernel( + const int N, + const T alpha, + const T* dY, + const T* Y, + T* dX) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = (__ldg(Y + i) > T(0) && __ldg(Y + i) < T(1)) ? __ldg(dY + i) * alpha + : T(0); +#else + dX[i] = (Y[i] > T(0) && Y[i] < T(1)) ? dY[i] * alpha : T(0); +#endif + } +} + +} // namespace + +template <> +template +bool HardSigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + HardSigmoidCUDAKernel + <<cuda_stream()>>>(N, alpha, beta, X, Y); + return true; +} + +template <> +template +bool HardSigmoidGradientFunctor::Forward( + const std::vector& Y_dims, + const std::vector& /* dY_dims */, + const T* Y, + const T* dY, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies()); + HardSigmoidGradientCUDAKernel + <<cuda_stream()>>>(size, alpha, dY, Y, dX); + return true; +} + +REGISTER_CUDA_OPERATOR( + HardSigmoid, + UnaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidFunctor>); +REGISTER_CUDA_OPERATOR( + HardSigmoidGradient, + BinaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidGradientFunctor>); + +} // namespace caffe2 diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 0e590307a88858..c20aad4218f17e 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -338,7 +338,7 @@ def sigmoid_ref(X): alpha=st.floats(min_value=-100.0, max_value=100.0), beta=st.floats(min_value=-100.0, max_value=100.0), engine=st.sampled_from([""]), - **hu.gcs_cpu_only) + **hu.gcs) def test_hard_sigmoid(self, X, inplace, alpha, beta, engine, gc, dc): # Prevent alpha and beta from mutually being 0 to avoid a division # error when adjusting our inputs From c755616e006efe011726105e2d7a1d7502c989b9 Mon Sep 17 00:00:00 2001 From: jgong5 Date: Wed, 29 Aug 2018 14:56:55 -0700 Subject: [PATCH 25/42] Enable Detectron model inference for CPU and MKL-DNN paths (#10157) Summary: 1. Support ops needed for inference of Faster-RCNN/Mask-RCNN needed in Detectron, mostly direct fallbacks. 2. Use CPU device to hold 0-dim tensors and integer tensors in both fallback op and blob feeder, needed by Detectron models. 3. Ignore 0-dim tensor in MKL-DNN concat operator. 4. Generate dynamic library of Detectron module for CPU device. This PR obsoletes #9164. 
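
As a sketch of how the new fallbacks get exercised from Python (assuming a build with MKL-DNN/IDEEP enabled; the blob names and shapes are illustrative and loosely mirror the new operator_fallback_op_test):

```python
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

# AffineChannel has no native IDEEP kernel, so it runs through the CPU
# fallback while its inputs/outputs stay on the IDEEP device. It is run
# in place here, as in the new test, to cover the fallback in-place path.
device = core.DeviceOption(caffe2_pb2.IDEEP)
op = core.CreateOperator(
    "AffineChannel", ["X", "scale", "bias"], ["X"],
    is_learnable=False, device_option=device)

workspace.FeedBlob("X", np.random.rand(1, 3, 8, 8).astype(np.float32), device)
workspace.FeedBlob("scale", np.random.rand(3).astype(np.float32), device)
workspace.FeedBlob("bias", np.random.rand(3).astype(np.float32), device)
workspace.RunOperatorOnce(op)
```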
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10157 Differential Revision: D9276837 Pulled By: yinghai fbshipit-source-id: dc364932ae4a2e7fcefdee70b5fce3c0cee91b6f --- caffe2/ideep/operators/concat_split_op.cc | 12 +- .../operators/operator_fallback_ideep.cc | 8 + .../ideep/operators/operator_fallback_ideep.h | 63 ++++---- .../python/ideep/operator_fallback_op_test.py | 99 ++++++++++++ caffe2/python/pybind_state_ideep.cc | 141 ++++++++++-------- modules/detectron/CMakeLists.txt | 4 + modules/detectron/batch_permutation_op.cc | 10 ++ modules/detectron/upsample_nearest_op.cc | 9 ++ modules/detectron/upsample_nearest_op.h | 46 +++++- 9 files changed, 299 insertions(+), 93 deletions(-) create mode 100644 caffe2/python/ideep/operator_fallback_op_test.py diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index eb2d5b6acf1a61..25d4e16d2f9e7a 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -25,13 +25,21 @@ class IDEEPConcatOp final : public IDEEPOperator { virtual ~IDEEPConcatOp() {} bool RunOnDevice() override { - const auto& input_zero = Input(INPUT0); auto* output = Output(OUTPUT); TensorCPU* axis_info = OperatorBase::Output(AXIS_INFO, CPU); vector inputs; for (int i = 0; i < InputSize(); ++i) { - inputs.emplace_back(Input(i)); + if (OperatorBase::InputBlob(i).template IsType()) { + inputs.emplace_back(Input(i)); + } else { + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + "Expect cpu tensor if not itensor"); + auto& tensor_cpu = OperatorBase::Input(i, CPU); + CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || + tensor_cpu.size_from_dim(0) == 0, + "Expect zero dim tensor"); + } } auto axis_vdata = ideep::concat::compute(inputs, axis_, add_axis_, *output); diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index 8251b386eeb3c7..75895c5d844345 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -116,6 +118,12 @@ REGISTER_IDEEP_OPERATOR( REGISTER_IDEEP_OPERATOR( BBoxTransform, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + AffineChannel, + IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + StopGradient, + IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( PadImage, diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index ae4f903c23c2fc..31df729a217850 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -53,6 +53,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { // then forward output blobs to local workspace. std::unordered_map forwarded_output_blobs; for (int i = 0; i < base_def_.output_size(); i++) { + // For in-place case, the in/output tensor for local_ws must be + // re-created, instead of forwarding from current workspace. 
string parent_name(base_def_.output(i)); if (!SkipOutputCopy::Contains(i)) { parent_name += "_cpu_output_blob_" + base_def_.type(); @@ -60,6 +62,13 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_output_blobs_.push_back(ws->CreateBlob(parent_name)); CHECK_NOTNULL(local_output_blobs_.back()); forwarded_output_blobs[base_def_.output(i)] = parent_name; + output_inplace_.push_back(false); + for (const string &input_name : base_def_.input()) { + if (input_name == base_def_.output(i)) { + output_inplace_[i] = true; + break; + } + } } local_ws_.reset(new Workspace(ws, forwarded_output_blobs)); // Set up the symbols for the local workspace. @@ -67,31 +76,26 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_.push_back(local_ws_->CreateBlob(name)); CHECK_NOTNULL(local_input_blobs_.back()); } + input_share_.resize(local_input_blobs_.size(), false); base_op_.reset(new CPUOp(base_def_, local_ws_.get())); } bool RunOnDevice() override { for (int i = 0; i < InputSize(); ++i) { - if (InputIsType(i) && Input(i).get_data_type() == itensor::data_type::f32) { + if (InputIsType(i) && + Input(i).get_data_type() == itensor::data_type::f32) { auto& input = Input(i); - auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); - dtensor->Resize(input.get_dims()); - if (input.is_public_format()) { - dtensor->ShareExternalPointer(static_cast(input.get_data_handle())); - } else { - input.reorder_to(dtensor->template mutable_data()); + if (input_share_[i]) { + local_input_blobs_[i]->Reset(); } - } else if ( - InputIsType(i) && - Input(i).get_data_type() == itensor::data_type::s32) { - auto& input = Input(i); + input_share_[i] = false; auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( - static_cast(input.get_data_handle())); + static_cast(input.get_data_handle())); } else { - input.reorder_to(dtensor->template mutable_data()); + input.reorder_to(dtensor->template mutable_data()); } } else { VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy."; @@ -99,8 +103,9 @@ class IDEEPFallbackOp final : public IDEEPOperator { // local_input_blobs will only be used as const blob input for the // base op so we are still fine. local_input_blobs_[i]->ShareExternal( - const_cast(OperatorBase::Inputs()[i]->GetRaw()), + const_cast(OperatorBase::Inputs()[i]->GetRaw()), OperatorBase::Inputs()[i]->meta()); + input_share_[i] = true; } } @@ -120,21 +125,16 @@ class IDEEPFallbackOp final : public IDEEPOperator { "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); - auto src_dims = src.dims(); - if (src.ndim() == 0) { - VLOG(1) << "Copy output: index " << i << " skipped."; + if (src.template IsType() && + src.dims().size() != 0 && src.size_from_dim(0) != 0 && + base_op_->type() != "Python") { Blob* dst = OperatorBase::OutputBlob(i); - dst->Reset(new Tensor(CPU)); - auto dtensor = dst->GetMutableTensor(CPU); - dtensor->Resize(src_dims); - dtensor->ShareData(src); - continue; - } - - if (src.template IsType()) { - Blob* dst = OperatorBase::OutputBlob(i); - if (!dst->template IsType()) { + // The output tensor must be ideep tensor with public format. + // If reusing ideep tensor with non-public format, the tensor buffer + // will be interpreted incorrectly. 
+ if (!dst->template IsType() || + !dst->template Get().is_public_format()) { dst->Reset(new itensor()); } @@ -143,7 +143,12 @@ class IDEEPFallbackOp final : public IDEEPOperator { if (dtensor->get_dims() != dst_dims) { dtensor->resize(dst_dims, itensor::data_type::f32); } - dtensor->set_data_handle(const_cast(src.raw_data())); + if (output_inplace_[i]) { + dtensor->reorder_from(dst_dims, itensor::data_type::f32, + const_cast(src.raw_data())); + } else { + dtensor->set_data_handle(const_cast(src.raw_data())); + } } else { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); @@ -159,6 +164,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { protected: vector local_input_blobs_; vector local_output_blobs_; + vector output_inplace_; + vector input_share_; std::unique_ptr base_op_; std::unique_ptr local_ws_; OperatorDef base_def_; diff --git a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py new file mode 100644 index 00000000000000..19bdbaac8a217e --- /dev/null +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -0,0 +1,99 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import hypothesis.strategies as st +from hypothesis import given +import numpy as np +from caffe2.python import core, workspace +from caffe2.proto import caffe2_pb2 +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class TestFallbackOps(hu.HypothesisTestCase): + @given(stride=st.integers(1, 3), + pad=st.integers(0, 3), + kernel=st.integers(3, 5), + size=st.integers(8, 10), + input_channels=st.integers(1, 3), + output_channels=st.integers(1, 5), + batch_size=st.integers(1, 3), + use_bias=st.booleans(), + **mu.gcs) + def test_in_place(self, stride, pad, kernel, size, + input_channels, output_channels, + batch_size, use_bias, gc, dc): + # To expose fallback in-place potential issue, the fallback op + # following ideep op must be run at least two iterations. 
+ conv = core.CreateOperator( + "Conv", + ["X", "w", "b"] if use_bias else ["X", "w"], + ["Y"], + stride=stride, + pad=pad, + kernel=kernel, + device_option=dc[0] + ) + X = np.random.rand( + batch_size, input_channels, size, size).astype(np.float32) - 0.5 + w = np.random.rand(output_channels, input_channels, kernel, kernel) \ + .astype(np.float32) - 0.5 + b = np.random.rand(output_channels).astype(np.float32) - 0.5 + + old_ws_name = workspace.CurrentWorkspace() + workspace.SwitchWorkspace("_device_check_", True) + workspace.FeedBlob('X', X, dc[0]) + workspace.FeedBlob('w', w, dc[0]) + workspace.FeedBlob('b', b, dc[0]) + workspace.RunOperatorOnce(conv) + Y = workspace.FetchBlob('Y') + + scale = np.random.randn(Y.shape[1]).astype(np.float32) + bias = np.random.randn(Y.shape[1]).astype(np.float32) + ac = core.CreateOperator( + "AffineChannel", + ["Y", "scale", "bias"], + ["Y"], + is_learnable=False, + device_option=dc[0] + ) + workspace.FeedBlob('scale', scale, dc[0]) + workspace.FeedBlob('bias', bias, dc[0]) + workspace.RunOperatorOnce(ac) + workspace.RunOperatorOnce(conv) + workspace.RunOperatorOnce(ac) + Y0 = workspace.FetchBlob('Y') + + workspace.ResetWorkspace() + dev_net = caffe2_pb2.NetDef() + conv_dev = caffe2_pb2.OperatorDef() + conv_dev.CopyFrom(conv) + conv_dev.device_option.CopyFrom(dc[1]) + ac_dev = caffe2_pb2.OperatorDef() + ac_dev.CopyFrom(ac) + ac_dev.device_option.CopyFrom(dc[1]) + dev_net.op.extend([conv_dev, ac_dev]) + workspace.FeedBlob('X', X, dc[1]) + workspace.FeedBlob('w', w, dc[1]) + workspace.FeedBlob('b', b, dc[1]) + workspace.FeedBlob('scale', scale, dc[1]) + workspace.FeedBlob('bias', bias, dc[1]) + workspace.RunNetOnce(dev_net) + workspace.RunNetOnce(dev_net) + Y1 = workspace.FetchBlob('Y') + + if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): + print(Y1.flatten()) + print(Y0.flatten()) + print(np.max(np.abs(Y1 - Y0))) + self.assertTrue(False) + + workspace.SwitchWorkspace(old_ws_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index 668c812cd8e1a8..056558c9a73335 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -9,6 +9,7 @@ #include #include +#include "caffe2/ideep/operators/operator_fallback_ideep.h" #include namespace caffe2 { @@ -19,42 +20,42 @@ USE_IDEEP_DEF_ALIASES(); class IDeepFetcher; class IDeepFeeder; -REGISTER_BLOB_FETCHER((TypeMeta::Id()),IDeepFetcher); +REGISTER_IDEEP_OPERATOR(Python, IDEEPFallbackOp>); + +REGISTER_BLOB_FETCHER((TypeMeta::Id()), IDeepFetcher); REGISTER_BLOB_FEEDER(IDEEP, IDeepFeeder); class IDeepFetcher : public BlobFetcherBase { TypeMeta type_transform(const itensor &atensor) { - switch(atensor.get_data_type()) { - case itensor::data_type::f32: - return TypeMeta::Make(); - case itensor::data_type::s16: - return TypeMeta::Make(); - case itensor::data_type::s32: - return TypeMeta::Make(); - case itensor::data_type::s8: - return TypeMeta::Make(); - case itensor::data_type::u8: - return TypeMeta::Make(); - default: - // Should we throw exception? - return TypeMeta(); + switch (atensor.get_data_type()) { + case itensor::data_type::f32: + return TypeMeta::Make(); + case itensor::data_type::s32: + return TypeMeta::Make(); + case itensor::data_type::s8: + return TypeMeta::Make(); + case itensor::data_type::u8: + return TypeMeta::Make(); + default: + // Should we throw exception? 
+ return TypeMeta(); } } - public: - pybind11::object Fetch(const Blob& blob) override { +public: + pybind11::object Fetch(const Blob &blob) override { try { return FetchTensor(blob.Get(), true).obj; - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; throw; } } - FetchedBlob FetchTensor(const itensor& atensor, bool force_copy) { + FetchedBlob FetchTensor(const itensor &atensor, bool force_copy) { FetchedBlob result; CAFFE_ENFORCE(atensor.materialized(), - "Trying to fetch uninitialized tensor"); + "Trying to fetch uninitialized tensor"); const int numpy_type = CaffeToNumpyType(type_transform(atensor)); CAFFE_ENFORCE( numpy_type != -1, @@ -64,17 +65,16 @@ class IDeepFetcher : public BlobFetcherBase { std::vector npy_dims(dims.begin(), dims.end()); result.copied = force_copy || atensor.need_reorder(); - void* outPtr; + void *outPtr; if (result.copied) { result.obj = py::reinterpret_steal( PyArray_SimpleNew(atensor.ndims(), npy_dims.data(), numpy_type)); outPtr = static_cast( - PyArray_DATA(reinterpret_cast(result.obj.ptr()))); + PyArray_DATA(reinterpret_cast(result.obj.ptr()))); } else { outPtr = atensor.get_data_handle(); - result.obj = py::reinterpret_steal( - PyArray_SimpleNewFromData( - atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); + result.obj = py::reinterpret_steal(PyArray_SimpleNewFromData( + atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); } if (numpy_type == NPY_OBJECT) { @@ -95,8 +95,6 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::f32; else if (meta == TypeMeta::Make()) return itensor::data_type::s32; - else if (meta == TypeMeta::Make()) - return itensor::data_type::s16; else if (meta == TypeMeta::Make()) return itensor::data_type::s8; else if (meta == TypeMeta::Make()) @@ -105,53 +103,74 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::data_undef; } - public: - void FeedTensor( - const DeviceOption& option, - PyArrayObject *original_array, - itensor *tensor) { - PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); - auto g = MakeGuard([&]() {Py_XDECREF(array); }); - - const auto npy_type = PyArray_TYPE(array); - const TypeMeta& meta = NumpyTypeToCaffe(npy_type); - CAFFE_ENFORCE( - meta.id() != TypeIdentifier::uninitialized(), +public: + void FeedTensor( + const DeviceOption &option, + PyArrayObject *original_array, + itensor *tensor) { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + CAFFE_ENFORCE_NE( + meta.id(), + TypeIdentifier::uninitialized(), "This numpy data type is not supported: ", - PyArray_TYPE(array), - "."); + PyArray_TYPE(array), "."); - int ndim = PyArray_NDIM(array); - npy_intp* npy_dims = PyArray_DIMS(array); + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); - itensor::dims adims; - for (int i = 0; i < ndim; i++) { - adims.push_back(static_cast( - npy_dims[i])); - } + itensor::dims adims; + for (int i = 0; i < ndim; i++) { + adims.push_back(static_cast(npy_dims[i])); + } - switch (npy_type) { + switch (npy_type) { case NPY_OBJECT: case NPY_UNICODE: CAFFE_THROW("IDeep doesn't support string"); break; default: auto type = type_transform(meta); - tensor->resize(adims, type); + if (tensor->get_dims() != adims || type != tensor->get_data_type()) { + tensor->resize(adims, type); + } 
tensor->reorder_from(adims, type, - static_cast(PyArray_DATA(array))); - } - } + static_cast(PyArray_DATA(array))); + } + } - void Feed(const DeviceOption& option, PyArrayObject* original_array, - Blob* blob) { - try { + bool ZeroDim(PyArrayObject *array) { + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); + return ndim == 0 || + std::find(npy_dims, npy_dims + ndim, 0) != npy_dims + ndim; + } + + void Feed(const DeviceOption &option, PyArrayObject *original_array, + Blob *blob) { + try { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + // TODO: if necessary, use dispatcher. + if (meta.Match() && !ZeroDim(original_array)) { FeedTensor(option, original_array, blob->GetMutable()); - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; - throw; + } else { + DeviceOption cpu_option(option); + cpu_option.set_device_type(DeviceType::CPU); + TensorFeeder cpu_tensor_feeder; + cpu_tensor_feeder.FeedTensor(cpu_option, original_array, + blob->GetMutableTensor(CPU)); } - } + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; + throw; + } + } }; } // namespace python diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt index f18077b829427b..1791ca27a98590 100644 --- a/modules/detectron/CMakeLists.txt +++ b/modules/detectron/CMakeLists.txt @@ -11,4 +11,8 @@ if (USE_CUDA) target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) +elseif(NOT IOS_PLATFORM) + add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) + target_link_libraries(caffe2_detectron_ops caffe2) + install(TARGETS caffe2_detectron_ops DESTINATION lib) endif() diff --git a/modules/detectron/batch_permutation_op.cc b/modules/detectron/batch_permutation_op.cc index f92d7dd236d758..032288f811de08 100644 --- a/modules/detectron/batch_permutation_op.cc +++ b/modules/detectron/batch_permutation_op.cc @@ -15,9 +15,19 @@ */ #include "batch_permutation_op.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + BatchPermutation, + IDEEPFallbackOp>); +#endif + REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp); REGISTER_CPU_OPERATOR( BatchPermutationGradient, diff --git a/modules/detectron/upsample_nearest_op.cc b/modules/detectron/upsample_nearest_op.cc index b668701b4ce4f4..4fc4d6dcd93a31 100644 --- a/modules/detectron/upsample_nearest_op.cc +++ b/modules/detectron/upsample_nearest_op.cc @@ -15,8 +15,17 @@ */ #include "upsample_nearest_op.h" +#ifdef CAFFE2_USE_IDEEP +#include "caffe2/ideep/operators/operator_fallback_ideep.h" +#include "caffe2/ideep/utils/ideep_operator.h" +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + UpsampleNearest, + IDEEPFallbackOp>); +#endif REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp); REGISTER_CPU_OPERATOR( diff --git a/modules/detectron/upsample_nearest_op.h b/modules/detectron/upsample_nearest_op.h index e24d705bc14afd..17f77855509e67 100644 --- a/modules/detectron/upsample_nearest_op.h +++ b/modules/detectron/upsample_nearest_op.h @@ -35,8 +35,50 @@ class UpsampleNearestOp final : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; + auto translate_idx = [](int ii, int d1, int d2, int 
d3, int scale_factor) { + int x, y, z, w; + w = ii % d3; + ii = ii/d3; + z = ii % d2; + ii = ii/d2; + y = ii % d1; + ii = ii/d1; + x = ii; + w = w/scale_factor; + z = z/scale_factor; + d2 /= scale_factor; + d3 /= scale_factor; + return (((x*d1+y)*d2)+z)*d3+w; + }; + + auto& X = Input(0); + auto* Y = Output(0); + auto out_shape = X.dims(); + out_shape[X.ndim() - 1] *= scale_; + out_shape[X.ndim() - 2] *= scale_; + Y->Resize(out_shape); + + int d1; + int d2; + int d3; + if (X.ndim() == 3) { + d1 = Y->dim32(0); + d2 = Y->dim32(1); + d3 = Y->dim32(2); + } else { + d1 = Y->dim32(1); + d2 = Y->dim32(2); + d3 = Y->dim32(3); + } + + const T *input_data = X.template data(); + T *output_data = Y->template mutable_data(); + + for (int ii = 0; ii < Y->size(); ii++) { + int ipidx = translate_idx(ii, d1, d2, d3, scale_); + output_data[ii] = input_data[ipidx]; + } + return true; } protected: From d9b74f6540abd8e969a9abae279cbc9055140709 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Wed, 29 Aug 2018 14:58:13 -0700 Subject: [PATCH 26/42] Make it possible to disable JIT using env variables (#10867) Summary: zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/10867 Differential Revision: D9556882 Pulled By: apaszke fbshipit-source-id: 04c0ca875d15d37dd9ac05ac7b515cd899ddb7e4 --- test/test_jit.py | 22 +++++++ torch/jit/__init__.py | 130 ++++++++++++++++++++++++++---------------- 2 files changed, 102 insertions(+), 50 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index d9d345b0e8fce0..e99203333dc386 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -396,6 +396,28 @@ def fn(x, y): self.assertEqual(fn(x, y), fn_traced(x, y)) + def test_disabled(self): + torch.jit._enabled = False + try: + def f(x, y): + return x + y + + self.assertIs(torch.jit.trace(torch.randn(2, 2), torch.randn(2, 2))(f), f) + self.assertIs(torch.jit.script(f), f) + + class MyModule(torch.jit.ScriptModule): + @torch.jit.script_method + def method(self, x): + return x + + # XXX: Unfortunately ScriptModule won't simply become Module now, + # because that requires disabling the JIT at startup time, which + # we can't do in here. + # We need to or those two conditions to make it work with all versions of Python + self.assertTrue(inspect.ismethod(MyModule.method) or inspect.isfunction(MyModule.method)) + finally: + torch.jit._enabled = True + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 30904ac7adff7d..e0314acea4a173 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -21,6 +21,25 @@ import collections import re + +def _parse_env(name, default, true_message, false_message): + value = os.environ.get(name) + if value is None: + return default + if value.lower() in {'1', 'true', 'yes'}: + return True + elif value.lower() in {'0', 'false', 'no'}: + return False + if value == '1v': + print(true_message) + return True + elif value == '0v': + print(false_message) + return False + raise ValueError('Unknown setting of {}. Try using 0 or 1.'.format(name)) + + +_enabled = _parse_env('PYTORCH_JIT', True, "> Using PyTorch JIT", "> PyTorch JIT DISABLED") _flatten = torch._C._jit_flatten _unflatten = torch._C._jit_unflatten _jit_script_compile = torch._C._jit_script_compile @@ -431,6 +450,8 @@ def trace(*args, **kwargs): ... 
return x * 2 """ def wrapper(func): + if not _enabled: + return func executor_options = {'optimize': True} for name in executor_options: executor_options[name] = kwargs.pop(name, executor_options[name]) @@ -509,6 +530,8 @@ def __getattr__(self, attr): def script(fn, optimize=True, _frames_up=0): + if not _enabled: + return fn rcb = createResolutionCallback(_frames_up + 1) ast = get_jit_ast(fn, is_method=False) graph = _jit_script_compile(ast, rcb) @@ -528,6 +551,8 @@ def script(fn, optimize=True, _frames_up=0): def script_method(fn): + if not _enabled: + return fn # NOTE: we need to traverse two frames here because the meta-class frame # for ScriptModule will be present, as opposed to invoking @script on a # a function or invoking define() on a CompilationUnit. @@ -547,6 +572,8 @@ def script_method(fn): def batch(batch_size=1, optimize=True, _frames_up=0): def decorator(fn): + if not _enabled: + return fn import torch.jit.batchop mod = script(fn, optimize, _frames_up) res_graph = torch.to_batch_graph(mod.graph) @@ -757,57 +784,60 @@ def init_then_register(self, *args, **kwargs): return super(ScriptMeta, cls).__init__(name, bases, attrs) -class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): - def __init__(self, optimize=True): - # must be before Module.init since the field is used in __getattr__ - Module.__init__(self) - self._set_optimized(optimize) - self._parameters = OrderedParameterDict(self) - self._buffers = OrderedBufferDict(self) - self._modules = OrderedModuleDict(self) - - def __getattr__(self, attr): - if self._has_method(attr): - if attr in self.__class__._original_methods: - original_method = self.__class__._original_methods[attr] - script_method = self._get_method(attr) - return functools.wraps(original_method)(script_method) +if _enabled: + class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): + def __init__(self, optimize=True): + # must be before Module.init since the field is used in __getattr__ + Module.__init__(self) + self._set_optimized(optimize) + self._parameters = OrderedParameterDict(self) + self._buffers = OrderedBufferDict(self) + self._modules = OrderedModuleDict(self) + + def __getattr__(self, attr): + if self._has_method(attr): + if attr in self.__class__._original_methods: + original_method = self.__class__._original_methods[attr] + script_method = self._get_method(attr) + return functools.wraps(original_method)(script_method) + else: + return self._get_method(attr) + if attr == 'graph' and self._has_method('forward'): + return self.__getattr__('forward').graph + return Module.__getattr__(self, attr) + + def __setattr__(self, attr, value): + if attr not in self._constants_set: + return super(ScriptModule, self).__setattr__(attr, value) + if hasattr(self, attr): + raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) + if isinstance(value, ModuleList): + # special case for list of modules. Modules need to be registered with their + # parent module. To do this, we create a ConstModuleList, which is itself a module, that + # contains each of these modules as submodules. The ConstModuleList then + # is set as an attribute of the parent module. 
+ super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) + elif isinstance(value, Sequential): + super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) else: - return self._get_method(attr) - if attr == 'graph' and self._has_method('forward'): - return self.__getattr__('forward').graph - return Module.__getattr__(self, attr) - - def __setattr__(self, attr, value): - if attr not in self._constants_set: - return super(ScriptModule, self).__setattr__(attr, value) - if hasattr(self, attr): - raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) - if isinstance(value, ModuleList): - # special case for list of modules. Modules need to be registered with their - # parent module. To do this, we create a ConstModuleList, which is itself a module, that - # contains each of these modules as submodules. The ConstModuleList then - # is set as an attribute of the parent module. - super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) - elif isinstance(value, Sequential): - super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) - else: - super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) - - def __dir__(self): - return sorted(Module.__dir__(self) + self._method_names()) - - def define(self, lang): - # We use frames_up=1 to get to the proper surrounding scope. The stack - # will look like: - # 0. createResolutionCallback - # 1. define() - # 2. surrounding scope. - # - # createResolutionCallback internally adds 1 to get us to our frame, then - # we add 1 to get to the proper surrounding scope. - rcb = createResolutionCallback(frames_up=1) - self._define(lang, rcb, True) + super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) + + def __dir__(self): + return sorted(Module.__dir__(self) + self._method_names()) + + def define(self, lang): + # We use frames_up=1 to get to the proper surrounding scope. The stack + # will look like: + # 0. createResolutionCallback + # 1. define() + # 2. surrounding scope. + # + # createResolutionCallback internally adds 1 to get us to our frame, then + # we add 1 to get to the proper surrounding scope. 
+ rcb = createResolutionCallback(frames_up=1) + self._define(lang, rcb, True) +else: + ScriptModule = torch.nn.Module def _get_methods(cls): From 6b87198245c29a73a4203576c5b8cb33bd71418d Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 15:28:03 -0700 Subject: [PATCH 27/42] Devirtualize StorageImpl deconstructor (#11018) Summary: Further align at::StorageImpl with caffe2::StorageImpl Pull Request resolved: https://github.com/pytorch/pytorch/pull/11018 Reviewed By: ezyang Differential Revision: D9562256 Pulled By: cpuhrsch fbshipit-source-id: d929317f6226a1e2550b78034b723afbae343aaa --- aten/src/ATen/StorageImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index a9394d53935636..a18318790eec2b 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -21,7 +21,7 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl() = delete; - virtual ~StorageImpl() {}; + ~StorageImpl() {}; StorageImpl( at::DataType data_type, ptrdiff_t size, From ef7fc2a3e15e9ff4dd242e8137306418bfb52c06 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 16:00:34 -0700 Subject: [PATCH 28/42] Remove at::StorageImpl::finalizer_ (#11022) Summary: Unused member variable Pull Request resolved: https://github.com/pytorch/pytorch/pull/11022 Reviewed By: ezyang Differential Revision: D9562520 Pulled By: cpuhrsch fbshipit-source-id: af190b3ba06d33d65fa0fabffb34a0df769f38d0 --- aten/src/ATen/StorageImpl.cpp | 3 +-- aten/src/ATen/StorageImpl.h | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index af488472f24b5b..bc2d69a7aa8f5d 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,8 +12,7 @@ StorageImpl::StorageImpl( data_ptr_(std::move(data_ptr)), size_(size), resizable_(resizable), - allocator_(allocator), - finalizer_(nullptr) {} + allocator_(allocator) {} StorageImpl::StorageImpl( at::DataType data_type, diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index a18318790eec2b..f484cadbdac973 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -61,10 +61,6 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { } void release_resources() override { - if (finalizer_) { - (*finalizer_)(); - } - finalizer_ = nullptr; data_ptr_.clear(); } @@ -135,6 +131,5 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { ptrdiff_t size_; bool resizable_; at::Allocator* allocator_; - std::unique_ptr finalizer_; }; } // namespace at From 98d85b1790fb2dd7600de7b6f18b00e20d9750d5 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Wed, 29 Aug 2018 16:13:10 -0700 Subject: [PATCH 29/42] Debugging help + test Summary: When conversion fails, dump more information to help fix up the netdef Reviewed By: hyuen, yinghai Differential Revision: D9558667 fbshipit-source-id: 8917cc61c9be6285697e4f8395a9dbc7135f618e --- caffe2/opt/converter.cc | 22 +++++++++++++++++----- caffe2/python/transformations_test.py | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 6a8d22253444a5..80e2308eabf3cd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -322,14 +322,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mappushInstructionNode(opNode); } - CAFFE_ENFORCE( - 
externalInputNames.size() == 0, - "Attempting to convert an ill-formed network: \ - external_input contains unused blobs"); + if (externalInputNames.size()) { + std::ostringstream os; + for (const auto& inputName : externalInputNames) { + os << "\"" << inputName << "\" "; + } + + CAFFE_ENFORCE( + externalInputNames.size() == 0, + "Attempting to convert an ill-formed network: external_input contains ", + externalInputNames.size(), + " unused blobs: ", + os.str()); + } for (const auto& outputName : net.external_output()) { CAFFE_ENFORCE( - blobMap.count(outputName), "NetDef has ill-formed external_output"); + blobMap.count(outputName), + "NetDef has ill-formed external_output: \"", + outputName, + "\""); module.outputs.insert(blobMap[outputName]); } diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 2437933ae624eb..1a579b519fe09c 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -391,3 +391,17 @@ def test_transformer_FuseConv3DBN( rtol=1e-02, atol=1e-04 ) + + def test_converterEnforceUnusedInputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_input.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter + + def test_converterEnforceUnusedOutputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_output.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter From 2cc98d8df7365aa26eca555028035aef20da3088 Mon Sep 17 00:00:00 2001 From: pbialecki Date: Wed, 29 Aug 2018 16:24:16 -0700 Subject: [PATCH 30/42] Adds `dim` argument to `torch.unique` (#10423) Summary: Initial version of `unique` supporting a `dim` argument. As discussed in [this issue](https://github.com/pytorch/pytorch/issues/9997) I added the `dim` argument to `torch.unique` with the same behavior like [numpy](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.unique.html). Since the implementation is based on `std/thrust::unique`, the `tensor` always needs to be sorted. The `sorted` argument in `torch.unique` does not have any function, as in the CUDA version of the plain `torch.unique`. To check the performance and equal behavior between `torch.unique` and `np.unique`, I've used [this gist](https://gist.github.com/ptrblck/ac0dc862f4e1766f0e1036c252cdb105). Currently we achieve the following timings for an input of `x = torch.randint(2, (1000, 1000))`: (The values are calculated by taking the average of the times for both dimension) | Device | PyTorch (return_inverse=False) | Numpy (return_inverse=False) | PyTorch (return_inverse=True) | Numpy (return_inverse=True) | | --- | --- | --- | --- | --- | | CPU | ~0.007331s | ~0.022452s | ~0.011139s | ~0.044800s | | GPU | ~0.006154s | - | ~0.105373s | - | Many thanks to colesbury for the awesome mentoring and the valuable advices on the general implementation and performance issues! 
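
For illustration, a minimal usage sketch of the new keyword (the values in the comments are what this row-wise case produces; as noted above, the `dim` path always sorts):

```python
import torch

x = torch.tensor([[1, 3],
                  [2, 3],
                  [1, 3]])

# Deduplicate whole rows rather than individual elements.
rows = torch.unique(x, dim=0)
# tensor([[1, 3],
#         [2, 3]])

rows, inverse = torch.unique(x, return_inverse=True, dim=0)
# `inverse` maps each original row to its row in `rows`: tensor([0, 1, 0])
```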
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10423 Differential Revision: D9517289 Pulled By: soumith fbshipit-source-id: a4754f805223589c2847c98b8e4e39d8c3ddb7b5 --- aten/src/ATen/native/Unique.cpp | 84 +++++++++++++++++++ aten/src/ATen/native/cuda/Unique.cu | 97 ++++++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 5 ++ test/test_torch.py | 61 ++++++++++++++ torch/functional.py | 20 +++-- torch/tensor.py | 15 +++- 6 files changed, 273 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index d9bd94e1f7810b..d5ff300c0dd9e2 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -47,6 +47,82 @@ std::tuple _unique_cpu_template( } return std::make_tuple(output, inverse_indices); } + +template +ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last, + std::vector& indices, Tensor inverse_indices_vec) { + if (first == last) { + return last; + } + // save to calculate distance to iterators + ForwardIt begin = first; + + // set first inverse index + inverse_indices_vec[indices[0]] = 0; + + ForwardIt result = first; + while (++first != last) { + if (!at::equal(*result, *first) && ++result != first) { + *result = std::move(*first); + } + int64_t idx_result = std::distance(begin, result); + int64_t idx_first = std::distance(begin, first); + inverse_indices_vec[indices[idx_first]] = idx_result; + } + + return ++result; + } + +template +std::tuple _unique_dim_cpu_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + // reshape tensor as [dim, -1] + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + std::vector indices(input_flat.size(0)); + std::iota(indices.begin(), indices.end(), 0); + int64_t numel = input_flat.size(1); + scalar_t* input_flat_ptr = ((scalar_t*)input_flat.data_ptr()); + + // sort indices using data + std::sort(indices.begin(), indices.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = at::empty(input_flat.sizes(), input_flat.type()); + for (int i = 0; i < indices.size(); ++i) { + input_sorted[i] = input_flat[indices[i]]; + } + + Tensor inverse_indices = at::empty(indices.size(), self.type().toScalarType(kLong)); + std::vector input_unbind = at::unbind(input_sorted, 0); + auto last = _unique_dim_cpu_impl( + input_unbind.begin(), input_unbind.end(), indices, inverse_indices); + input_unbind.erase(last, input_unbind.end()); + + // reshape back + auto output = at::stack(input_unbind, 0); + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + return std::make_tuple(output, inverse_indices); +} } // namespace std::tuple @@ -56,5 +132,13 @@ _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { }); } +std::tuple +_unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + // The current implementation using `dim` always sorts due to unhashable tensors + return _unique_dim_cpu_template(self, dim, return_inverse); + }); +} + } // namespace native } 
// namespace at diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index f2e13b4c708b62..c29337f90f1347 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -69,6 +69,92 @@ template return std::tuple(output, inverse_indices); } + +template + std::tuple _unique_dim_cuda_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + scalar_t* input_flat_ptr = input_flat.data(); + + Tensor indices = at::arange(0, input_flat.size(0), self.type().toScalarType(kLong)); + int64_t* indices_ptr = indices.data(); + int64_t numel = input_flat.size(1); + + // sort indices using data + thrust::sort(policy, indices_ptr, indices_ptr + indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = input_flat.index_select(0, indices); + + // get unique tensors + scalar_t* input_sorted_ptr = input_sorted.data(); + Tensor input_sorted_indices = at::arange(0, input_sorted.size(0), self.type().toScalarType(kLong)); + int64_t* input_sorted_indices_ptr = input_sorted_indices.data(); + auto last = thrust::unique(policy, input_sorted_indices_ptr, input_sorted_indices_ptr + input_sorted_indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_sorted_ptr[i + a * numel]; + scalar_t rhs = input_sorted_ptr[i + b * numel]; + if (lhs != rhs) { + return false; + } + } + return true; + }); + input_sorted_indices.resize_(last - input_sorted_indices_ptr); + Tensor output = input_sorted.index_select(0, input_sorted_indices); + + // reshape back + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + // calculate inverse indices + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + if (return_inverse) { + int64_t size = self.size(dim); + inverse_indices.resize_(size); + Tensor mask = at::empty(input_sorted.size(0), self.type().toScalarType(kLong)); + mask[0] = 1; + for (int i = 0; i < input_sorted.size(0) - 1; ++i) { + if (!at::equal(input_sorted[i], input_sorted[i+1])) { + mask[i+1] = 1; + } else { + mask[i+1] = 0; + } + } + + Tensor imask = at::cumsum(mask, 0) - 1; + for (int i = 0; i < indices.size(0); ++i) { + inverse_indices[indices[i]] = imask[i]; + } + } + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, inverse_indices); + } } // namespace #endif @@ -86,5 +172,16 @@ _unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { #endif } +std::tuple +_unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + #ifndef __HIP_PLATFORM_HCC__ + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + return _unique_dim_cuda_template(self, dim, return_inverse); + }); + #else + AT_ERROR("unique_dim_cuda: HIP not supported"); + #endif +} + } // 
namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 466fe6c3134e84..cb194cd0c7bdee 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1748,6 +1748,11 @@ CPU: _unique_cpu CUDA: _unique_cuda +- func: _unique_dim(Tensor self, int64_t dim, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + dispatch: + CPU: _unique_dim_cpu + CUDA: _unique_dim_cuda + - func: _unsafe_view(Tensor self, IntList size) -> Tensor variants: function diff --git a/test/test_torch.py b/test/test_torch.py index 167a400ec91473..863f97ff1d20e3 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8485,6 +8485,67 @@ def test_unique(self): self.assertEqual(torch.ByteTensor([7, 42, 128, 133]), byte_unique) self.assertEqual(torch.LongTensor([3, 0, 0, 0, 1, 2]), byte_inverse) + def test_unique_dim(self): + def run_test(dtype=torch.float): + x = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_unique_dim0 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim0 = torch.tensor([0, 0]) + expected_unique_dim1 = torch.tensor([[[0., 1.], + [1., 1.], + [2., 1.]], + [[0., 1.], + [1., 1.], + [2., 1.]]], dtype=dtype) + expected_inverse_dim1 = torch.tensor([1, 0, 2, 0]) + expected_unique_dim2 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim2 = torch.tensor([0, 1]) + + # dim0 + x_unique = torch.unique(x, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + self.assertEqual(expected_inverse_dim0, x_inverse) + + # dim1 + x_unique = torch.unique(x, dim=1) + self.assertEqual(expected_unique_dim1, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=1) + self.assertEqual(expected_unique_dim1, x_unique) + self.assertEqual(expected_inverse_dim1, x_inverse) + + # dim2 + x_unique = torch.unique(x, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + self.assertEqual(expected_inverse_dim2, x_inverse) + + run_test(torch.float) + run_test(torch.double) + run_test(torch.long) + run_test(torch.uint8) + @staticmethod def _test_bincount(self, device): # negative input throws diff --git a/torch/functional.py b/torch/functional.py index 055141b7469a20..8c78b6efe9f80f 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -389,7 +389,7 @@ def isnan(tensor): return tensor != tensor -def unique(input, sorted=False, return_inverse=False): +def unique(input, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the input tensor as a 1-D tensor. 
Arguments: @@ -431,11 +431,19 @@ def unique(input, sorted=False, return_inverse=False): [ 1, 2]]) """ - output, inverse_indices = torch._unique( - input, - sorted=sorted, - return_inverse=return_inverse, - ) + if dim is not None: + output, inverse_indices = torch._unique_dim( + input, + dim, + sorted=sorted, + return_inverse=return_inverse + ) + else: + output, inverse_indices = torch._unique( + input, + sorted=sorted, + return_inverse=return_inverse, + ) if return_inverse: return output, inverse_indices else: diff --git a/torch/tensor.py b/torch/tensor.py index ed2f7f0c10a565..904d3a5eeb3760 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -319,13 +319,22 @@ def masked_fill(self, mask, value): """ return self.clone().masked_fill_(mask, value) - def unique(self, sorted=False, return_inverse=False): + def unique(self, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the tensor as a 1-D tensor. See :func:`torch.unique` """ - output, inverse_indices = self._unique( - sorted=sorted, return_inverse=return_inverse) + if dim is not None: + output, inverse_indices = self._unique_dim( + sorted=sorted, + return_inverse=return_inverse, + dim=dim + ) + else: + output, inverse_indices = self._unique( + sorted=sorted, + return_inverse=return_inverse + ) if return_inverse: return output, inverse_indices else: From c4e1adf29d0b22fa5ff0ea2206a22f4d035c36cb Mon Sep 17 00:00:00 2001 From: Roy Li Date: Wed, 29 Aug 2018 16:26:51 -0700 Subject: [PATCH 31/42] Remove THHalf type Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11010 Reviewed By: ezyang Differential Revision: D9561325 Pulled By: li-roy fbshipit-source-id: 053cf2925ec1fc458db31e92bd31ffd23389f3e8 --- aten/src/ATen/StorageImpl.h | 3 +-- aten/src/ATen/gen.py | 12 ---------- aten/src/TH/CMakeLists.txt | 1 - aten/src/TH/THHalf.h | 36 ++++++++---------------------- aten/src/TH/THStorageFunctions.hpp | 1 - aten/src/TH/THTypeConversion.hpp | 24 -------------------- aten/src/TH/generic/THStorage.cpp | 10 ++++----- 7 files changed, 15 insertions(+), 72 deletions(-) delete mode 100644 aten/src/TH/THTypeConversion.hpp diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index f484cadbdac973..68c5012777edd7 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -3,7 +3,6 @@ #include #include #include -#include #include @@ -44,7 +43,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { auto data_type_T = - at::scalarTypeToDataType(at::CTypeToScalarType>::to()); + at::scalarTypeToDataType(at::CTypeToScalarType::to()); if (dtype() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index bb6d71f54c2d1a..f7a4deb58dc941 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -283,19 +283,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['to_th_type'] = 'HalfFix<__half,Half>' - env['to_at_type'] = 'HalfFix' env['AS_REAL'] = 'convert' - env['THScalarType'] = 'half' - else: - env['to_th_type'] = 'HalfFix' - env['to_at_type'] = 'HalfFix' - elif scalar_name == 'Long': - env['to_th_type'] = 'long' - env['to_at_type'] = 'int64_t' - else: - env['to_th_type'] = '' - env['to_at_type'] = '' declarations, definitions = function_wrapper.create_derived( env, declarations) diff --git a/aten/src/TH/CMakeLists.txt 
b/aten/src/TH/CMakeLists.txt index ab9f5343eddad9..9fe22beb0dc54e 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -102,7 +102,6 @@ INSTALL(FILES THTensor.hpp THStorageFunctions.hpp THGenerator.hpp - THTypeConversion.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") INSTALL(FILES diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 5ff85eb2c8f40b..fb68639ec44752 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -2,40 +2,22 @@ #define TH_HALF_H #include -#include -/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ -#if defined(__GNUC__) -#define __thalign__(n) __attribute__((aligned(n))) -#elif defined(_WIN32) -#define __thalign__(n) __declspec(align(n)) -#else -#define __thalign__(n) +#ifdef __cplusplus +#include #endif -typedef struct __thalign__(2){ - unsigned short x; -} __THHalf; - -typedef struct __thalign__(4) { - unsigned int x; -} __THHalf2; - -typedef __THHalf THHalf; -typedef __THHalf2 THHalf2; +#ifdef __cplusplus +#define THHalf at::Half +#else +typedef struct at_Half at_Half; +#define THHalf at_Half +#endif TH_API void TH_float2halfbits(float*, unsigned short*); TH_API void TH_halfbits2float(unsigned short*, float*); TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - -#ifndef TH_HALF_BITS_TO_LITERAL -# define TH_HALF_BITS_TO_LITERAL(n) { n } -#endif - -#define TH_HALF_ZERO 0x0U -#define TH_HALF_INF 0x7C00U +TH_API float TH_half2float(THHalf); -#undef __thalign__ #endif diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 9fe0db5e5497f9..362fa6e2c83de5 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -8,7 +8,6 @@ #include #include -#include "THTypeConversion.hpp" #include // Note [Weak references for intrusive refcounting] diff --git a/aten/src/TH/THTypeConversion.hpp b/aten/src/TH/THTypeConversion.hpp deleted file mode 100644 index d40169e7180e58..00000000000000 --- a/aten/src/TH/THTypeConversion.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include -#include "THHalf.h" - -// Type traits to convert types to TH-specific types. Used primarily to -// convert at::Half to TH's half type. This makes the conversion explicit. 
-// FIXME: we should just use the same type - -namespace th { - -template -struct FromTypeConversion { - using type = T; -}; - -template <> -struct FromTypeConversion { - using type = at::Half; -}; - -template -using from_type = typename FromTypeConversion::type; -} diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 21431ef778d5a0..384ce9c632e22b 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType>::to()); + return THStorage_new(at::CTypeToScalarType::to()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, allocator, true).release(); @@ -48,7 +48,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType>::to(); + auto scalar_type = at::CTypeToScalarType::to(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( at::scalarTypeToDataType(scalar_type), @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, std::move(data), allocator, From ae635b16f76ee4e0abf5f4848cca810bfaf17ba9 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Wed, 29 Aug 2018 17:08:31 -0700 Subject: [PATCH 32/42] Record tensor factory functions in trace (#10935) Summary: Things like torch.zeros now appear in traces rather than constants. To continue to support our current level of ONNX export, we run constant prop to turn these back into constants where possible before export. 
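
A small sketch of the observable change, using the decorator form of `torch.jit.trace` (the graph comment is paraphrased; where shapes allow it, constant prop still folds the factory call back into a constant before ONNX export):

```python
import torch

@torch.jit.trace(torch.rand(3, 4))
def traced_fn(x):
    # torch.zeros is now recorded as an aten::zeros node in the trace
    # instead of being baked in as a tensor constant.
    return x.mm(torch.zeros(4, 3)) + 1

print(traced_fn.graph)  # shows aten::zeros rather than a prim::Constant tensor
```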
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10935 Differential Revision: D9527427 Pulled By: zdevito fbshipit-source-id: 552a8bcc01b911251dab7d7026faafdd7a3c758a --- ...it.test_constant_prop_loop_constant.expect | 20 ++++++ ...test_call_traced_mod_from_script_fn.expect | 18 +++-- ...cript.test_onnx_export_speculate-f1.expect | 23 +++--- ...cript.test_onnx_export_speculate-f2.expect | 25 +++---- test/test_jit.py | 28 ++++++-- tools/autograd/gen_variable_factories.py | 13 +++- tools/autograd/gen_variable_type.py | 71 ++++++++++--------- tools/autograd/templates/variable_factories.h | 2 +- tools/jit/gen_jit_dispatch.py | 3 +- torch/csrc/jit/constants.cpp | 4 +- .../csrc/jit/passes/constant_propagation.cpp | 24 ++++--- torch/csrc/jit/python_ir.cpp | 21 +++++- torch/csrc/jit/tracer.cpp | 10 +++ torch/csrc/jit/tracer.h | 21 +++--- torch/csrc/jit/type.cpp | 2 + torch/jit/__init__.py | 8 +-- torch/onnx/utils.py | 35 +++++---- 17 files changed, 218 insertions(+), 110 deletions(-) create mode 100644 test/expect/TestJit.test_constant_prop_loop_constant.expect diff --git a/test/expect/TestJit.test_constant_prop_loop_constant.expect b/test/expect/TestJit.test_constant_prop_loop_constant.expect new file mode 100644 index 00000000000000..5bdca2f2c47890 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_loop_constant.expect @@ -0,0 +1,20 @@ +graph() { + %b.1 : int = prim::Constant[value=0]() + %1 : int = prim::Constant[value=2147483647]() + %2 : int = prim::Constant[value=1]() + %b.3 : int = prim::Loop(%1, %2, %b.1) + block0(%4 : int, %5 : int) { + %b.2 : int = prim::Constant[value=1]() + %7 : int = prim::Constant[value=1]() + -> (%7, %b.2) + } + %8 : int = prim::Constant[value=2147483647]() + %9 : int = prim::Constant[value=0]() + %b : int = prim::Loop(%8, %9, %b.3) + block0(%11 : int, %12 : int) { + %b.4 : int = prim::Constant[value=2]() + %14 : int = prim::Constant[value=0]() + -> (%14, %b.4) + } + return (%b); +} diff --git a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect index 6a9a3a571967a2..078091d52268e2 100644 --- a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect @@ -1,8 +1,14 @@ graph(%x : Dynamic) { - %1 : Double(4, 3) = prim::Constant[value=]() - %2 : Double(3, 3) = aten::mm(%x, %1) - %3 : int = prim::Constant[value=1]() - %4 : int = prim::Constant[value=1]() - %5 : Dynamic = aten::add(%2, %3, %4) - return (%5); + %1 : int = prim::Constant[value=4]() + %2 : int = prim::Constant[value=3]() + %3 : int[] = prim::ListConstruct(%1, %2) + %4 : int = prim::Constant[value=7]() + %5 : int = prim::Constant[value=0]() + %6 : int[] = prim::Constant[value=[0, -1]]() + %7 : Double(4, 3) = aten::zeros(%3, %4, %5, %6) + %8 : Double(3, 3) = aten::mm(%x, %7) + %9 : int = prim::Constant[value=1]() + %10 : int = prim::Constant[value=1]() + %11 : Dynamic = aten::add(%8, %9, %10) + return (%11); } diff --git a/test/expect/TestScript.test_onnx_export_speculate-f1.expect b/test/expect/TestScript.test_onnx_export_speculate-f1.expect index 47f55eb41ccdaa..4e8e51552ea4ac 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f1.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f1.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10}] - outputs: [{name: "6", type:Tensor dims: 10 1}] + outputs: [{name: "8", type:Tensor dims: 10 1}] 
initializers: [] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [1], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Transpose", inputs: [1], outputs: [3], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "Transpose", inputs: [1], outputs: [4], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "ReduceSum", inputs: [1], outputs: [2], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [2,3], outputs: [4], attributes: []}, Node {type: "Transpose", inputs: [1], outputs: [5], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "If", inputs: [2], outputs: [6], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "Transpose", inputs: [1], outputs: [6], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "Transpose", inputs: [1], outputs: [7], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "If", inputs: [4], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "8", type:Tensor dims: }] + outputs: [{name: "9", type:Tensor dims: }] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [7], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [7], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [4], outputs: [9], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "3", type:Tensor dims: }] + outputs: [{name: "5", type:Tensor dims: }] initializers: [] nodes: [ @@ -37,7 +38,7 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "4", type:Tensor dims: }] + outputs: [{name: "6", type:Tensor dims: }] initializers: [] nodes: [ @@ -52,7 +53,7 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "5", type:Tensor dims: }] + outputs: [{name: "7", type:Tensor dims: }] initializers: [] nodes: [ diff --git a/test/expect/TestScript.test_onnx_export_speculate-f2.expect b/test/expect/TestScript.test_onnx_export_speculate-f2.expect index e7d04f54309b05..2820ce5f639ecb 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f2.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f2.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10},{name: "1", type:Tensor dims: 20 10},{name: "2", type:Tensor dims: 20}] - outputs: [{name: "5", type:Tensor dims: 1 20}] + outputs: [{name: "7", type:Tensor dims: 1 20}] initializers: [TensorProto shape: [20 10],TensorProto shape: [20]] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [3], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [4], outputs: [5], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "ReduceSum", inputs: [3], outputs: [4], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [5], attributes: [{ name: 'value', type: tensor, 
value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [4,5], outputs: [6], attributes: []}, + Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "7", type:Tensor dims: 1 20}] + outputs: [{name: "8", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [6], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "8", type:Tensor dims: 1 20}] + outputs: [{name: "9", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [8], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -34,10 +35,10 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "9", type:Tensor dims: 1 20}] + outputs: [{name: "10", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -49,10 +50,10 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "10", type:Tensor dims: 1 20}] + outputs: [{name: "11", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [11], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } diff --git a/test/test_jit.py b/test/test_jit.py index e99203333dc386..a3e3ebed0e5af5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -975,6 +975,24 @@ def fn(x, y): self.assertExpectedGraph(traced_fn.graph) self.assertExportImport(traced_fn.graph, (x, y)) + def test_trace_tensor_factory(self): + def run(**kwargs): + inputs_require_grads = kwargs.pop('inputs_require_grads', True) + + def fn(x): + return x + torch.ones(2, 3, **kwargs) + input = torch.ones(2, 3, **kwargs) + self.checkTrace(fn, (input,), inputs_require_grads=inputs_require_grads) + # check we recorded 'ones' and did not just record a constant + tfn = torch.jit.trace(input)(fn) + self.assertTrue("ones" in str(tfn.graph)) + run() + run(dtype=torch.int, inputs_require_grads=False) + if RUN_CUDA: + run(device="cuda:0") + if RUN_CUDA_MULTI_GPU: + run(device="cuda:1") + # TODO: implement @unittest.expectedFailure def test_output_unflatten(self): @@ -1403,8 +1421,6 @@ def constant_prop(a, b): self.run_pass('constant_propagation', constant_prop.graph) self.assertExpected(canonical(constant_prop.graph)) - # TODO: implement - 
@unittest.expectedFailure def test_constant_prop_loop_constant(self): @torch.jit.script def constant_prop(): @@ -4716,8 +4732,12 @@ def __init__(self, m): @torch.jit.script_method def forward(self, x): x += x - if True: - if True: + # because we are testing if we emit `if` statement correctly + # we cannot use `True` as the condition. Constant prop + # would remove the `if` statements. + c = sum(x) > 4 + if c: + if c: y = self.m(x) else: y = self.m(x) diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index c963650933cf25..ac3e8782eb355d 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -5,11 +5,16 @@ import re from .utils import CodeTemplate, write +from .gen_variable_type import format_trace + FUNCTION_TEMPLATE = CodeTemplate("""\ inline at::Tensor ${name}(${formals}) { + ${pre_record_trace} at::Tensor tensor = at::${name}(${actuals}); - return autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + auto result = autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + ${post_record_trace} + return result; } """) @@ -53,6 +58,10 @@ def process_function(decl, has_tensor_options): requires_grad = "options.requires_grad()" if has_tensor_options else "false" if decl['name'].endswith('_like') and not has_tensor_options: actuals.append('at::TensorOptions({}, /*discard_runtime_type=*/true)'.format(actuals[0])) + + pre_record_trace, post_record_trace = format_trace(decl) + return FUNCTION_TEMPLATE.substitute( - name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad + name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad, + pre_record_trace=pre_record_trace, post_record_trace=post_record_trace ) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 0fe32115da314e..caa6744bb38542 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -141,7 +141,7 @@ POST_RECORD_TRACE = CodeTemplate("""\ if (jit::tracer::isTracing()) { - jit::tracer::postRecordTrace(node, ArrayRef(${trace_outputs}) ); + jit::tracer::postRecordTrace(node, at::ArrayRef(${trace_outputs}) ); } """) @@ -183,6 +183,41 @@ def should_trace(declaration): return True +def get_trace_outputs(declaration): + if declaration['return_type'] == 'std::vector': + return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) + elif declaration['name'].endswith('_out'): + output_args = [arg['name'] for arg in declaration['arguments'] + if arg.get('output', False)] + return '{' + ', '.join(output_args) + '}' + trace_outs = [r['name'] for r in declaration['returns']] + if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): + return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) + else: + return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) + + +def format_trace(declaration): + local = {} + + add_trace_inputs = [] + for argument in declaration['arguments']: + add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) + local['add_trace_inputs'] = '\n'.join(add_trace_inputs) + + # Record inplace operations as out-of-place operations (e.g., + # not add_ but add) + # TODO: Add a proper concept of side effects to the IR, and + # properly record inplace operations. 
+ local['trace_name'] = uninplace_api_name(declaration['api_name']) + if local['trace_name'] in RENAME_TRACE: + local['trace_name'] = RENAME_TRACE[local['trace_name']] + + local['trace_outputs'] = get_trace_outputs(declaration) + + return (PRE_RECORD_TRACE.substitute(local), POST_RECORD_TRACE.substitute(local)) + + def gen_variable_type(out, aten_declarations, template_path): """VariableType.h and VariableType.cpp body @@ -361,42 +396,10 @@ def reference_args(args): res.append(arg['name']) return res - def get_trace_outputs(declaration): - if declaration['return_type'] == 'std::vector': - return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) - elif name.endswith('_out'): - output_args = [arg['name'] for arg in arguments - if arg.get('output', False)] - return '{' + ', '.join(output_args) + '}' - trace_outs = [r['name'] for r in declaration['returns']] - if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): - return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) - else: - return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) - def emit_record_trace(env): if not should_trace(declaration): return ('', '') - - local = {} - - add_trace_inputs = [] - for argument in declaration['arguments']: - add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) - local['add_trace_inputs'] = '\n'.join(add_trace_inputs) - - # Record inplace operations as out-of-place operations (e.g., - # not add_ but add) - # TODO: Add a proper concept of side effects to the IR, and - # properly record inplace operations. - local['trace_name'] = uninplace_api_name(declaration['api_name']) - if local['trace_name'] in RENAME_TRACE: - local['trace_name'] = RENAME_TRACE[local['trace_name']] - - local['trace_outputs'] = get_trace_outputs(declaration) - - combined = nested_dict(local, nested_dict(env, declaration)) - return (PRE_RECORD_TRACE.substitute(combined), POST_RECORD_TRACE.substitute(combined)) + return format_trace(declaration) def declare_returned_variables(): if modifies_arguments: diff --git a/tools/autograd/templates/variable_factories.h b/tools/autograd/templates/variable_factories.h index bc2fa21385777f..bf74abc9138c65 100644 --- a/tools/autograd/templates/variable_factories.h +++ b/tools/autograd/templates/variable_factories.h @@ -3,7 +3,7 @@ // ${generated_comment} #include - +#include #include #include diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index d337143dd8b09e..ff7fce56e91552 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -262,7 +262,8 @@ def declkey(decl): arguments.extend([ # XXX - until we actually have first-class interpreter types for these # concepts, the default values to be encoded in Tensors - + # If you change this, you also need to update [TensorOptions in script] + # in the tracer code. 
# dtype is specified as an int64_t of at::ScalarType {'name': 'dtype', 'simple_type': 'ScalarType', 'default': 'float', 'kwarg_only': True}, # layout is specified as an int64_t of at::Layout diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index f51a735acea1b5..d7876411c687a6 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -13,7 +13,9 @@ Value* insertConstant( Node * n = g.create(prim::Constant); if(val.isTensor()) { at::Tensor ref = std::move(val).toTensor(); - JIT_ASSERT(ref.defined()); + if(!ref.defined()) { + throw constant_not_supported_error("undefined tensors cannot become constants"); + } n->output()->inferTypeFrom(ref); // note: before t_ because of std::move(ref) n->t_(attr::value, std::move(ref)); } else if(val.isInt()) { diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index 6855002d4fd9cb..bfd8ec9b9f1764 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -31,6 +31,10 @@ std::unordered_set skip_list = { aten::randn_like, aten::randperm, aten::randperm_out, + prim::Constant, + prim::Undefined, + // TODO (zach): we should consider skipping tensor factories in the cases + // where the constant tensor would be large but cheap to create. }; std::vector runNode(Node* n) { @@ -40,9 +44,14 @@ std::vector runNode(Node* n) { stack.push_back(*(toIValue(input))); } op(stack); - auto var_outputs = fmap(stack, [&](IValue v) { + auto var_outputs = fmap(stack, [&](IValue v) -> IValue { if (v.isTensor()) { - return IValue(autograd::as_variable_ref(v.toTensor()).data()); + auto t = std::move(v).toTensor(); + if(t.defined()) { + return IValue(autograd::as_variable_ref(t).data()); + } else { + return t; + } } else { return v; } @@ -119,11 +128,11 @@ bool removeExtraNodeOutputs(Node *n) { } // anonymous namespace void ConstantPropagation(Node* n, bool recurse) { - bool constant_inputs = (n->inputs().size() > 0) && - std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - bool supported_node = skip_list.count(n->kind()) == 0; + bool constant_inputs = + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = !n->kind().is_onnx() && skip_list.count(n->kind()) == 0; auto run_blocks = [&]() { if (recurse) { for (Block * block : n->blocks()) { @@ -150,7 +159,6 @@ void ConstantPropagation(Node* n, bool recurse) { } void ConstantPropagation(Block* block, bool recurse) { - ConstantPropagation(block->param_node(), recurse); for(auto it = block->nodes().begin(); it != block->nodes().end();) { Node *n = *it; it++; //advance iterator bc the current node may be destroyed diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index d16d4b00f07e91..d685584a4045be 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -443,14 +443,29 @@ void initPythonIRBindings(PyObject * module_) { switch(t->kind()) { case TypeKind::DynamicType: return "DynamicType"; + case TypeKind::TensorType: + return "TensorType"; + case TypeKind::NumberType: + return "NumberType"; + case TypeKind::NoneType: + return "NoneType"; case TypeKind::CompleteTensorType: return "CompleteTensorType"; case TypeKind::TupleType: return "TupleType"; - default: - AT_ERROR("unknown type kind"); - return ""; + case TypeKind::ListType: + return "ListType"; + case TypeKind::IntType: + 
return "IntType"; + case TypeKind::FloatType: + return "FloatType"; + case TypeKind::StringType: + return "StringType"; + case TypeKind::GeneratorType: + return "GeneratorType"; } + // not reachable, but some compilers complain + AT_ERROR("Unknown Type Kind"); }) .def("sizes",[](Type& t) { return t.expect()->sizes(); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 5bc7bd574cf766..fee8924277d11e 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -48,6 +48,16 @@ void addInputs(Node *n, const char * name, at::TensorList value) { n->addInput(list_node->output()); } +void addInputs(Node* n, const char * name, const at::TensorOptions& options) { + // [TensorOptions in script] - update this when you change how we schematize TensorOptions + detail::genericAddInput(n, static_cast(options.dtype())); + detail::genericAddInput(n, static_cast(options.layout())); + std::vector device = { + static_cast(options.device().type()), + static_cast(options.device().index())}; + detail::genericAddInput(n, std::move(device)); +} + void addInputs(Node *n, const char * name, at::IntList value) { using ArgumentStash = jit::tracer::ArgumentStash; std::vector info = ArgumentStash::hasIntList(name) ? diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 789b3fd2d4591c..b811534ce27401 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -229,16 +229,17 @@ inline void abandon() { // NB: those serve both as an intermediate steps in addInputs below, // as well as the overloads that terminate template recursion -void addInputs(Node *n, const char * name, int64_t value); -void addInputs(Node *n, const char * name, bool value); -void addInputs(Node *n, const char * name, double value); -void addInputs(Node *n, const char * name, const at::Scalar& value); -void addInputs(Node *n, const char * name, const at::Tensor& value); -void addInputs(Node *n, const char * name, at::IntList value); -void addInputs(Node *n, const char * name, at::TensorList value); -void addInputs(Node *n, const char * name, const ArrayRef& value); -void addInputs(Node *n, const char * name, const std::string& value); -void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, int64_t value); +TORCH_API void addInputs(Node *n, const char * name, bool value); +TORCH_API void addInputs(Node *n, const char * name, double value); +TORCH_API void addInputs(Node *n, const char * name, const at::Scalar& value); +TORCH_API void addInputs(Node *n, const char * name, const at::Tensor& value); +TORCH_API void addInputs(Node *n, const char * name, at::IntList value); +TORCH_API void addInputs(Node *n, const char * name, at::TensorList value); +TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& value); +TORCH_API void addInputs(Node *n, const char * name, const std::string& value); +TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index c7e33fae7e20ac..e5a3e64ac067d8 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -51,6 +51,8 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "None"; } else if(t.kind() == TypeKind::StringType) { out << "string"; + } else if(t.kind() == TypeKind::GeneratorType) { + out 
<< "Generator"; } else { AT_ERROR("unknown type kind"); } diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index e0314acea4a173..551a17565e1763 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -996,12 +996,12 @@ def register_all(mod): return _builtin_table -def _register_builtin(callable, op): - _get_builtin_table()[id(callable)] = op +def _register_builtin(fn, op): + _get_builtin_table()[id(fn)] = op -def _find_builtin(callable): - return _get_builtin_table().get(id(callable)) +def _find_builtin(fn): + return _get_builtin_table().get(id(fn)) if not torch._C._jit_init(): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 34c30aea654ed7..b65ea160b5c213 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -19,6 +19,7 @@ from torch.autograd import Function, function from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes +from torch._C import ListType @contextlib.contextmanager @@ -103,24 +104,32 @@ def export(model, args, f, export_params=True, verbose=False, training=False, operator_export_type=operator_export_type) -def _list_constant_prop(g, block): +# ONNX can't handle constants that are lists of tensors, which can +# get generated in constant prop. So we split them back into prim::ListConstructs +def _split_tensor_list_constants(g, block): for node in block.nodes(): for subblock in node.blocks(): - _list_constant_prop(g, subblock) - if node.kind() == "prim::ListConstruct": - input_nodes = [i.node() for i in node.inputs()] - if all(inode.kind() == "prim::Constant" and inode.kindOf("value") == "i" for inode in input_nodes): - input_values = [inode['value'] for inode in input_nodes] - const_node = g.create("prim::Constant") - const_node.insertBefore(node) - const_node.is_("value", input_values) - const_node.output().setType(torch._C.ListType.ofInts()) - node.output().replaceAllUsesWith(const_node.output()) + _split_tensor_list_constants(g, subblock) + if node.kind() == "prim::Constant": + output_type = node.output().type() + if output_type.isSubtypeOf(ListType.ofTensors()): + inputs = [g.create("prim::Constant").t_('value', t) + .insertBefore(node).output() + for t in node['value']] + lc = (g.create("prim::ListConstruct", inputs) + .insertBefore(node) + .output() + .setType(ListType.ofTensors())) + node.output().replaceAllUsesWith(lc) def _optimize_graph(graph, operator_export_type): - _list_constant_prop(graph, graph) - + # we record now record some ops like ones/zeros + # into a trace where we previously recorded constants + # use constant prop to maintain our current level of onnx support + # without implementing symbolics for all of them + torch._C._jit_pass_constant_propagation(graph) + _split_tensor_list_constants(graph, graph) # run dce to eliminate dead parts of the graph that might have been # left behind by things like symbolic_override torch._C._jit_pass_dce(graph) From 91ecbf8b1d3e21feb03b2546cd12e9e456291a1f Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 17:22:55 -0700 Subject: [PATCH 33/42] Remove TensorBase (#11036) Summary: Not subclassed except by Tensor. Also requried to align further with caffe2. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11036 Reviewed By: ezyang Differential Revision: D9565640 Pulled By: cpuhrsch fbshipit-source-id: ff7203a2c95d3f3956282b4f2d8dda6c2b93f4a6 --- aten/src/ATen/TensorBase.h | 53 -------------------------------- aten/src/ATen/templates/Tensor.h | 50 +++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 60 deletions(-) delete mode 100644 aten/src/ATen/TensorBase.h diff --git a/aten/src/ATen/TensorBase.h b/aten/src/ATen/TensorBase.h deleted file mode 100644 index 1bda3ddfa14915..00000000000000 --- a/aten/src/ATen/TensorBase.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include "ATen/TensorImpl.h" -#include "ATen/UndefinedTensor.h" -#include "ATen/core/Error.h" - -namespace at { namespace detail { - -// TensorBase is the base class for Tensor. -// TODO: Eliminate this, once we remove TensorBase from Scalar. At -// the moment it's only used to break an include cycle for Scalar -struct TensorBase { - TensorBase() {} - TensorBase(TensorImpl * tensor_impl, bool retain) : tensor_impl_(c10::intrusive_ptr::reclaim(tensor_impl)) { - if (tensor_impl == nullptr) { - throw std::runtime_error("TensorBaseImpl with nullptr not supported"); - } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } - } - TensorBase(c10::intrusive_ptr&& ptr) : tensor_impl_(std::move(ptr)) {} - TensorBase(const c10::intrusive_ptr& ptr) : tensor_impl_(ptr) {} - - int64_t dim() const { - return tensor_impl_->dim(); - } - - TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); - } - TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); - } - const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; - } - - bool defined() const { - return tensor_impl_; - } - - void reset() { - tensor_impl_.reset(); - } - - friend struct WeakTensor; - -protected: - c10::intrusive_ptr tensor_impl_; -}; - -}} // namespace at::detail diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index f426c6753adc36..4d8bf60522f7db 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -9,9 +9,10 @@ #include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/TensorAccessor.h" -#include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" #include "ATen/core/optional.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/core/Error.h" namespace at { struct Generator; @@ -38,16 +39,48 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. 
-struct AT_API Tensor : public detail::TensorBase { - using TensorBase = detail::TensorBase; - Tensor() : TensorBase() {} - Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {} - Tensor(const c10::intrusive_ptr& ptr) : TensorBase(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) : TensorBase(std::move(ptr)) {} +struct AT_API Tensor { + Tensor(){}; + Tensor(TensorImpl* tensor_impl, bool retain) + : tensor_impl_(c10::intrusive_ptr::reclaim( + tensor_impl)) { + if (tensor_impl == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + if (retain && tensor_impl != UndefinedTensor::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl); + } + } + Tensor(const c10::intrusive_ptr& ptr) + : tensor_impl_(std::move(ptr)) {} + Tensor(c10::intrusive_ptr&& ptr) + : tensor_impl_(ptr) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + // The following overloads are very intruiging. Consider the following // program: // @@ -242,6 +275,9 @@ struct AT_API Tensor : public detail::TensorBase { } friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { From e550eab3e20d58a68e24aaab1902c410f253914e Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Wed, 29 Aug 2018 17:53:35 -0700 Subject: [PATCH 34/42] Remove MetaNetDef test case in Predictor (#11052) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11052 Delete the test case for Predictor with constructing by MetaNetDef since the constructor actually has been deprecated. The broken PR is for construcing predictor from DB instance. 
Reviewed By: highker Differential Revision: D9566935 fbshipit-source-id: 5511883953a2d3f6eb0a4f1c5518a1bc4b3ffbdc --- caffe2/predictor/predictor_test.cc | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc index 40e4f720c61900..326265fc66d039 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -209,33 +209,4 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); } -class PredictorMetaNetDefTest : public testing::Test { - public: - void SetUp() override { - DeviceOption op; - op.set_random_seed(1701); - ctx_ = caffe2::make_unique(op); - p_ = caffe2::make_unique( - makePredictorConfig(parseMetaNetDef(metaSpec))); - } - - std::unique_ptr ctx_; - std::unique_ptr p_; -}; - -TEST_F(PredictorMetaNetDefTest, SimpleMetaNetDefInitializer) { - auto inputData = randomTensor({1, 4}, ctx_.get()); - Predictor::TensorMap input; - auto iter = input.emplace("data", Tensor(CPU)); - auto tensor = inputData->GetMutableTensor(CPU); - iter.first->second.ResizeLike(*tensor); - iter.first->second.ShareData(*tensor); - Predictor::TensorList output; - (*p_)(input, &output); - EXPECT_EQ(output.size(), 1); - EXPECT_EQ(output.front().dims().size(), 2); - EXPECT_EQ(output.front().dim(0), 1); - EXPECT_EQ(output.front().dim(1), 10); - EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); -} } // namespace caffe2 From 394bdcd49a603f4e391abcfcf11b5b34e2868922 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Wed, 29 Aug 2018 17:56:20 -0700 Subject: [PATCH 35/42] Fix the build of aten tests when FULL_CAFFE2=1 Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11019 Reviewed By: orionr Differential Revision: D9562691 Pulled By: houseroad fbshipit-source-id: 95a8dee580e5f4dc9af3a2e1f68ec6c62a0e4e04 --- tools/build_pytorch_libs.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 994a96ad822b41..d1cdb855c9099f 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -281,6 +281,12 @@ function build_caffe2() { # STOP!!! Are you trying to add a C or CXX flag? Add it # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. + + # This is needed by the aten tests built with caffe2 + if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! 
-f "lib/libnccl.so.1" ]; then + cp "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + fi + ${CMAKE_INSTALL} -j"$MAX_JOBS" # Install Python proto files From 16b8e0a787fac2988e8adeba452ab6d02e6dde79 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 29 Aug 2018 20:01:38 -0700 Subject: [PATCH 36/42] at::StorageImpl: Rename size_ to numel_ and elementSize() to itemsize() Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11011 Reviewed By: ezyang Differential Revision: D9561898 Pulled By: cpuhrsch fbshipit-source-id: 0cf5cdc3e7acd397f7e2d66097856aaad0581147 --- aten/src/ATen/Storage.h | 4 +-- aten/src/ATen/StorageImpl.cpp | 10 +++---- aten/src/ATen/StorageImpl.h | 17 +++++------ aten/src/TH/THFile.cpp | 4 +-- aten/src/TH/THMemoryFile.cpp | 40 ++++++++++++------------- aten/src/TH/THStorageFunctions.cpp | 14 ++++----- aten/src/TH/THTensor.cpp | 2 +- aten/src/TH/generic/THStorage.cpp | 8 ++--- aten/src/TH/generic/THStorageCopy.cpp | 18 +++++------ aten/src/THC/THCStorage.cpp | 10 +++---- aten/src/THC/THCTensor.cpp | 2 +- aten/src/THC/generic/THCStorage.cpp | 4 +-- aten/src/THC/generic/THCStorage.cu | 2 +- aten/src/THC/generic/THCStorageCopy.cpp | 16 +++++----- aten/src/THC/generic/THCStorageCopy.cu | 8 ++--- torch/csrc/generic/Storage.cpp | 4 +-- torch/csrc/generic/StorageSharing.cpp | 10 +++---- 17 files changed, 86 insertions(+), 87 deletions(-) diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index 8db0b231bf53f9..d797618b285e6a 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -26,8 +26,8 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->elementSize(); } - ptrdiff_t size() const { return storage_impl_->size(); } + size_t elementSize() const { return storage_impl_->itemsize(); } + ptrdiff_t size() const { return storage_impl_->numel(); } bool resizable() const { return storage_impl_->resizable(); } // get() use here is to get const-correctness void* data() const { return storage_impl_.get()->data(); } diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index bc2d69a7aa8f5d..0ed836b9b3010a 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -4,26 +4,26 @@ namespace at { StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable) : data_type_(data_type), data_ptr_(std::move(data_ptr)), - size_(size), + numel_(numel), resizable_(resizable), allocator_(allocator) {} StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable) : StorageImpl( data_type, - size, + numel, allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * size), + at::elementSize(dataTypeToScalarType(data_type)) * numel), allocator, resizable) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index 68c5012777edd7..35639478df664e 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -23,13 +23,13 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { ~StorageImpl() {}; StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable); StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable); StorageImpl(StorageImpl&) = delete; @@ -65,18 +65,17 @@ struct 
AT_API StorageImpl : public c10::intrusive_ptr_target { void operator=(const StorageImpl&) = delete; - size_t elementSize() const { + size_t itemsize() const { return at::elementSize(dataTypeToScalarType(data_type_)); } Type& type(); - // TODO: Rename to size() and size to size_ - ptrdiff_t size() const { - return size_; + int64_t numel() const { + return numel_; }; - void set_size(ptrdiff_t size) { - size_ = size; + void set_numel(int64_t numel) { + numel_ = numel; }; bool resizable() const { return resizable_; @@ -127,7 +126,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { private: at::DataType data_type_; at::DataPtr data_ptr_; - ptrdiff_t size_; + int64_t numel_; bool resizable_; at::Allocator* allocator_; }; diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp index c8924b54f4bf70..4a2cb18b92e07e 100644 --- a/aten/src/TH/THFile.cpp +++ b/aten/src/TH/THFile.cpp @@ -140,12 +140,12 @@ IMPLEMENT_THFILE_SCALAR(Half, THHalf) #define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } \ \ size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } IMPLEMENT_THFILE_STORAGE(Byte, uint8_t) diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 011c1d1f54aaee..3f2187b68f74ea 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -56,7 +56,7 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) return; else { - if(size < self->storage->size()) /* note the "<" and not "<=" */ + if(size < self->storage->numel()) /* note the "<" and not "<=" */ { self->size = size; THCharStorage_data(self->storage)[self->size] = '\0'; @@ -64,10 +64,10 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) } } - missingSpace = size-self->storage->size()+1; /* +1 for the '\0' */ - THCharStorage_resize(self->storage, (self->storage->size()/2 > missingSpace ? - self->storage->size() + (self->storage->size()/2) - : self->storage->size() + missingSpace)); + missingSpace = size-self->storage->numel()+1; /* +1 for the '\0' */ + THCharStorage_resize(self->storage, (self->storage->numel()/2 > missingSpace ? 
+ self->storage->numel() + (self->storage->numel()/2) + : self->storage->numel() + missingSpace)); } static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) @@ -188,12 +188,12 @@ static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) while (1) \ { \ ASCII_WRITE_ELEM; \ - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) \ + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) \ { \ mfself->position += nByteWritten; \ break; \ } \ - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); \ + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); \ } \ if(mfself->file.isAutoSpacing) \ { \ @@ -297,7 +297,7 @@ static void THMemoryFile_free(THFile *self) /* READ_WRITE_METHODS(bool, Bool, */ /* int value = 0; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */ -/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", value), */ +/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", value), */ /* 1) */ READ_WRITE_METHODS(uint8_t, Byte, @@ -307,7 +307,7 @@ READ_WRITE_METHODS(uint8_t, Byte, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -322,7 +322,7 @@ READ_WRITE_METHODS(int8_t, Char, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? 
n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -330,29 +330,29 @@ READ_WRITE_METHODS(int8_t, Char, READ_WRITE_METHODS(int16_t, Short, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%hd", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%hd", data[i]), 1) READ_WRITE_METHODS(int32_t, Int, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", data[i]), 1) READ_WRITE_METHODS(float, Float, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) READ_WRITE_METHODS(THHalf, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", TH_half2float(data[i])), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), 1) READ_WRITE_METHODS(double, Double, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.17g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.17g", data[i]), 1) static ssize_t THMemoryFile_readLong(THFile *self, int64_t *data, ssize_t n) @@ -491,13 +491,13 @@ static ssize_t THMemoryFile_writeLong(THFile *self, int64_t *data, ssize_t n) ssize_t nByteWritten; while (1) { - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%" PRId64, data[i]); - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%" PRId64, data[i]); + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) { 
mfself->position += nByteWritten; break; } - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); } if(mfself->file.isAutoSpacing) { @@ -654,7 +654,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) if(storage) { - THArgCheck(THCharStorage_data(storage)[storage->size()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); + THArgCheck(THCharStorage_data(storage)[storage->numel()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); THCharStorage_retain(storage); } @@ -668,7 +668,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) mfself = static_cast(THAlloc(sizeof(THMemoryFile))); mfself->storage = storage; - mfself->size = (storage ? storage->size()-1 : 0); + mfself->size = (storage ? storage->numel()-1 : 0); mfself->position = 0; mfself->longSize = 0; diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index b0e4abe9329db7..a5319e67dabe61 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -34,7 +34,7 @@ void THStorage_free(THStorage* storage) { ptrdiff_t THStorage_size(const THStorage *self) { - return self->size(); + return self->numel(); } void THStorage_retain(THStorage *storage) @@ -49,21 +49,21 @@ void THStorage_resize(THStorage* storage, ptrdiff_t size) { /* case when the allocator does not have a realloc defined */ at::DataPtr new_data; if (size != 0) { - new_data = storage->allocator()->allocate(storage->elementSize() * size); + new_data = storage->allocator()->allocate(storage->itemsize() * size); } at::DataPtr old_data = storage->set_data_ptr(std::move(new_data)); - ptrdiff_t old_size = storage->size(); - storage->set_size(size); + ptrdiff_t old_size = storage->numel(); + storage->set_numel(size); if (old_data != nullptr) { ptrdiff_t copy_size = old_size; - if (storage->size() < copy_size) { - copy_size = storage->size(); + if (storage->numel() < copy_size) { + copy_size = storage->numel(); } if (copy_size > 0) { memcpy( storage->data(), old_data.get(), - storage->elementSize() * copy_size); + storage->itemsize() * copy_size); } } } else { diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 1b1f493ac4e289..0c731779b95685 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -125,7 +125,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(!THTensor_getStoragePtr(self)) { THTensor_stealAndSetStoragePtr(self, THStorage_new(self->scalar_type())); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 384ce9c632e22b..992cbd5bb7509f 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -59,7 +59,7 @@ THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int false).release(); if (size <= 0) { - storage->set_size(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / at::elementSize(scalar_type)); } return storage; @@ -132,19 +132,19 @@ void THStorage_(resize)(THStorage 
*storage, ptrdiff_t size) void THStorage_(fill)(THStorage *storage, real value) { ptrdiff_t i; - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) THStorage_(data)(storage)[i] = value; } void THStorage_(set)(THStorage *self, ptrdiff_t idx, real value) { - THArgCheck((idx >= 0) && (idx < self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); THStorage_(data)(self)[idx] = value; } real THStorage_(get)(const THStorage *self, ptrdiff_t idx) { - THArgCheck((idx >= 0) && (idx < self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); return THStorage_(data)(self)[idx]; } diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 0cde162d4c2843..442f7dbde2925d 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -6,13 +6,13 @@ void THStorage_(rawCopy)(THStorage *storage, real *src) { ptrdiff_t i; real *data = THStorage_(data)(storage); - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) data[i] = src[i]; } void THStorage_(copy)(THStorage *storage, THStorage *src) { - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); THStorage_(rawCopy)(storage, THStorage_(data)(src)); } @@ -25,40 +25,40 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = (real)TH_half2float(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = TH_float2half((float)(src_data[i])); \ } #define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index a9a1790c58c830..96e3938e20b0f9 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -20,17 +20,17 @@ void THCStorage_resize(THCState 
*state, THCStorage *self, ptrdiff_t size) if (!self->resizable()) THError("Trying to resize storage that is not resizable"); - size_t elementSize = self->elementSize(); + size_t itemsize = self->itemsize(); if(size == 0) { self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::DeviceType::CUDA, device))); - self->set_size(0); + self->set_numel(0); } else { at::DataPtr data = - self->allocator()->allocate(size * elementSize); + self->allocator()->allocate(size * itemsize); if (self->data_ptr()) { // Enable p2p access when the memcpy is across devices @@ -38,14 +38,14 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) THCudaCheck(cudaMemcpyAsync(data.get(), self->data(), - THMin(self->size(), size) * elementSize, + THMin(self->numel(), size) * itemsize, cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // Destructively overwrite data_ptr self->set_data_ptr(std::move(data)); - self->set_size(size); + self->set_numel(size); } } diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 3826ea57fc5da3..de787bd380b6e6 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -148,7 +148,7 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, const if(!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THCStorage_resize(state, THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index aef30d62517061..feb2e94959abf2 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -21,7 +21,7 @@ int THCStorage_(elementSize)(THCState *state) void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(real), cudaMemcpyHostToDevice, @@ -31,7 +31,7 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real v real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); real value; cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(real), diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index a6b3bf557e2f63..95f2bc7163d46f 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -10,7 +10,7 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) #if CUDA_VERSION >= 7000 thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), #endif - self_data, self_data+self->size(), value); + self_data, self_data+self->numel(), value); } void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) diff --git a/aten/src/THC/generic/THCStorageCopy.cpp b/aten/src/THC/generic/THCStorageCopy.cpp index 9194ab7d3c80d4..546777baaf98c7 100644 --- a/aten/src/THC/generic/THCStorageCopy.cpp +++ 
b/aten/src/THC/generic/THCStorageCopy.cpp @@ -4,11 +4,11 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), THStorage_(data)(src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyHostToDevice, stream)); THCudaCheck(cudaStreamSynchronize(stream)); @@ -18,9 +18,9 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *s void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ { \ THCTensor* selfTensor = \ - THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct TH##TYPEC##Tensor* srcTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->numel(), 1); \ THCTensor_(copy##TYPEC)(state, selfTensor, srcTensor); \ TH##TYPEC##Tensor_free(srcTensor); \ THCTensor_(free)(state, selfTensor); \ @@ -36,11 +36,11 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self), THCStorage_(data)(state, src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyDeviceToHost, stream)); THCudaCheck(cudaStreamSynchronize(stream)); @@ -50,9 +50,9 @@ void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *s void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ { \ TH##TYPEC##Tensor* selfTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->numel(), 1); \ struct THCTensor* srcTensor = \ - THCTensor_(newWithStorage1d)(state, src, 0, src->size(), 1); \ + THCTensor_(newWithStorage1d)(state, src, 0, src->numel(), 1); \ TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(state, selfTensor, srcTensor); \ THCTensor_(free)(state, srcTensor); \ TH##TYPEC##Tensor_free(selfTensor); \ diff --git a/aten/src/THC/generic/THCStorageCopy.cu b/aten/src/THC/generic/THCStorageCopy.cu index bea4fe699623fb..962167c73b82c8 100644 --- a/aten/src/THC/generic/THCStorageCopy.cu +++ b/aten/src/THC/generic/THCStorageCopy.cu @@ -4,17 +4,17 @@ void THCStorage_(rawCopy)(THCState *state, THCStorage *self, real *src) { - THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->size() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->numel() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // conversions are delegated to THCTensor implementation #define THC_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ { \ - THArgCheck(self->size() == src->size(), 2, "size does not match"); \ - THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THArgCheck(self->numel() 
== src->numel(), 2, "size does not match"); \ + THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct THCuda##TYPECUDA##Tensor* srcTensor = \ - THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->size(), 1); \ + THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->numel(), 1); \ THCTensor_(copyCuda##TYPEC)(state, selfTensor, srcTensor); \ THCuda##TYPECUDA##Tensor_free(state, srcTensor); \ THCTensor_(free)(state, selfTensor); \ diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 42f3f583b848e9..d8f33c533b2039 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -151,9 +151,9 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) nindex += THWStorage_(size)(LIBRARY_STATE self->cdata); - if (nindex < 0 || nindex >= self->cdata->size()) { + if (nindex < 0 || nindex >= self->cdata->numel()) { PyErr_Format(PyExc_IndexError, "index %" PRId64 " out of range for storage of " - "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->size()); + "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->numel()); return NULL; } real value = THWStorage_(get)(LIBRARY_STATE self->cdata, nindex); diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 4a7c01b2ca2e82..6b462160c6d0b2 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -79,7 +79,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) } else { // TODO: retry on collision // TODO: free GIL - but remember to reacquire it when an exception is thrown - THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THManagedMapAllocator::fromDataPtr(storage->data_ptr()); @@ -90,7 +90,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) if (!manager_handle) return NULL; THPObjectPtr storage_handle(PyBytes_FromString(ctx->filename())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(3)); @@ -158,7 +158,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) if ((ctx = THMapAllocator::fromDataPtr(storage->data_ptr()))) { // done } else { - THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THMapAllocator::fromDataPtr(storage->data_ptr()); @@ -167,7 +167,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) THPObjectPtr storage_handle(PyLong_FromLong(ctx->fd())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(2)); @@ -220,7 +220,7 @@ static PyObject * THPStorage_(shareCuda)(THPStorage *self) THPObjectPtr device(PyLong_FromLong(storage->device().index())); THPObjectPtr _handle(Py_None); Py_INCREF(Py_None); - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); THPObjectPtr _offset(PyLong_FromLong(0)); if 
(THWStorage_(data)(LIBRARY_STATE storage)) {
    size_t base_size;

From ad1670cf547940ebbaa63818585e61c19f795ce6 Mon Sep 17 00:00:00 2001
From: Shihao Xu
Date: Wed, 29 Aug 2018 20:09:02 -0700
Subject: [PATCH 37/42] Kill the dummy TaskOutput when task.get_step() (#11048)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11048
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10739
I wanted to assert that the blobs in the workspace of the new session after loading a checkpoint are exactly the same as the blobs in the workspace of the old session before saving to a checkpoint. But I found that when calling `task.get_step()`, a dummy task output blob, `task:output/ConstIntFill:0`, is added. A dummy net `task:output` is also added along with it. See https://fburl.com/937lf2yk
This makes it hard to assert "Equal", forcing me to assert "LessThan" or "GreaterThan".

Adding a dummy TaskOutput when the user specifies no TaskOutput is a hack. The reason for this is that a ZMQ socket can't send an empty blob list; as a result, if the Task on the Worker had no output, the master would never stop waiting and would hang forever. See https://fburl.com/rd7fhy6p and imagine `socket.recv(net, 0)`.

TaskOutput is at the user layer. The hack shouldn't be exposed to the user layer, polluting user workspaces. Instead, we should move the creation of the dummy blob to some deeper layer and remove the dummy blob from the workspace afterwards to avoid polluting user workspaces. After this change, the workaround becomes totally transparent and has no side effects for users.

Reviewed By: mraway

Differential Revision: D9566744

fbshipit-source-id: 18292dd64a6d48192c34034200a7c9811d2172af
---
 caffe2/python/checkpoint_test.py |  4 +--
 caffe2/python/core_test.py       |  4 +--
 caffe2/python/task.py            | 46 +++++++++++++------------------
 3 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py
index a91bbf9910e29a..afba3dddcd5aae 100644
--- a/caffe2/python/checkpoint_test.py
+++ b/caffe2/python/checkpoint_test.py
@@ -161,9 +161,9 @@ def test_ckpt_name_and_load_model_from_ckpts(self):
         num_epochs = job_runner.train(session)
         self.assertEquals(num_epochs, len(EXPECTED_TOTALS))
 
-        # There are 17 global blobs after finishing up the job runner.
+        # There are 15 global blobs after finishing up the job runner.
# (only blobs on init_group are checkpointed) - self.assertEquals(len(ws.blobs), 17) + self.assertEquals(len(ws.blobs), 15) ws = workspace.C.Workspace() session = LocalSession(ws) diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 7120843f33152d..d989471a16bab2 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -533,8 +533,8 @@ def test_create_plan_from_proto_correctly(self): self.assertEqual(len(plan.Steps()), 1) self.assertEqual(len(test_plan.Steps()), 1) - self.assertEqual(len(plan.Proto().network), 9) - self.assertEqual(len(test_plan.Proto().network), 9) + self.assertEqual(len(plan.Proto().network), 8) + self.assertEqual(len(test_plan.Proto().network), 8) self.assertEqual(len(plan.Proto().execution_step), 1) self.assertEqual(len(test_plan.Proto().execution_step), 1) self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name()) diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 311211dfdff3ee..9cfe7089332a18 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -150,7 +150,7 @@ def add_setup_steps(step, init_nets, exit_nets, name): if init_nets: steps.append(core.execution_step('%s:init' % name, init_nets)) steps.append(step) - if len(exit_nets) > 0: + if exit_nets: steps.append(core.execution_step('%s:exit' % name, exit_nets)) return core.execution_step(name, steps) @@ -215,10 +215,11 @@ def add(self, task): self._tasks.append(task) def tasks(self): - for task in self._tasks_to_add: - self.add(task) - self._tasks_to_add = [] - self._already_used = True + if not self._already_used: + for task in self._tasks_to_add: + self.add(task) + self._tasks_to_add = [] + self._already_used = True return self._tasks def num_registered_tasks(self): @@ -259,9 +260,8 @@ def tasks_by_node(self, node_remap=None): # tasks_by_node can't be called twice because the setup won't # work properly a second time. node_map = {} - for task in self.tasks(): - node_map[task.node] =\ - node_remap(task.node) if node_remap else task.node + for node in self.used_nodes(): + node_map[node] = node_remap(node) if node_remap else node if self._tasks_by_node is not None: tasks_by_node, prev_node_map = self._tasks_by_node assert prev_node_map == node_map, ( @@ -285,11 +285,7 @@ def tasks_by_node(self, node_remap=None): grouped_by_node = TaskGroup() for node, tasks in viewitems(tasks_by_node): report_steps = report_steps_by_node[node] - node_inits, node_exits = get_setup_nets( - TaskGroup.LOCAL_SETUP, - [t.get_step() for t in tasks] + report_steps, - self) - # shortcut for single task with no queue + steps = report_steps outputs = [] grouped_workspace_type = WorkspaceType.PRIVATE @@ -311,16 +307,15 @@ def tasks_by_node(self, node_remap=None): else: step = core.execution_step( '%s:body' % node, steps, concurrent_substeps=True) - if len(node_inits) > 0 or len(node_exits) > 0: - steps = [] - if len(node_inits) > 0: - steps.append( - core.execution_step('%s:init' % node, node_inits)) - steps.append(step) - if len(node_exits) > 0: - steps.append( - core.execution_step('%s:exit' % node, node_exits)) - step = core.execution_step(node, steps) + + # Prepend and append setup nets. 
+ node_inits, node_exits = get_setup_nets( + TaskGroup.LOCAL_SETUP, + [t.get_step() for t in tasks] + report_steps, + self, + ) + step = add_setup_steps(step, node_inits, node_exits, node) + Task( node=node, step=step, outputs=outputs, name='grouped_by_node', @@ -582,11 +577,6 @@ def get_step(self): Task.TASK_SETUP, [self._step] + report_steps, self) instance_init_nets, instance_exit_nets = get_setup_nets( Task.TASK_INSTANCE_SETUP, [self._step] + report_steps, self) - if len(self._outputs) == 0: - output_net = core.Net('%s:output' % self.name) - self.add_output(output_net.ConstantFill( - [], 1, dtype=core.DataType.INT32, value=0)) - task_exit_nets.append(output_net) # Add instance-level report steps body = self._step if not report_steps else core.execution_step( From 23af7deea7efa1172a54133cba5d13358cdf3cc0 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 29 Aug 2018 22:33:28 -0700 Subject: [PATCH 38/42] Add has_lapack flag (#11024) Summary: Currently our `skipIfLapack` has uses a try-catch block and regex match the error message. It is highly unreliable. This PR adds `hasLAPACK` and `hasMAGMA` on ATen context, and expose the flags to python. Also fixes refcounting bug with `PyModule_AddObject`. The method steals reference, but we didn't `Py_INCREF` in some places before calling it with `Py_True` or `Py_False`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11024 Differential Revision: D9564898 Pulled By: SsnL fbshipit-source-id: f46862ec3558d7e0058ef48991cd9c720cb317e2 --- aten/src/ATen/Context.cpp | 10 ++++++++++ aten/src/ATen/Context.h | 12 ++++++++++++ aten/src/ATen/cuda/detail/CUDAHooks.cpp | 10 +++++++++- aten/src/ATen/cuda/detail/CUDAHooks.h | 1 + aten/src/ATen/detail/CUDAHooksInterface.h | 4 ++++ test/common.py | 8 +++----- torch/csrc/Module.cpp | 22 +++++++++++++++------- torch/csrc/cuda/Module.cpp | 11 +++++------ 8 files changed, 59 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index a2c3fb40a7d415..5b420d87b34fc1 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -11,6 +11,8 @@ #include "ATen/CPUGenerator.h" #include "ATen/RegisterCPU.h" +#include "TH/TH.h" // for USE_LAPACK + #ifdef USE_SSE3 #include #endif @@ -80,6 +82,14 @@ bool Context::hasMKL() const { #endif } +bool Context::hasLAPACK() const { +#ifdef USE_LAPACK + return true; +#else + return false; +#endif +} + bool Context::setFlushDenormal(bool on) { #ifdef USE_SSE3 // Setting flush-to-zero (FTZ) flag diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 5584963fefe57f..bab1fa5dc5d069 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -50,6 +50,10 @@ class AT_API Context { return *generator; } bool hasMKL() const; + bool hasLAPACK() const; + bool hasMAGMA() const { + return detail::getCUDAHooks().hasMAGMA(); + } bool hasCUDA() const { return detail::getCUDAHooks().hasCUDA(); } @@ -158,6 +162,14 @@ static inline bool hasMKL() { return globalContext().hasMKL(); } +static inline bool hasLAPACK() { + return globalContext().hasLAPACK(); +} + +static inline bool hasMAGMA() { + return globalContext().hasMAGMA(); +} + static inline int64_t current_device() { return globalContext().current_device(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 7d73fafc994da5..570a375e3888a3 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -69,7 +69,7 @@ DynamicCUDAInterfaceSetter _; // let's not if we 
don't need to!) std::unique_ptr CUDAHooks::initCUDA() const { THCState* thc_state = THCState_alloc(); - + THCudaInit(thc_state); return std::unique_ptr( thc_state, [](THCState* p) { @@ -92,6 +92,14 @@ bool CUDAHooks::hasCUDA() const { return true; } +bool CUDAHooks::hasMAGMA() const { +#ifdef USE_MAGMA + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 766ab62b8ef79f..491adfc4d73f1a 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -13,6 +13,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { std::unique_ptr initCUDA() const override; std::unique_ptr initCUDAGenerator(Context*) const override; bool hasCUDA() const override; + bool hasMAGMA() const override; bool hasCuDNN() const override; int64_t current_device() const override; Allocator* getPinnedMemoryAllocator() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 6b2e87c4f762af..cccf6dc28453dc 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -65,6 +65,10 @@ struct AT_API CUDAHooksInterface { return false; } + virtual bool hasMAGMA() const { + return false; + } + virtual bool hasCuDNN() const { return false; } diff --git a/test/common.py b/test/common.py index 545ba4f1f0dd22..e7d6940ea56cc1 100644 --- a/test/common.py +++ b/test/common.py @@ -112,12 +112,10 @@ def wrapper(*args, **kwargs): def skipIfNoLapack(fn): @wraps(fn) def wrapper(*args, **kwargs): - try: + if not torch._C.has_lapack: + raise unittest.SkipTest('PyTorch compiled without Lapack') + else: fn(*args, **kwargs) - except Exception as e: - if 'Lapack library not found' in repr(e): - raise unittest.SkipTest('Compiled without Lapack') - raise return wrapper diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index af367c3e544905..e17997e6e9baba 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -584,13 +584,20 @@ static PyObject* initModule() { ASSERT_TRUE(THCPStream_init(module)); #endif + auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { + // PyModule_AddObject steals reference + if (incref) { + Py_INCREF(v); + } + return PyModule_AddObject(module, name, v) == 0; + }; + #ifdef USE_CUDNN PyObject *has_cudnn = Py_True; #else PyObject *has_cudnn = Py_False; #endif - Py_INCREF(has_cudnn); - ASSERT_TRUE(PyModule_AddObject(module, "has_cudnn", has_cudnn) == 0); + ASSERT_TRUE(set_module_attr("has_cudnn", has_cudnn)); #ifdef USE_DISTRIBUTED_MW // See comment on CUDA objects @@ -611,19 +618,20 @@ static PyObject* initModule() { // Set ATen warnings to issue Python warnings at::Warning::set_warning_handler(&warning_handler); - ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("has_mkl", at::hasMKL() ? Py_True : Py_False)); + ASSERT_TRUE(set_module_attr("has_lapack", at::hasLAPACK() ? Py_True : Py_False)); #ifdef _GLIBCXX_USE_CXX11_ABI - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", - _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", _GLIBCXX_USE_CXX11_ABI ? 
Py_True : Py_False)); #else - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", Py_False)); #endif auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); - ASSERT_TRUE(PyModule_AddObject(module, "default_generator", (PyObject*)THPDefaultGenerator) == 0); + // This reference is meant to be given away, so no need to incref here. + ASSERT_TRUE(set_module_attr("default_generator", (PyObject*)THPDefaultGenerator, /* incref= */ false)); #ifdef USE_NUMPY if (_import_array() < 0) return NULL; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index a4fcc6c45e874d..8fd95eda86f121 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -333,16 +333,15 @@ static PyObject * THCPModule_initExtension(PyObject *self) THCPCharStorage_postInit(m); THCPByteStorage_postInit(m); -#ifdef USE_MAGMA - THCMagma_init(state); - bool has_magma = true; -#else - bool has_magma = false; -#endif + bool has_magma = at::hasMAGMA(); + if (has_magma) { + THCMagma_init(state); + } bool has_half = true; auto set_module_attr = [&](const char* name, PyObject* v) { + // PyObject_SetAttrString doesn't steal reference. So no need to incref. if (PyObject_SetAttrString(m, name, v) < 0) { throw python_error(); } From dbc0004f995556b288b43077458b61683ac93855 Mon Sep 17 00:00:00 2001 From: Tullie Murrell Date: Wed, 29 Aug 2018 23:40:15 -0700 Subject: [PATCH 39/42] Remove use_count() == 1 in Tensor::Extend (#11046) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11046 As suggested by jerryzh168, temporary fix for a new constraint that was added D9350686 is to remove this assert. Long term jerryzh168 is going to work out a better way of handling this. 
Reviewed By: jerryzh168 Differential Revision: D9566323 fbshipit-source-id: e4630c7cbe0cc68a084974ea7048654811fae01f --- caffe2/core/tensor.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 21dc126c7f2c0b..f1934f5ddbc28d 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -275,9 +275,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); - CAFFE_ENFORCE( - storage_.use_count() == 1, - "Can't call Extend on shared storage, please call Resize instead"); auto newDims = dims_; newDims[0] += num; if (!storage_->data()) { From a8af7fe46ab1e79066f418b4cc79a5fd56316df5 Mon Sep 17 00:00:00 2001 From: Xingdong Zuo Date: Thu, 30 Aug 2018 08:09:23 -0700 Subject: [PATCH 40/42] Support import of `nn.RNNCellBase` in `__all__` Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10992 Differential Revision: D9572005 Pulled By: soumith fbshipit-source-id: 26b546830b6a25a4f7ba6f825cd888d678233a97 --- torch/nn/modules/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 01ce9bf2ac7cab..f2683756ce59f8 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -43,10 +43,10 @@ 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', 'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', - 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', - 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance', - 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', - 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', + 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', + 'LSTMCell', 'GRUCell', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', + 'PairwiseDistance', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', + 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', 'AdaptiveLogSoftmaxWithLoss', ] From e7195431e0dffa7a065b65286ce414ac68f6bd7d Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Thu, 30 Aug 2018 09:47:16 -0700 Subject: [PATCH 41/42] Add benchmarking functionality to the benchmark app (#10976) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10976 The app can run in XCode with the benchmark metrics collected. 
It can also run when building with buck Reviewed By: llyfacebook Differential Revision: D9546755 fbshipit-source-id: 60ad0112946f8cf57138417f6838a58ed6d2c90f --- binaries/benchmark_helper.cc | 88 +++++++++++++++++++++++++++++++++++- binaries/benchmark_helper.h | 18 ++++++++ binaries/caffe2_benchmark.cc | 55 ++++++---------------- 3 files changed, 118 insertions(+), 43 deletions(-) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 7a441dc1c5c2ee..daf3ccac90eecf 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -14,8 +14,9 @@ * limitations under the License. */ -#include #include +#include +#include #include #include "binaries/benchmark_helper.h" @@ -309,3 +310,88 @@ void writeOutput( } } } + +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache) { + caffe2::GlobalInit(&argc, &argv); + // Check arguments to be correct + { + // Need to check whether file exists, as the file reader does not assert if + // file does not exist + std::ifstream net_file(FLAGS_net); + CAFFE_ENFORCE(net_file.good()); + + std::ifstream init_net_file(FLAGS_init_net); + CAFFE_ENFORCE(init_net_file.good()); + + if (FLAGS_input_file.size() > 0) { + vector input_files = caffe2::split(',', FLAGS_input_file); + for (auto input_file : input_files) { + std::ifstream ifile(input_file); + CAFFE_ENFORCE(ifile.good()); + } + } + } + + observerConfig(); + caffe2::ShowLogInfoToStderr(); + + auto workspace = std::make_shared(new caffe2::Workspace()); + bool run_on_gpu = backendCudaSet(FLAGS_backend); + // Run initialization network. + caffe2::NetDef init_net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); + setOperatorEngine(&init_net_def, FLAGS_backend); + CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); + + // Run main network. 
+ caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); + setOperatorEngine(&net_def, FLAGS_backend); + + map tensor_protos_map; + + loadInput( + workspace, + run_on_gpu, + tensor_protos_map, + FLAGS_input, + FLAGS_input_file, + FLAGS_input_dims, + FLAGS_input_type); + + runNetwork( + workspace, + net_def, + tensor_protos_map, + FLAGS_wipe_cache, + FLAGS_run_individual, + FLAGS_warmup, + FLAGS_iter, + FLAGS_sleep_before_run); + + writeOutput( + workspace, + run_on_gpu, + FLAGS_output, + FLAGS_output_folder, + FLAGS_text_output); + + return 0; +} diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index df23ed8651118a..5bf79182dab7e1 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -98,3 +98,21 @@ void runNetwork( const int, const int, const int); +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc index c5a93ae7cbae33..38badccfa1e4bb 100644 --- a/binaries/caffe2_benchmark.cc +++ b/binaries/caffe2_benchmark.cc @@ -77,51 +77,22 @@ CAFFE2_DEFINE_bool( "Whether to evict the cache before running network."); int main(int argc, char** argv) { - caffe2::GlobalInit(&argc, &argv); - - observerConfig(); - caffe2::ShowLogInfoToStderr(); - - auto workspace = make_shared(new caffe2::Workspace()); - bool run_on_gpu = backendCudaSet(caffe2::FLAGS_backend); - // Run initialization network. - caffe2::NetDef init_net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def)); - setOperatorEngine(&init_net_def, caffe2::FLAGS_backend); - CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); - - // Run main network. 
- caffe2::NetDef net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); - setOperatorEngine(&net_def, caffe2::FLAGS_backend); - - map tensor_protos_map; - - loadInput( - workspace, - run_on_gpu, - tensor_protos_map, + benchmark( + argc, + argv, + caffe2::FLAGS_backend, + caffe2::FLAGS_init_net, caffe2::FLAGS_input, - caffe2::FLAGS_input_file, caffe2::FLAGS_input_dims, - caffe2::FLAGS_input_type); - - runNetwork( - workspace, - net_def, - tensor_protos_map, - caffe2::FLAGS_wipe_cache, - caffe2::FLAGS_run_individual, - caffe2::FLAGS_warmup, + caffe2::FLAGS_input_file, + caffe2::FLAGS_input_type, caffe2::FLAGS_iter, - caffe2::FLAGS_sleep_before_run); - - writeOutput( - workspace, - run_on_gpu, + caffe2::FLAGS_net, caffe2::FLAGS_output, caffe2::FLAGS_output_folder, - caffe2::FLAGS_text_output); - - return 0; + caffe2::FLAGS_run_individual, + caffe2::FLAGS_sleep_before_run, + caffe2::FLAGS_text_output, + caffe2::FLAGS_warmup, + caffe2::FLAGS_wipe_cache); } From 535633bddc9e52bb70b5be0dce80002e55d07cbd Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Thu, 30 Aug 2018 10:31:09 -0700 Subject: [PATCH 42/42] Export MPI functions (#11037) Summary: Potential fix for https://github.com/caffe2/caffe2/issues/2551#issuecomment-417124872 cc Yangqing mingzhe09088 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11037 Reviewed By: mingzhe09088 Differential Revision: D9580937 Pulled By: orionr fbshipit-source-id: 5e1fbf718728271a5b5af526d8e67cc5b48f0575 --- caffe2/mpi/mpi_common.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/caffe2/mpi/mpi_common.h b/caffe2/mpi/mpi_common.h index 3e1e7a5625bd2e..b283a0aea382c3 100644 --- a/caffe2/mpi/mpi_common.h +++ b/caffe2/mpi/mpi_common.h @@ -4,6 +4,7 @@ #include #include +#include "caffe2/core/common.h" #include "caffe2/core/logging.h" namespace caffe2 { @@ -29,7 +30,7 @@ MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE) #undef MPI_DATATYPE_WRAPPER // For all Caffe MPI calls, we will wrap it inside an MPI mutex lock guard. -std::mutex& MPIMutex(); +CAFFE2_API std::mutex& MPIMutex(); #define MPI_CHECK(condition) \ do { \ @@ -49,23 +50,23 @@ std::mutex& MPIMutex(); * @brief Gets the global MPI communicator used by Caffe2. In default, this * is MPI_COMM_WORLD unless you call SetGlobalMPIComm(). */ -MPI_Comm GlobalMPIComm(); +CAFFE2_API MPI_Comm GlobalMPIComm(); /** * @brief Sets the global MPI communicator. Caffe2 takes over the ownership * of the passed in communicator. */ -void SetGlobalMPIComm(MPI_Comm new_comm); +CAFFE2_API void SetGlobalMPIComm(MPI_Comm new_comm); /** * @brief A helper function to return the size of the given communicator. */ -int MPICommSize(MPI_Comm comm); +CAFFE2_API int MPICommSize(MPI_Comm comm); /** * @brief A helper function to return the rank of the given communicator. */ -int MPICommRank(MPI_Comm comm); +CAFFE2_API int MPICommRank(MPI_Comm comm); /** * @brief A simple wrapper over an MPI common world.